From a02cc89a1d4ec9db67eb95762d579f5b891438ce Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 23 Aug 2023 04:21:11 +0200 Subject: [PATCH] Enhanced vectorization for all --- lollms/configs/config.yaml | 12 +++++-- lollms/personality.py | 31 ++++++++++++++---- lollms/utilities.py | 66 +++++++++++--------------------------- setup.py | 2 +- 4 files changed, 54 insertions(+), 57 deletions(-) diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml index 95ce2b9..fc63550 100644 --- a/lollms/configs/config.yaml +++ b/lollms/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Models Configuration file =========================== -version: 11 +version: 19 binding_name: null model_name: null @@ -36,4 +36,12 @@ user_avatar: default_user # Automatic update auto_update: false -debug: false \ No newline at end of file +debug: false + +# Data vectorization +data_vectorization_method: "ftidf_vectorizer" #"model_embedding" or "ftidf_vectorizer" +data_visualization_method: "PCA" #"PCA" or "TSNE" +data_vectorization_save_db: False # For each new session, new files +data_vectorization_chunk_size: 512 # chunk size +data_vectorization_overlap_size: 128 # overlap between chunks size +data_vectorization_nb_chunks: 2 # number of chunks to use \ No newline at end of file diff --git a/lollms/personality.py b/lollms/personality.py index 0a95c6d..b8c2ccd 100644 --- a/lollms/personality.py +++ b/lollms/personality.py @@ -19,6 +19,7 @@ from lollms.helpers import ASCIIColors from lollms.types import MSG_TYPE from typing import Callable import json +from lollms.utilities import TextVectorizer, GenericDataLoader def is_package_installed(package_name): @@ -72,6 +73,7 @@ class AIPersonality: self.callback = callback self.files = [] + self.vectorizer = None self.installation_option = installation_option @@ -282,13 +284,30 @@ Date: {{date}} return config def add_file(self, path, callback=None): - if callback is not None: - callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO) + self.files.append(path) - - return True - - + db_path = self.lollms_paths.personal_databases_path / self.name / "db.json" + db_path.parent.mkdir(parents=True, exist_ok=True) + if self.vectorizer is None: + self.vectorizer = TextVectorizer(self.config.data_vectorization_method, # supported "model_embedding" or "ftidf_vectorizer" + model=self.model, #needed in case of using model_embedding + database_path=db_path, + save_db=self.config.data_vectorization_save_db, + visualize_data_at_startup=False, + visualize_data_at_add_file=False, + visualize_data_at_generate=False, + data_visualization_method="PCA", + database_dict=None) + try: + data = GenericDataLoader.read_file(path) + self.vectorizer.add_document(path, data, self.config.data_vectorization_chunk_size, self.config.data_vectorization_overlap_size) + self.vectorizer.index() + if callback is not None: + callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO) + return True + except ValueError as ve: + ASCIIColors.error(f"Unsupported file format. Supported formats are {GenericDataLoader.get_supported_file_types()}") + return False def save_personality(self, package_path=None): """ Save the personality parameters to a YAML configuration file. diff --git a/lollms/utilities.py b/lollms/utilities.py index 6a879f9..1972df5 100644 --- a/lollms/utilities.py +++ b/lollms/utilities.py @@ -1,6 +1,4 @@ -from lollms.personality import APScript from lollms.helpers import ASCIIColors, trace_exception -from lollms.paths import LollmsPaths from sklearn.feature_extraction.text import TfidfVectorizer import numpy as np from pathlib import Path @@ -597,6 +595,24 @@ class TextVectorizer: class GenericDataLoader: @staticmethod + def read_file(file_path:Path): + if file_path.suffix ==".pdf": + return GenericDataLoader.read_pdf_file(file_path) + elif file_path.suffix == ".txt": + return GenericDataLoader.read_text_file(file_path) + elif file_path.suffix == ".docx": + return GenericDataLoader.read_docx_file(file_path) + elif file_path.suffix == ".json": + return GenericDataLoader.read_json_file(file_path) + elif file_path.suffix == ".html": + return GenericDataLoader.read_html_file(file_path) + elif file_path.suffix == ".pptx": + return GenericDataLoader.read_pptx_file(file_path) + else: + raise ValueError("Unknown file type") + def get_supported_file_types(): + return ["pdf", "txt", "docx", "json", "html", "pptx"] + @staticmethod def read_pdf_file(file_path): try: import PyPDF2 @@ -623,52 +639,6 @@ class GenericDataLoader: markdown_text = text.replace('\n', ' \n') # Adding double spaces at the end of each line for Markdown line breaks return markdown_text - """ - - - from io import BytesIO - with open(file_path, 'rb') as pdf_file: - pdf_reader = PyPDF2.PdfReader(pdf_file) - all_text = [] - for page_num in range(len(pdf_reader.pages)): - page = pdf_reader.pages[page_num] - if '/Resources' in page and '/XObject' in page['/Resources']: - xObject = page['/Resources']['/XObject'] - if xObject is not None: - for obj in xObject: - # Check if the object is an image - if xObject[obj]['/Subtype'] == '/Image': - image_data = xObject[obj].get_object() - image_stream = image_data.get_object() - image_stream_data = image_stream.get_data() - - try: - # Extract text from the image using pytesseract - extracted_text = pytesseract.image_to_string(Image.open(BytesIO(image_stream_data))) - all_text.append(extracted_text) - except pytesseract.TesseractNotFoundError: - ASCIIColors.error("Please install tesserract to enable ocr data extraction from your pdf file") - except UnidentifiedImageError: - # Ignore images that cannot be identified - pass - - # Extract regular text from the page using PyPDF2's text extraction - regular_text = page.extract_text() - if regular_text: - all_text.append(regular_text) - - return "\n\n".join(all_text) - """ - - """ - text = "" - with open(file_path, 'rb') as pdf_file: - pdf_reader = PyPDF2.PdfReader(pdf_file) - for page in pdf_reader.pages: - text += page.extract_text() - - return text - """ @staticmethod def read_docx_file(file_path): diff --git a/setup.py b/setup.py index ab37936..ea74179 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ def get_all_files(path): setuptools.setup( name="lollms", - version="4.0.1", + version="4.0.2", author="Saifeddine ALOUI", author_email="aloui.saifeddine@gmail.com", description="A python library for AI personality definition",