Enhanced vectorization for all

2025-04-08 11:24:14 +00:00 · 2023-08-23 04:21:11 +02:00 · 2023-08-23 04:21:11 +02:00 · a02cc89a1d
commit a02cc89a1d
parent 42fa3489b3
4 changed files with 54 additions and 57 deletions
--- a/lollms/configs/config.yaml
+++ b/lollms/configs/config.yaml
@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Models Configuration file =========================== 
-version: 11
+version: 19
 binding_name: null
 model_name: null

@ -36,4 +36,12 @@ user_avatar: default_user

 # Automatic update
 auto_update: false
-debug: false
+debug: false
+
+# Data vectorization
+data_vectorization_method: "ftidf_vectorizer" #"model_embedding" or "ftidf_vectorizer"
+data_visualization_method: "PCA" #"PCA" or "TSNE"
+data_vectorization_save_db: False # Whether to save the vectorized database (False: each new session starts with new files)
+data_vectorization_chunk_size: 512 # chunk size
+data_vectorization_overlap_size: 128 # size of the overlap between chunks
+data_vectorization_nb_chunks: 2 # number of chunks to use
--- a/lollms/personality.py
+++ b/lollms/personality.py
@ -19,6 +19,7 @@ from lollms.helpers import ASCIIColors
 from lollms.types import MSG_TYPE
 from typing import Callable
 import json
+from lollms.utilities import TextVectorizer, GenericDataLoader


 def is_package_installed(package_name):
@ -72,6 +73,7 @@ class AIPersonality:
        self.callback = callback

        self.files = []
+        self.vectorizer = None

        self.installation_option = installation_option

@ -282,13 +284,30 @@ Date: {{date}}
        return config

    def add_file(self, path, callback=None):
-        if callback is not None:
-            callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
+
        self.files.append(path)
-        
-        return True
-
-
+        db_path = self.lollms_paths.personal_databases_path / self.name / "db.json"
+        db_path.parent.mkdir(parents=True, exist_ok=True)
+        if self.vectorizer is None:
+            self.vectorizer = TextVectorizer(self.config.data_vectorization_method, # supported "model_embedding" or "ftidf_vectorizer"
+                        model=self.model, #needed in case of using model_embedding
+                        database_path=db_path,
+                        save_db=self.config.data_vectorization_save_db,
+                        visualize_data_at_startup=False,
+                        visualize_data_at_add_file=False,
+                        visualize_data_at_generate=False,
+                        data_visualization_method="PCA",
+                        database_dict=None)
+        try:
+            data = GenericDataLoader.read_file(path)
+            self.vectorizer.add_document(path, data, self.config.data_vectorization_chunk_size, self.config.data_vectorization_overlap_size)
+            self.vectorizer.index()
+            if callback is not None:
+                callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
+            return True
+        except ValueError as ve:
+            ASCIIColors.error(f"Unsupported file format. Supported formats are {GenericDataLoader.get_supported_file_types()}")
+            return False
    def save_personality(self, package_path=None):
        """
        Save the personality parameters to a YAML configuration file.
--- a/lollms/utilities.py
+++ b/lollms/utilities.py
@ -1,6 +1,4 @@
-from lollms.personality import APScript
 from lollms.helpers import ASCIIColors, trace_exception
-from lollms.paths import LollmsPaths
 from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
 from pathlib import Path
@ -597,6 +595,24 @@ class TextVectorizer:
      
 class GenericDataLoader:
    @staticmethod        
+    def read_file(file_path:Path):
+        if file_path.suffix ==".pdf":
+            return GenericDataLoader.read_pdf_file(file_path)
+        elif file_path.suffix == ".txt":
+            return GenericDataLoader.read_text_file(file_path)
+        elif file_path.suffix == ".docx":
+            return GenericDataLoader.read_docx_file(file_path)
+        elif file_path.suffix == ".json":
+            return GenericDataLoader.read_json_file(file_path)
+        elif file_path.suffix == ".html":
+            return GenericDataLoader.read_html_file(file_path)
+        elif file_path.suffix == ".pptx":
+            return GenericDataLoader.read_pptx_file(file_path)
+        else:
+            raise ValueError("Unknown file type")
+    def get_supported_file_types():
+        return ["pdf", "txt", "docx", "json", "html", "pptx"]    
+    @staticmethod        
    def read_pdf_file(file_path):
        try:
            import PyPDF2
@ -623,52 +639,6 @@ class GenericDataLoader:
        markdown_text = text.replace('\n', '  \n')  # Adding double spaces at the end of each line for Markdown line breaks
        
        return markdown_text
-        """
-
-                    
-        from io import BytesIO         
-        with open(file_path, 'rb') as pdf_file:
-            pdf_reader = PyPDF2.PdfReader(pdf_file)
-            all_text = []
-            for page_num in range(len(pdf_reader.pages)):
-                page = pdf_reader.pages[page_num]
-                if '/Resources' in page and '/XObject' in page['/Resources']:
-                    xObject = page['/Resources']['/XObject']
-                    if xObject is not None:
-                        for obj in xObject:
-                            # Check if the object is an image
-                            if xObject[obj]['/Subtype'] == '/Image':
-                                image_data = xObject[obj].get_object()
-                                image_stream = image_data.get_object()
-                                image_stream_data = image_stream.get_data()
-
-                                try:
-                                    # Extract text from the image using pytesseract
-                                    extracted_text = pytesseract.image_to_string(Image.open(BytesIO(image_stream_data)))
-                                    all_text.append(extracted_text)
-                                except pytesseract.TesseractNotFoundError:
-                                    ASCIIColors.error("Please install tesserract to enable ocr data extraction from your pdf file")
-                                except UnidentifiedImageError:
-                                    # Ignore images that cannot be identified
-                                    pass
-
-                # Extract regular text from the page using PyPDF2's text extraction
-                regular_text = page.extract_text()
-                if regular_text:
-                    all_text.append(regular_text)
-
-            return "\n\n".join(all_text)
-        """
-            
-        """
-        text = ""
-        with open(file_path, 'rb') as pdf_file:
-            pdf_reader = PyPDF2.PdfReader(pdf_file)
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-        
-        return text
-        """

    @staticmethod
    def read_docx_file(file_path):
--- a/setup.py
+++ b/setup.py
@ -26,7 +26,7 @@ def get_all_files(path):

 setuptools.setup(
    name="lollms",
-    version="4.0.1",
+    version="4.0.2",
    author="Saifeddine ALOUI",
    author_email="aloui.saifeddine@gmail.com",
    description="A python library for AI personality definition",