From a02cc89a1d4ec9db67eb95762d579f5b891438ce Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI <aloui.seifeddine@gmail.com>
Date: Wed, 23 Aug 2023 04:21:11 +0200
Subject: [PATCH] Enhanced vectorization for all

---
 lollms/configs/config.yaml | 12 +++++--
 lollms/personality.py      | 31 ++++++++++++++----
 lollms/utilities.py        | 66 +++++++++++---------------------------
 setup.py                   |  2 +-
 4 files changed, 54 insertions(+), 57 deletions(-)

diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml
index 95ce2b9..fc63550 100644
--- a/lollms/configs/config.yaml
+++ b/lollms/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Models Configuration file =========================== 
-version: 11
+version: 19
 binding_name: null
 model_name: null
 
@@ -36,4 +36,12 @@ user_avatar: default_user
 
 # Automatic update
 auto_update: false
-debug: false
\ No newline at end of file
+debug: false
+
+# Data vectorization
+data_vectorization_method: "ftidf_vectorizer" #"model_embedding" or "ftidf_vectorizer"
+data_visualization_method: "PCA" #"PCA" or "TSNE"
+data_vectorization_save_db: False # When false, the database is not saved (for each new session, new files)
+data_vectorization_chunk_size: 512 # chunk size
+data_vectorization_overlap_size: 128 # size of the overlap between chunks
+data_vectorization_nb_chunks: 2 # number of chunks to use
\ No newline at end of file
diff --git a/lollms/personality.py b/lollms/personality.py
index 0a95c6d..b8c2ccd 100644
--- a/lollms/personality.py
+++ b/lollms/personality.py
@@ -19,6 +19,7 @@ from lollms.helpers import ASCIIColors
 from lollms.types import MSG_TYPE
 from typing import Callable
 import json
+from lollms.utilities import TextVectorizer, GenericDataLoader
 
 
 def is_package_installed(package_name):
@@ -72,6 +73,7 @@ class AIPersonality:
         self.callback = callback
 
         self.files = []
+        self.vectorizer = None
 
         self.installation_option = installation_option
 
@@ -282,13 +284,30 @@ Date: {{date}}
         return config
 
     def add_file(self, path, callback=None):
-        if callback is not None:
-            callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
+        """Add a file to the personality and index its content with the text vectorizer.
+
+        Calls callback (if given) on success; returns True, or False when the file format is unsupported.
+        """
         self.files.append(path)
-        
-        return True
-
-
+        db_path = self.lollms_paths.personal_databases_path / self.name / "db.json"
+        db_path.parent.mkdir(parents=True, exist_ok=True)  # make sure the database folder exists
+        if self.vectorizer is None:  # the vectorizer is created lazily, on the first added file
+            self.vectorizer = TextVectorizer(self.config.data_vectorization_method, # supported "model_embedding" or "ftidf_vectorizer"
+                        model=self.model, #needed in case of using model_embedding
+                        database_path=db_path, save_db=self.config.data_vectorization_save_db,
+                        visualize_data_at_startup=False, visualize_data_at_add_file=False, visualize_data_at_generate=False,
+                        data_visualization_method=self.config.data_visualization_method,  # honour the configured method instead of a hard-coded "PCA"
+                        database_dict=None)
+        try:
+            data = GenericDataLoader.read_file(path)
+            self.vectorizer.add_document(path, data, self.config.data_vectorization_chunk_size, self.config.data_vectorization_overlap_size)
+            self.vectorizer.index()
+            if callback is not None:
+                callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
+            return True
+        except ValueError:  # GenericDataLoader.read_file raises ValueError for unknown file types
+            ASCIIColors.error(f"Unsupported file format. Supported formats are {GenericDataLoader.get_supported_file_types()}")
+            return False
     def save_personality(self, package_path=None):
         """
         Save the personality parameters to a YAML configuration file.
diff --git a/lollms/utilities.py b/lollms/utilities.py
index 6a879f9..1972df5 100644
--- a/lollms/utilities.py
+++ b/lollms/utilities.py
@@ -1,6 +1,4 @@
-from lollms.personality import APScript
 from lollms.helpers import ASCIIColors, trace_exception
-from lollms.paths import LollmsPaths
 from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
 from pathlib import Path
@@ -597,6 +595,24 @@ class TextVectorizer:
       
 class GenericDataLoader:
     @staticmethod        
+    def read_file(file_path:Path):
+        """Return the content of file_path using the reader matching its extension; raise ValueError if unsupported."""
+        file_path = Path(file_path)  # accept plain string paths as well as Path objects
+        readers = {  # extension -> name of the reader method, looked up only when needed
+            ".pdf": "read_pdf_file", ".txt": "read_text_file",
+            ".docx": "read_docx_file", ".json": "read_json_file",
+            ".html": "read_html_file", ".pptx": "read_pptx_file",
+        }
+        suffix = file_path.suffix.lower()  # match extensions case-insensitively (".PDF" == ".pdf")
+        if suffix not in readers:
+            raise ValueError("Unknown file type")
+        return getattr(GenericDataLoader, readers[suffix])(file_path)
+
+    @staticmethod
+    def get_supported_file_types():
+        """Return the list of file extensions (without the leading dot) that read_file handles."""
+        return ["pdf", "txt", "docx", "json", "html", "pptx"]
+    @staticmethod        
     def read_pdf_file(file_path):
         try:
             import PyPDF2
@@ -623,52 +639,6 @@ class GenericDataLoader:
         markdown_text = text.replace('\n', '  \n')  # Adding double spaces at the end of each line for Markdown line breaks
         
         return markdown_text
-        """
-
-                    
-        from io import BytesIO         
-        with open(file_path, 'rb') as pdf_file:
-            pdf_reader = PyPDF2.PdfReader(pdf_file)
-            all_text = []
-            for page_num in range(len(pdf_reader.pages)):
-                page = pdf_reader.pages[page_num]
-                if '/Resources' in page and '/XObject' in page['/Resources']:
-                    xObject = page['/Resources']['/XObject']
-                    if xObject is not None:
-                        for obj in xObject:
-                            # Check if the object is an image
-                            if xObject[obj]['/Subtype'] == '/Image':
-                                image_data = xObject[obj].get_object()
-                                image_stream = image_data.get_object()
-                                image_stream_data = image_stream.get_data()
-
-                                try:
-                                    # Extract text from the image using pytesseract
-                                    extracted_text = pytesseract.image_to_string(Image.open(BytesIO(image_stream_data)))
-                                    all_text.append(extracted_text)
-                                except pytesseract.TesseractNotFoundError:
-                                    ASCIIColors.error("Please install tesserract to enable ocr data extraction from your pdf file")
-                                except UnidentifiedImageError:
-                                    # Ignore images that cannot be identified
-                                    pass
-
-                # Extract regular text from the page using PyPDF2's text extraction
-                regular_text = page.extract_text()
-                if regular_text:
-                    all_text.append(regular_text)
-
-            return "\n\n".join(all_text)
-        """
-            
-        """
-        text = ""
-        with open(file_path, 'rb') as pdf_file:
-            pdf_reader = PyPDF2.PdfReader(pdf_file)
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-        
-        return text
-        """
 
     @staticmethod
     def read_docx_file(file_path):
diff --git a/setup.py b/setup.py
index ab37936..ea74179 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ def get_all_files(path):
 
 setuptools.setup(
     name="lollms",
-    version="4.0.1",
+    version="4.0.2",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",