From 789e800381daa5799eff0c5904b2bdb48b3fa5f1 Mon Sep 17 00:00:00 2001
From: saloui <saifeddine.aloui@cea.fr>
Date: Fri, 21 Jul 2023 17:01:21 +0200
Subject: [PATCH] upgraded

---
 lollms/personality.py |  3 +-
 lollms/utilities.py   | 99 +++++++++++++++++++++++++++++++++++++++++--
 setup.py              |  2 +-
 3 files changed, 99 insertions(+), 5 deletions(-)
diff --git a/lollms/personality.py b/lollms/personality.py
index 41a1177..bbdd138 100644
--- a/lollms/personality.py
+++ b/lollms/personality.py
@@ -1072,7 +1072,7 @@ class APScript(StateMachine):
     def generate(self, prompt, max_size, temperature = None, top_k = None, top_p=None, repeat_penalty=None ):
         self.bot_says = ""
         ASCIIColors.info("Text generation started: Warming up")
-        return self.personality.model.generate(
+        self.personality.model.generate(
                                 prompt, 
                                 max_size, 
                                 self.process,
@@ -1081,6 +1081,7 @@ class APScript(StateMachine):
                                 top_p=self.personality.model_top_p if top_p is None else top_p,
                                 repeat_penalty=self.personality.model_repeat_penalty if repeat_penalty is None else repeat_penalty,
                                 ).strip()    
+        return self.bot_says
 
     def run_workflow(self, prompt:str, previous_discussion_text:str="", callback=None):
         """
diff --git a/lollms/utilities.py b/lollms/utilities.py
index 9451a47..6b02642 100644
--- a/lollms/utilities.py
+++ b/lollms/utilities.py
@@ -101,7 +101,11 @@ class TextVectorizer:
             if use_pca:
                 # Use PCA for dimensionality reduction
                 pca = PCA(n_components=2)
-                embeddings_2d = pca.fit_transform(combined_embeddings)
+                try:
+                    embeddings_2d = pca.fit_transform(combined_embeddings)
+                except Exception as ex:
+                    
+                    embeddings_2d = []
             else:
                 # Use t-SNE for dimensionality reduction
                 # Adjust the perplexity value
@@ -219,7 +223,7 @@ class TextVectorizer:
             data=[]
             for chunk in chunks:
                 try:
-                    data.append(self.model.detokenize(chunk) ) 
+                    data.append(self.model.detokenize(chunk).replace("<s>","").replace("</s>","") ) 
                 except Exception as ex:
                     print("oups")
             self.vectorizer.fit(data)
@@ -305,4 +309,93 @@ class TextVectorizer:
         self.embeddings = {}
         self.texts={}
         if self.personality_config.save_db:
-            self.save_to_json()
\ No newline at end of file
+            self.save_to_json()
+            
+      
+class GenericDataLoader:
+    """Static readers for common document formats; missing parsers are pip-installed."""
+
+    @staticmethod
+    def install_package(package_name):
+        """Install `package_name` with pip into the running interpreter."""
+        import subprocess
+        import sys
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+
+    @staticmethod
+    def read_pdf_file(file_path):
+        """Return the concatenated text of every page of a PDF file."""
+        try:
+            import PyPDF2
+        except ImportError:
+            GenericDataLoader.install_package("PyPDF2")
+            import PyPDF2
+        with open(file_path, 'rb') as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            # A page without extractable text may yield nothing; treat it as "".
+            return "".join(page.extract_text() or "" for page in pdf_reader.pages)
+
+    @staticmethod
+    def read_docx_file(file_path):
+        """Return the text of a .docx file, one paragraph per line."""
+        try:
+            from docx import Document
+        except ImportError:
+            GenericDataLoader.install_package("python-docx")
+            from docx import Document
+        doc = Document(file_path)
+        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
+
+    @staticmethod
+    def read_json_file(file_path):
+        """Return the parsed content of a JSON file."""
+        import json
+        # Binary mode lets json detect UTF-8/16/32 instead of the locale codec.
+        with open(file_path, 'rb') as file:
+            return json.load(file)
+
+    @staticmethod
+    def read_csv_file(file_path):
+        """Return the rows of a CSV file as a list of lists of strings."""
+        # csv is part of the standard library, so no pip fallback applies.
+        import csv
+        # newline='' is what the csv module requires for embedded line breaks.
+        with open(file_path, 'r', newline='') as file:
+            return list(csv.reader(file))
+
+    @staticmethod
+    def read_html_file(file_path):
+        """Return the text of an HTML file with the markup removed."""
+        try:
+            from bs4 import BeautifulSoup
+        except ImportError:
+            GenericDataLoader.install_package("beautifulsoup4")
+            from bs4 import BeautifulSoup
+        # Bytes input lets BeautifulSoup detect the document's own encoding.
+        with open(file_path, 'rb') as file:
+            return BeautifulSoup(file, 'html.parser').get_text()
+
+    @staticmethod
+    def read_pptx_file(file_path):
+        """Return the text of every run on every slide, concatenated."""
+        try:
+            from pptx import Presentation
+        except ImportError:
+            GenericDataLoader.install_package("python-pptx")
+            from pptx import Presentation
+        prs = Presentation(file_path)
+        # Runs are joined with no separator, in slide/shape/paragraph order.
+        return "".join(
+            run.text
+            for slide in prs.slides
+            for shape in slide.shapes
+            if shape.has_text_frame
+            for paragraph in shape.text_frame.paragraphs
+            for run in paragraph.runs
+        )
+
+    @staticmethod
+    def read_text_file(file_path):
+        """Return the full content of a UTF-8 encoded text file."""
+        with open(file_path, 'r', encoding='utf-8') as file:
+            return file.read()
diff --git a/setup.py b/setup.py
index 50f9fad..3908e92 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ def get_all_files(path):
 
 setuptools.setup(
     name="lollms",
-    version="2.1.50",
+    version="2.1.53",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",