saloui 2023-07-27 16:27:08 +02:00
parent d21ba70dff
commit 58a0c986c3
10 changed files with 281 additions and 107 deletions

View File

@@ -14,7 +14,7 @@ class MyConversation(Conversation):
         self.menu.main_menu()
         full_discussion += self.personality.user_message_prefix+prompt+self.personality.link_text
         full_discussion += self.personality.ai_message_prefix
-        def callback(text, type=None):
+        def callback(text, type=None, metadata:dict={}):
            print(text, end="")
            sys.stdout = sys.__stdout__
            sys.stdout.flush()

View File

@@ -7,7 +7,7 @@ class MyConversation(Conversation):
     def start_conversation(self):
         prompt = "Once apon a time"
-        def callback(text, type=None):
+        def callback(text, type=None, metadata:dict={}):
            print(text, end="")
            sys.stdout = sys.__stdout__
            sys.stdout.flush()

View File

@@ -7,6 +7,7 @@ from lollms.paths import LollmsPaths
 from lollms.app import LollmsApplication
 from lollms.terminal import MainMenu
+from typing import Callable
 from pathlib import Path
 import argparse
 import yaml
@@ -133,7 +134,7 @@ Participating personalities:
         full_discussion = ""
         return full_discussion
-    def safe_generate(self, full_discussion:str, n_predict=None, callback=None):
+    def safe_generate(self, full_discussion:str, n_predict=None, callback: Callable[[str, int, dict], bool]=None):
         """safe_generate
         Args:
@@ -236,7 +237,7 @@ Participating personalities:
             self.personality.ai_message_prefix
         )
-        def callback(text, type:MSG_TYPE=None):
+        def callback(text, type:MSG_TYPE=None, metadata:dict={}):
            if type == MSG_TYPE.MSG_TYPE_CHUNK:
                # Replace stdout with the default stdout
                sys.stdout = sys.__stdout__
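Across these files the callback contract changes from `Callable[[str], None]` to `Callable[[str, int, dict], bool]`: every callback now receives a `metadata` dict alongside the text and message type, and returns a bool. A minimal sketch of a compatible callback (the `MSG_TYPE` values come from `lollms.types`; the handler body is illustrative):

```python
from lollms.types import MSG_TYPE

def my_callback(text: str, msg_type: MSG_TYPE = None, metadata: dict = {}) -> bool:
    # Stream plain chunks to the console
    if msg_type == MSG_TYPE.MSG_TYPE_CHUNK:
        print(text, end="", flush=True)
    # Step-end notifications now carry a status flag in metadata
    elif msg_type == MSG_TYPE.MSG_TYPE_STEP_END:
        print(f"\n[step done, status={metadata.get('status', True)}]")
    return True  # returning False presumably asks the binding to stop generating
```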

View File

@@ -12,6 +12,7 @@ from lollms.apps.console import MainMenu
 from lollms.paths import LollmsPaths
 from lollms.apps.console import MainMenu
 from lollms.app import LollmsApplication
+from lollms.utilities import TextVectorizer
 from typing import List, Tuple
 import importlib
 from pathlib import Path
@@ -20,6 +21,7 @@ import logging
 import yaml
 import copy
 import gc
+import json
 def reset_all_installs(lollms_paths:LollmsPaths):
     ASCIIColors.info("Removeing all configuration files to force reinstall")
     ASCIIColors.info(f"Searching files from {lollms_paths.personal_configuration_path}")
@@ -322,6 +324,96 @@ class LoLLMsServer(LollmsApplication):
                 emit('personality_add_failed', {'success':False, 'error': error_message}, room=request.sid)
+        @self.socketio.on('vectorize_text')
+        def vectorize_text(parameters:dict):
+            """Vectorizes text.
+            Args:
+                parameters (dict): contains
+                    'chunk_size': the maximum size of a text chunk (512 by default)
+                    'vectorization_method': either "model_embedding" or "ftidf_vectorizer" (default is "ftidf_vectorizer")
+                    'payloads': a list of dicts. Each entry has the following format:
+                        {
+                            "path": the path to the document,
+                            "text": the text of the document
+                        }
+                    'return_database': if True, the vectorized database is sent back to the client (default is True)
+                    'database_path': the path where the database is stored (default is None)
+            Returns a dict:
+                status: True on success, False otherwise
+                If you asked for the database to be sent back, you will also have these fields:
+                    "embeddings": a dictionary containing the text chunks with their ids and embeddings
+                    "texts": a dictionary of text chunks for each embedding (use the index for correspondence)
+                    "infos": extra information
+                    "vectorizer": the vectorizer state if tfidf is used, or None if the model embedding is used
+            """
+            vectorization_method = parameters.get('vectorization_method',"ftidf_vectorizer")
+            chunk_size = parameters.get("chunk_size",512)
+            payloads = parameters["payloads"]
+            database_path = parameters.get("database_path",None)
+            return_database = parameters.get("return_database",True)
+            if database_path is None and not return_database:
+                ASCIIColors.warning("Vectorization should either ask to save the database or to recover it. You didn't ask for either!")
+                emit('vectorized_db',{"status":False, "error":"Vectorization should either ask to save the database or to recover it. You didn't ask for either!"})
+                return
+            tv = TextVectorizer(vectorization_method, self.model)
+            for payload in payloads:
+                tv.add_document(payload["path"],payload["text"],chunk_size=chunk_size)
+            json_db = tv.toJson()
+            if return_database:
+                emit('vectorized_db',{**{"status":True}, **json_db})
+            else:
+                emit('vectorized_db',{"status":True})
+                with open(database_path, "w") as file:
+                    json.dump(json_db, file, indent=4)
+        @self.socketio.on('query_database')
+        def query_database(parameters:dict):
+            """Queries a database.
+            Args:
+                parameters (dict): contains
+                    'vectorization_method': either "model_embedding" or "ftidf_vectorizer"
+                    'database': a dict with the following format:
+                        {
+                            "embeddings": a dictionary containing the text chunks with their ids and embeddings
+                            "texts": a dictionary of text chunks for each embedding (use the index for correspondence)
+                            "infos": extra information
+                            "vectorizer": the vectorizer state if tfidf is used, or None if the model embedding is used
+                        }
+                    'database_path': if supplied, the database is loaded from this path
+                    'query': a query to search for in the database
+            """
+            vectorization_method = parameters['vectorization_method']
+            database = parameters.get("database",None)
+            query = parameters.get("query",None)
+            if query is None:
+                ASCIIColors.error("No query given!")
+                emit('vector_db_query',{"status":False, "error":"Please supply a query"})
+                return
+            if database is None:
+                database_path = parameters.get("database_path",None)
+                if database_path is None:
+                    ASCIIColors.error("No database given!")
+                    emit('vector_db_query',{"status":False, "error":"You did not supply a database file nor a database content"})
+                    return
+                else:
+                    with open(database_path, "r") as file:
+                        database = json.load(file)
+            tv = TextVectorizer(vectorization_method, self.model, database_dict=database)
+            docs, sorted_similarities = tv.recover_text(tv.embed_query(query))
+            emit('vectorized_db',{
+                "chunks":docs,
+                "refs":sorted_similarities
+            })
         @self.socketio.on('list_active_personalities')
         def handle_list_active_personalities():
             personality_names = [p.name for p in self.personalities]
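A hedged sketch of a client driving these two endpoints with the python-socketio client; the server address, file names, and printing logic are assumptions, while the event names and payload keys mirror the handlers above. Note that query results come back on the same 'vectorized_db' event, carrying 'chunks' and 'refs':

```python
import socketio

sio = socketio.Client()
sio.connect("http://localhost:9600")  # assumed server address

@sio.on('vectorized_db')
def on_vectorized_db(data):
    if "chunks" in data:                      # answer to a 'query_database' emit
        for chunk, ref in zip(data["chunks"], data["refs"]):
            print(ref, "->", chunk[:80])
    else:                                     # answer to a 'vectorize_text' emit
        print("vectorization status:", data.get("status"), data.get("error", ""))

# Vectorize two documents and store the database server-side
sio.emit('vectorize_text', {
    "chunk_size": 512,
    "vectorization_method": "ftidf_vectorizer",
    "payloads": [
        {"path": "doc1.txt", "text": "First document text."},
        {"path": "doc2.txt", "text": "Second document text."},
    ],
    "return_database": False,
    "database_path": "my_db.json",
})

# Later, query the stored database
sio.emit('query_database', {
    "vectorization_method": "ftidf_vectorizer",
    "database_path": "my_db.json",
    "query": "What is the first document about?",
})
```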
@@ -394,7 +486,7 @@ class LoLLMsServer(LollmsApplication):
             if personality_id==-1:
                 # Raw text generation
                 self.answer = {"full_text":""}
-                def callback(text, message_type: MSG_TYPE):
+                def callback(text, message_type: MSG_TYPE, metadata:dict={}):
                    if message_type == MSG_TYPE.MSG_TYPE_CHUNK:
                        ASCIIColors.success(f"generated:{len(self.answer['full_text'].split())} words", end='\r')
                        self.answer["full_text"] = self.answer["full_text"] + text
@@ -467,7 +559,7 @@ class LoLLMsServer(LollmsApplication):
                 full_discussion = personality.personality_conditioning + ''.join(full_discussion_blocks)
-                def callback(text, message_type: MSG_TYPE):
+                def callback(text, message_type: MSG_TYPE, metadata:dict={}):
                    if message_type == MSG_TYPE.MSG_TYPE_CHUNK:
                        self.answer["full_text"] = self.answer["full_text"] + text
                        self.socketio.emit('text_chunk', {'chunk': text}, room=client_id)

View File

@@ -120,7 +120,7 @@ Participating personalities:
         full_discussion = ""
         return full_discussion
-    def safe_generate(self, full_discussion:str, n_predict=None, callback=None):
+    def safe_generate(self, full_discussion:str, n_predict=None, callback: Callable[[str, int, dict], bool]=None):
         """safe_generate
         Args:

View File

@@ -195,9 +195,9 @@ class LLMBinding:
     def generate(self,
                  prompt:str,
                  n_predict: int = 128,
-                 callback: Callable[[str], None] = None,
+                 callback: Callable[[str, int, dict], bool] = None,
                  verbose: bool = False,
                  **gpt_params ):
         """Generates text out of a prompt
@@ -206,7 +206,7 @@ class LLMBinding:
         Args:
             prompt (str): The prompt to use for generation
             n_predict (int, optional): Number of tokens to prodict. Defaults to 128.
-            callback (Callable[[str], None], optional): A callback function that is called everytime a new text element is generated. Defaults to None.
+            callback (Callable[[str, int, dict], None], optional): A callback function that is called everytime a new text element is generated. Defaults to None.
            verbose (bool, optional): If true, the code will spit many informations about the generation process. Defaults to False.
        """
        pass
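For binding authors, the updated hint means `generate` should hand each chunk to the callback together with a message type and a metadata dict, and treat a False return as a stop request (that is what the new `Callable[[str, int, dict], bool]` hint implies). A minimal sketch; `self.stream_tokens` is a hypothetical token source, not part of the LLMBinding API:

```python
from lollms.types import MSG_TYPE

def generate(self, prompt: str, n_predict: int = 128, callback=None, verbose: bool = False, **gpt_params):
    output = ""
    for chunk in self.stream_tokens(prompt, max_tokens=n_predict):  # hypothetical token source
        output += chunk
        if callback is not None:
            # new contract: (text, message type, metadata) -> keep going?
            if not callback(chunk, MSG_TYPE.MSG_TYPE_CHUNK, {}):
                break
    return output
```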

View File

@@ -17,6 +17,7 @@ import subprocess
 import yaml
 from lollms.helpers import ASCIIColors
 from lollms.types import MSG_TYPE
+from typing import Callable
 import json
@@ -54,7 +55,8 @@ class AIPersonality:
                  model:LLMBinding=None,
                  run_scripts=True,
                  is_relative_path=True,
-                 installation_option:InstallOption=InstallOption.INSTALL_IF_NECESSARY
+                 installation_option:InstallOption=InstallOption.INSTALL_IF_NECESSARY,
+                 callback: Callable[[str, int, dict], bool]=None
                  ):
         """
         Initialize an AIPersonality instance.
@@ -68,6 +70,7 @@ class AIPersonality:
         self.lollms_paths = lollms_paths
         self.model = model
         self.config = config
+        self.callback = callback
         self.files = []
@@ -248,7 +251,7 @@ Date: {{date}}
             module = importlib.util.module_from_spec(module_spec)
             module_spec.loader.exec_module(module)
             if hasattr(module, "Processor"):
-                self._processor = module.Processor(self)
+                self._processor = module.Processor(self, callback=self.callback)
             else:
                 self._processor = None
         else:
@@ -881,7 +884,7 @@ class StateMachine:
-    def process_state(self, command, full_context, callback=None):
+    def process_state(self, command, full_context, callback: Callable[[str, int, dict], bool]=None):
         """
         Process the given command based on the current state.
@@ -922,7 +925,8 @@ class APScript(StateMachine):
                  self,
                  personality :AIPersonality,
                  personality_config :TypedConfig,
-                 states_dict :dict = {}
+                 states_dict :dict = {},
+                 callback = None
                  ) -> None:
         super().__init__(states_dict)
         self.files=[]
@@ -932,6 +936,7 @@ class APScript(StateMachine):
         self.configuration_file_path = self.personality.lollms_paths.personal_configuration_path/f"personality_{self.personality.personality_folder_name}.yaml"
         self.personality_config.config.file_path = self.configuration_file_path
+        self.callback = callback
         # Installation
         if (not self.configuration_file_path.exists() or self.installation_option==InstallOption.FORCE_INSTALL) and self.installation_option!=InstallOption.NEVER_INSTALL:
             self.install()
@@ -990,8 +995,7 @@ class APScript(StateMachine):
         else:
             ASCIIColors.error("Pytorch installed successfully!!")
-    def add_file(self, path, callback=None):
-        self.callback=callback
+    def add_file(self, path):
         self.files.append(path)
         return True
@@ -1104,7 +1108,7 @@ class APScript(StateMachine):
         else:
             return False
-    def run_workflow(self, prompt:str, previous_discussion_text:str="", callback=None):
+    def run_workflow(self, prompt:str, previous_discussion_text:str="", callback: Callable[[str, int, dict], bool]=None):
         """
         Runs the workflow for processing the model input and output.
@@ -1121,7 +1125,7 @@ class APScript(StateMachine):
         """
         return None
-    def step_start(self, step_text, callback=None):
+    def step_start(self, step_text, callback: Callable[[str, int, dict], bool]=None):
         """This triggers a step start
         Args:
@@ -1131,7 +1135,7 @@ class APScript(StateMachine):
         if callback:
             callback(step_text, MSG_TYPE.MSG_TYPE_STEP_START)
-    def step_end(self, step_text, callback=None):
+    def step_end(self, step_text, status=True, callback: Callable[[str, int, dict], bool]=None):
         """This triggers a step end
         Args:
@@ -1139,9 +1143,9 @@ class APScript(StateMachine):
             callback (callable, optional): A callable with this signature (str, MSG_TYPE) to send the step end to. Defaults to None.
         """
         if callback:
-            callback(step_text, MSG_TYPE.MSG_TYPE_STEP_END)
+            callback(step_text, MSG_TYPE.MSG_TYPE_STEP_END, {'status':status})
-    def step(self, step_text, callback=None):
+    def step(self, step_text, callback: Callable[[str, int, dict], bool]=None):
         """This triggers a step information
         Args:
@@ -1151,7 +1155,7 @@ class APScript(StateMachine):
         if callback:
             callback(step_text, MSG_TYPE.MSG_TYPE_STEP)
-    def exception(self, ex, callback=None):
+    def exception(self, ex, callback: Callable[[str, int, dict], bool]=None):
         """This sends exception to the client
         Args:
@@ -1161,7 +1165,7 @@ class APScript(StateMachine):
         if callback:
             callback(str(ex), MSG_TYPE.MSG_TYPE_EXCEPTION)
-    def warning(self, warning:str, callback=None):
+    def warning(self, warning:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends exception to the client
         Args:
@@ -1171,7 +1175,7 @@ class APScript(StateMachine):
         if callback:
             callback(warning, MSG_TYPE.MSG_TYPE_EXCEPTION)
-    def info(self, info:str, callback=None):
+    def info(self, info:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends exception to the client
         Args:
@@ -1181,7 +1185,7 @@ class APScript(StateMachine):
         if callback:
             callback(info, MSG_TYPE.MSG_TYPE_INFO)
-    def json(self, json_infos:dict, callback=None):
+    def json(self, json_infos:dict, callback: Callable[[str, int, dict], bool]=None):
         """This sends json data to front end
         Args:
@@ -1191,7 +1195,7 @@ class APScript(StateMachine):
         if callback:
             callback(json.dumps(json_infos), MSG_TYPE.MSG_TYPE_JSON_INFOS)
-    def ui(self, html_ui:str, callback=None):
+    def ui(self, html_ui:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends ui elements to front end
         Args:
@@ -1201,7 +1205,7 @@ class APScript(StateMachine):
         if callback:
             callback(html_ui, MSG_TYPE.MSG_TYPE_UI)
-    def code(self, code:str, callback=None):
+    def code(self, code:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends code to front end
         Args:
@@ -1211,7 +1215,7 @@ class APScript(StateMachine):
         if callback:
             callback(code, MSG_TYPE.MSG_TYPE_CODE)
-    def full(self, full_text:str, callback=None):
+    def full(self, full_text:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends full text to front end
         Args:
@@ -1224,7 +1228,7 @@ class APScript(StateMachine):
         if callback:
             callback(full_text, MSG_TYPE.MSG_TYPE_FULL)
-    def full_invisible_to_ai(self, full_text:str, callback=None):
+    def full_invisible_to_ai(self, full_text:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends full text to front end (INVISIBLE to AI)
         Args:
@@ -1237,7 +1241,7 @@ class APScript(StateMachine):
         if callback:
             callback(full_text, MSG_TYPE.MSG_TYPE_FULL_INVISIBLE_TO_AI)
-    def full_invisible_to_user(self, full_text:str, callback=None):
+    def full_invisible_to_user(self, full_text:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends full text to front end (INVISIBLE to user)
         Args:
@@ -1251,7 +1255,7 @@ class APScript(StateMachine):
             callback(full_text, MSG_TYPE.MSG_TYPE_FULL_INVISIBLE_TO_USER)
-    def info(self, info_text:str, callback=None):
+    def info(self, info_text:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends info text to front end
         Args:
@@ -1264,7 +1268,7 @@ class APScript(StateMachine):
         if callback:
             callback(info_text, MSG_TYPE.MSG_TYPE_FULL)
-    def step_progress(self, progress:float, callback=None):
+    def step_progress(self, step_text:str, progress:float, callback: Callable[[str, int, dict], bool]=None):
         """This sends step rogress to front end
         Args:
@@ -1275,8 +1279,16 @@ class APScript(StateMachine):
             callback = self.callback
         if callback:
-            callback(str(progress), MSG_TYPE.MSG_TYPE_STEP_PROGRESS)
+            callback(step_text, MSG_TYPE.MSG_TYPE_STEP_PROGRESS, {'progress':progress})
+    # Helper method to convert outputs path to url
+    def path2url(file):
+        file = str(file).replace("\\","/")
+        pth = file.split('/')
+        idx = pth.index("outputs")
+        pth = "/".join(pth[idx:])
+        file_path = f"![](/{pth})\n"
+        return file_path
 # ===========================================================
 class AIPersonalityInstaller:
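Taken together, the APScript changes let a personality script report structured progress. A hedged sketch of a `run_workflow` using the new helper signatures (the step name and loop are illustrative):

```python
def run_workflow(self, prompt: str, previous_discussion_text: str = "", callback=None):
    self.step_start("Analyzing the prompt", callback)
    for i in range(5):
        # progress now travels in the metadata dict as {'progress': value}
        self.step_progress("Analyzing the prompt", (i + 1) * 20.0, callback)
    # step_end gained a status flag, forwarded as {'status': status}
    self.step_end("Analyzing the prompt", status=True, callback=callback)
    answer = "done"
    self.full(answer, callback)
    return answer
```

Note that `path2url` as written takes the path as its only argument (no `self`) and returns a markdown image reference rooted at the `outputs` folder, e.g. `![](/outputs/imgs/render.png)`.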

View File

@@ -1,51 +1,85 @@
 from lollms.personality import APScript
 from lollms.helpers import ASCIIColors, trace_exception
+from lollms.paths import LollmsPaths
+from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
+import json
 from pathlib import Path
-import numpy as np
 import json
+class TFIDFLoader:
+    @staticmethod
+    def create_vectorizer_from_dict(tfidf_info):
+        vectorizer = TfidfVectorizer(**tfidf_info['params'])
+        vectorizer.vocabulary_ = tfidf_info['vocabulary']
+        vectorizer.idf_ = [tfidf_info['idf_values'][feature] for feature in vectorizer.get_feature_names()]
+        return vectorizer
+    @staticmethod
+    def create_dict_from_vectorizer(vectorizer):
+        tfidf_info = {
+            "vocabulary": vectorizer.vocabulary_,
+            "idf_values": dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)),
+            "params": vectorizer.get_params()
+        }
+        return tfidf_info
 class TextVectorizer:
-    def __init__(self, processor):
-        self.processor:APScript = processor
-        self.personality = self.processor.personality
-        self.model = self.personality.model
-        self.personality_config = self.processor.personality_config
-        self.lollms_paths = self.personality.lollms_paths
-        self.embeddings = {}
-        self.texts = {}
-        self.ready = False
-        self.vectorizer = None
-        self.database_file = Path(self.lollms_paths.personal_data_path/self.personality_config["database_path"])
-        self.visualize_data_at_startup=self.personality_config["visualize_data_at_startup"]
-        self.visualize_data_at_add_file=self.personality_config["visualize_data_at_add_file"]
-        self.visualize_data_at_generate=self.personality_config["visualize_data_at_generate"]
-        if self.personality_config.vectorization_method=="model_embedding":
-            try:
-                if self.model.embed("hi")==None:
-                    self.personality_config.vectorization_method="ftidf_vectorizer"
-                    self.infos={
-                        "vectorization_method":"ftidf_vectorizer"
-                    }
-                else:
-                    self.infos={
-                        "vectorization_method":"model_embedding"
-                    }
-            except Exception as ex:
-                ASCIIColors.error("Couldn't embed the text, so trying to use tfidf instead.")
-                trace_exception(ex)
-                self.infos={
-                    "vectorization_method":"ftidf_vectorizer"
-                }
+    def __init__(
+        self,
+        vectorization_method,  # supported: "model_embedding" or "ftidf_vectorizer"
+        model=None,            # needed when using model_embedding
+        database_path=None,
+        save_db=False,
+        visualize_data_at_startup=False,
+        visualize_data_at_add_file=False,
+        visualize_data_at_generate=False,
+        data_visualization_method="PCA",
+        database_dict=None
+    ):
+        self.vectorization_method = vectorization_method
+        self.save_db = save_db
+        self.model = model
+        self.database_file = database_path
+        self.visualize_data_at_startup=visualize_data_at_startup
+        self.visualize_data_at_add_file=visualize_data_at_add_file
+        self.visualize_data_at_generate=visualize_data_at_generate
+        self.data_visualization_method = data_visualization_method
+        if database_dict is not None:
+            self.chunks = []
+            self.embeddings = database_dict["embeddings"]
+            self.texts = database_dict["texts"]
+            self.infos = database_dict["infos"]
+            self.ready = True
+            self.vectorizer = database_dict["vectorizer"]
+        else:
+            self.chunks = []
+            self.embeddings = {}
+            self.texts = {}
+            self.ready = False
+            self.vectorizer = None
+            if vectorization_method=="model_embedding":
+                try:
+                    if not self.model or self.model.embed("hi")==None: # test
+                        self.vectorization_method="ftidf_vectorizer"
+                        self.infos={
+                            "vectorization_method":"ftidf_vectorizer"
+                        }
+                    else:
+                        self.infos={
+                            "vectorization_method":"model_embedding"
+                        }
+                except Exception as ex:
+                    ASCIIColors.error("Couldn't embed the text, so trying to use tfidf instead.")
+                    trace_exception(ex)
+                    self.infos={
+                        "vectorization_method":"ftidf_vectorizer"
+                    }
         # Load previous state from the JSON file
-        if self.personality_config.save_db:
+        if self.save_db:
             if Path(self.database_file).exists():
                 ASCIIColors.success(f"Database file found : {self.database_file}")
                 self.load_from_json()
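`TFIDFLoader` exists so a fitted tfidf model can cross a JSON boundary. A hedged round-trip sketch, assuming an older scikit-learn where `get_feature_names()` still exists (recent releases renamed it to `get_feature_names_out()`, which the class as written does not handle), and noting that `get_params()` may include values that need massaging before `json.dump`:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "the dog barked at the cat"]
vec = TfidfVectorizer()
vec.fit(docs)

state = TFIDFLoader.create_dict_from_vectorizer(vec)       # vocabulary, idf values, params
restored = TFIDFLoader.create_vectorizer_from_dict(state)  # rebuilt without refitting
print(restored.transform(["the cat"]).toarray())
```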
@@ -56,7 +90,7 @@ class TextVectorizer:
             ASCIIColors.info(f"No database file found : {self.database_file}")
-    def show_document(self, query_text=None):
+    def show_document(self, query_text=None, save_fig_path=None, show_interactive_form=False):
         import textwrap
         import seaborn as sns
         import matplotlib.pyplot as plt
@@ -66,9 +100,8 @@ class TextVectorizer:
         from sklearn.manifold import TSNE
         from sklearn.decomposition import PCA
-        import torch
-        if self.personality_config.data_visualization_method=="PCA":
+        if self.data_visualization_method=="PCA":
             use_pca = True
         else:
             use_pca = False
@@ -80,6 +113,7 @@ class TextVectorizer:
         texts = list(self.texts.values())
         embeddings = self.embeddings
         emb = list(embeddings.values())
+        ref = list(embeddings.keys())
         if len(emb)>=2:
             # Normalize embeddings
             emb = np.vstack(emb)
@@ -94,6 +128,7 @@ class TextVectorizer:
             # Combine the query embedding with the document embeddings
             combined_embeddings = np.vstack((normalized_embeddings, query_normalized_embedding))
+            ref.append("Query_chunk_0")
         else:
             # Combine the query embedding with the document embeddings
             combined_embeddings = normalized_embeddings
@@ -113,13 +148,19 @@ class TextVectorizer:
             tsne = TSNE(n_components=2, perplexity=perplexity)
             embeddings_2d = tsne.fit_transform(combined_embeddings)
+        # Create a dictionary to map document paths to colors
+        document_path_colors = {}
+        for i, path in enumerate(ref):
+            document_path = "_".join(path.split("_")[:-1]) # Extract the document path (excluding chunk and chunk number)
+            if document_path not in document_path_colors:
+                # Assign a new color to the document path if it's not in the dictionary
+                document_path_colors[document_path] = sns.color_palette("hls", len(document_path_colors) + 1)[-1]
+        # Generate a list of colors for each data point based on the document path
+        point_colors = [document_path_colors["_".join(path.split("_")[:-1])] for path in ref]
         # Create a scatter plot using Seaborn
-        if query_text is not None:
-            sns.scatterplot(x=embeddings_2d[:-1, 0], y=embeddings_2d[:-1, 1]) # Plot document embeddings
-            plt.scatter(embeddings_2d[-1, 0], embeddings_2d[-1, 1], color='red') # Plot query embedding
-        else:
-            sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1]) # Plot document embeddings
+        sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=point_colors) # Plot document embeddings
         # Add labels to the scatter plot
         for i, (x, y) in enumerate(embeddings_2d[:-1]):
             plt.text(x, y, str(i), fontsize=8)
@@ -176,11 +217,12 @@ class TextVectorizer:
         # Connect the click event handler to the figure
         plt.gcf().canvas.mpl_connect("button_press_event", on_click)
-        plt.savefig(self.lollms_paths.personal_uploads_path / self.personality.personality_folder_name/ "db.png")
-        plt.show()
+        if save_fig_path:
+            plt.savefig(save_fig_path)
+        if show_interactive_form:
+            plt.show()
-    def index_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
+    def add_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
         if document_id in self.embeddings and not force_vectorize:
             print(f"Document {document_id} already exists. Skipping vectorization.")
             return
@@ -188,15 +230,13 @@ class TextVectorizer:
         # Split tokens into sentences
         sentences = text.split('. ')
         def remove_empty_sentences(sentences):
-            return [sentence for sentence in sentences if sentence.strip() != '']
+            return [self.model.tokenize(sentence) for sentence in sentences if sentence.strip() != '']
         sentences = remove_empty_sentences(sentences)
         # Generate chunks with overlap and sentence boundaries
         chunks = []
         current_chunk = []
         for i in range(len(sentences)):
-            sentence = sentences[i]
-            sentence_tokens = self.model.tokenize(sentence)
+            sentence_tokens = sentences[i]
             # ASCIIColors.yellow(len(sentence_tokens))
             if len(current_chunk) + len(sentence_tokens) <= chunk_size:
@@ -204,45 +244,51 @@ class TextVectorizer:
             else:
                 if current_chunk:
                     chunks.append(current_chunk)
-                while len(sentence_tokens)>chunk_size:
-                    current_chunk = sentence_tokens[0:chunk_size]
-                    sentence_tokens = sentence_tokens[chunk_size:]
-                    chunks.append(current_chunk)
-                current_chunk = sentence_tokens
+                current_chunk=[]
+                for j in reversed(range(overlap_size)):
+                    current_chunk.extend(sentences[i-j-1])
+                current_chunk.extend(sentence_tokens)
         if current_chunk:
             chunks.append(current_chunk)
-        if self.personality_config.vectorization_method=="ftidf_vectorizer":
-            from sklearn.feature_extraction.text import TfidfVectorizer
+        for i, chunk_text in enumerate(chunks):
+            chunk_id = f"{document_id}_chunk_{i + 1}"
+            chunk_dict = {
+                "chunk_id": chunk_id,
+                "chunk_text": chunk_text
+            }
+            self.chunks.append(chunk_dict)
+    def index(self):
+        if self.vectorization_method=="ftidf_vectorizer":
             self.vectorizer = TfidfVectorizer()
-            #if self.personality.config.debug:
+            #if self.debug:
             #    ASCIIColors.yellow(','.join([len(chunk) for chunk in chunks]))
             data=[]
-            for chunk in chunks:
+            for chunk in self.chunks:
                 try:
-                    data.append(self.model.detokenize(chunk).replace("<s>","").replace("</s>","") )
+                    data.append(self.model.detokenize(chunk["chunk_text"]).replace("<s>","").replace("</s>","") )
                 except Exception as ex:
                     print("oups")
             self.vectorizer.fit(data)
         self.embeddings = {}
         # Generate embeddings for each chunk
-        for i, chunk in enumerate(chunks):
+        for i, chunk in enumerate(self.chunks):
             # Store chunk ID, embedding, and original text
-            chunk_id = f"{document_id}_chunk_{i + 1}"
+            chunk_id = chunk["chunk_id"]
+            chunk_text = chunk["chunk_text"]
             try:
-                self.texts[chunk_id] = self.model.detokenize(chunk[:chunk_size])
-                if self.personality_config.vectorization_method=="ftidf_vectorizer":
+                self.texts[chunk_id] = self.model.detokenize(chunk_text)
+                if self.vectorization_method=="ftidf_vectorizer":
                     self.embeddings[chunk_id] = self.vectorizer.transform([self.texts[chunk_id]]).toarray()
                 else:
                     self.embeddings[chunk_id] = self.model.embed(self.texts[chunk_id])
             except Exception as ex:
                 print("oups")
-        if self.personality_config.save_db:
+        if self.save_db:
             self.save_to_json()
         self.ready = True
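The rewritten chunker works on token lists: sentences are tokenized up front, packed into chunks of at most `chunk_size` tokens, and each new chunk is seeded with the previous `overlap_size` sentences. A standalone sketch of the same packing logic, with a whitespace tokenizer standing in for the model's tokenizer and an added bounds check the original omits:

```python
def chunk_sentences(sentences, chunk_size=512, overlap_size=1, tokenize=str.split):
    """Pack tokenized sentences into chunks of at most chunk_size tokens,
    seeding each new chunk with the previous overlap_size sentences."""
    tokenized = [tokenize(s) for s in sentences if s.strip()]
    chunks, current = [], []
    for i, sentence_tokens in enumerate(tokenized):
        if len(current) + len(sentence_tokens) <= chunk_size:
            current.extend(sentence_tokens)
        else:
            if current:
                chunks.append(current)
            current = []
            for j in reversed(range(overlap_size)):   # carry sentence-level overlap
                if i - j - 1 >= 0:
                    current.extend(tokenized[i - j - 1])
            current.extend(sentence_tokens)
    if current:
        chunks.append(current)
    return chunks

print(len(chunk_sentences(["word word word word word"] * 40, chunk_size=20, overlap_size=1)))
```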
@@ -252,10 +298,14 @@ class TextVectorizer:
     def embed_query(self, query_text):
         # Generate query embedding
-        if self.personality_config.vectorization_method=="ftidf_vectorizer":
+        if self.vectorization_method=="ftidf_vectorizer":
             query_embedding = self.vectorizer.transform([query_text]).toarray()
         else:
             query_embedding = self.model.embed(query_text)
+            if query_embedding is None:
+                ASCIIColors.warning("The model doesn't implement embedding extraction")
+                self.vectorization_method="ftidf_vectorizer"
+                query_embedding = self.vectorizer.transform([query_text]).toarray()
         return query_embedding
@@ -277,6 +327,18 @@ class TextVectorizer:
         return texts, sorted_similarities
+    def toJson(self):
+        state = {
+            "embeddings": {str(k): v.tolist() if type(v)!=list else v for k, v in self.embeddings.items() },
+            "texts": self.texts,
+            "infos": self.infos,
+            "vectorizer": TFIDFLoader.create_dict_from_vectorizer(self.vectorizer) if self.vectorization_method=="ftidf_vectorizer" else None
+        }
+        return state
+    def setVectorizer(self, vectorizer_dict:dict):
+        self.vectorizer = TFIDFLoader.create_vectorizer_from_dict(vectorizer_dict)
     def save_to_json(self):
         state = {
             "embeddings": {str(k): v.tolist() if type(v)!=list else v for k, v in self.embeddings.items() },
@@ -295,7 +357,7 @@ class TextVectorizer:
         self.texts = state["texts"]
         self.infos= state["infos"]
         self.ready = True
-        if self.personality_config.vectorization_method=="ftidf_vectorizer":
+        if self.vectorization_method=="ftidf_vectorizer":
             from sklearn.feature_extraction.text import TfidfVectorizer
             data = list(self.texts.values())
             if len(data)>0:
@@ -304,11 +366,13 @@ class TextVectorizer:
                 self.embeddings={}
                 for k,v in self.texts.items():
                     self.embeddings[k]= self.vectorizer.transform([v]).toarray()
     def clear_database(self):
         self.vectorizer=None
         self.embeddings = {}
         self.texts={}
-        if self.personality_config.save_db:
+        if self.save_db:
             self.save_to_json()

View File

@@ -8,4 +8,9 @@ simple-websocket
 eventlet
 wget
 setuptools
 requests
+matplotlib
+seaborn
+mplcursors
+scikit-learn

View File

@@ -26,7 +26,7 @@ def get_all_files(path):
 setuptools.setup(
     name="lollms",
-    version="2.1.56",
+    version="2.1.59",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",