diff --git a/configs/config.yaml b/configs/config.yaml index e101f8d..93dad81 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 122 +version: 123 binding_name: null model_name: null model_variant: null @@ -247,6 +247,8 @@ rag_vectorizer: bert # possible values bert, tfidf, word2vec rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable rag_vectorizer_parameters: null # Parameters of the model in json format rag_chunk_size: 512 # number of tokens per chunk +rag_overlap: 0 # number of tokens of overlap + rag_n_chunks: 4 #Number of chunks to recover from the database rag_clean_chunks: true #Removed all uinecessary spaces and line returns rag_follow_subfolders: true #if true the vectorizer will vectorize the content of subfolders too diff --git a/lollms/app.py b/lollms/app.py index ccd231c..bfcb952 100644 --- a/lollms/app.py +++ b/lollms/app.py @@ -14,6 +14,8 @@ from lollms.client_session import Client, Session from lollms.databases.skills_database import SkillsLibrary from lollms.tasks import TasksLibrary from safe_store import TextVectorizer, VectorizationMethod, VisualizationMethod + +from lollmsvectordb.database_elements.chunk import Chunk from typing import Callable from pathlib import Path from datetime import datetime @@ -905,7 +907,7 @@ class LollmsApplication(LoLLMsCom): if len(conditionning)>0: conditionning = self.start_header_id_template + system_message_template + self.end_header_id_template + self.personality.replace_keys(conditionning, self.personality.conditionning_commands) + ("" if conditionning[-1]==self.separator_template else self.separator_template) - + # Check if there are document files to add to the prompt internet_search_results = "" internet_search_infos = [] @@ -1093,25 +1095,12 @@ class LollmsApplication(LoLLMsCom): query = current_message.content try: - if 
self.config.data_vectorization_force_first_chunk and len(client.discussion.vectorizer.chunks)>0: - doc_index = list(client.discussion.vectorizer.chunks.keys())[0] - - doc_id = client.discussion.vectorizer.chunks[doc_index]['document_id'] - content = client.discussion.vectorizer.chunks[doc_index]['chunk_text'] - + chunks:list[Chunk] = client.discussion.vectorizer.search(query, int(self.config.rag_n_chunks)) + for chunk in chunks: if self.config.data_vectorization_put_chunk_informations_into_context: - documentation += f"{self.start_header_id_template}document chunk{self.end_header_id_template}\nchunk_infos:{doc_id}\ncontent:{content}\n" + documentation += f"{self.start_header_id_template}document chunk{self.end_header_id_template}\ndocument title: {chunk.doc.title}\nchunk content:\n{chunk.text}\n" else: - documentation += f"{self.start_header_id_template}chunk{self.end_header_id_template}\n{content}\n" - - docs, sorted_similarities, document_ids = client.discussion.vectorizer.recover_text(query, top_k=int(self.config.data_vectorization_nb_chunks)) - for doc, infos in zip(docs, sorted_similarities): - if self.config.data_vectorization_force_first_chunk and len(client.discussion.vectorizer.chunks)>0 and infos[0]==doc_id: - continue - if self.config.data_vectorization_put_chunk_informations_into_context: - documentation += f"{self.start_header_id_template}document chunk{self.end_header_id_template}\nchunk path: {infos[0]}\nchunk content:\n{doc}\n" - else: - documentation += f"{self.start_header_id_template}chunk{self.end_header_id_template}\n{doc}\n" + documentation += f"{self.start_header_id_template}chunk{self.end_header_id_template}\n{chunk.text}\n" documentation += f"{self.separator_template}{self.start_header_id_template}important information: Use the documentation data to answer the user questions. If the data is not present in the documentation, please tell the user that the information he is asking for does not exist in the documentation section.
It is strictly forbidden to give the user an answer without having actual proof from the documentation.\n" except Exception as ex: diff --git a/lollms/com.py b/lollms/com.py index c1bed36..bfd7680 100644 --- a/lollms/com.py +++ b/lollms/com.py @@ -48,7 +48,8 @@ class LoLLMsCom: self.rt_com = None - + self.model = None + def InfoMessage(self, content, client_id=None, verbose:bool=None): self.notify( content, diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml index e101f8d..93dad81 100644 --- a/lollms/configs/config.yaml +++ b/lollms/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 122 +version: 123 binding_name: null model_name: null model_variant: null @@ -247,6 +247,8 @@ rag_vectorizer: bert # possible values bert, tfidf, word2vec rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable rag_vectorizer_parameters: null # Parameters of the model in json format rag_chunk_size: 512 # number of tokens per chunk +rag_overlap: 0 # number of tokens of overlap + rag_n_chunks: 4 #Number of chunks to recover from the database rag_clean_chunks: true #Removed all uinecessary spaces and line returns rag_follow_subfolders: true #if true the vectorizer will vectorize the content of subfolders too diff --git a/lollms/databases/discussions_database.py b/lollms/databases/discussions_database.py index 51f684d..8848fcf 100644 --- a/lollms/databases/discussions_database.py +++ b/lollms/databases/discussions_database.py @@ -10,6 +10,11 @@ from lollms.paths import LollmsPaths from lollms.databases.skills_database import SkillsLibrary from lollms.com import LoLLMsCom from safe_store import TextVectorizer, VisualizationMethod, GenericDataLoader +from lollmsvectordb.vector_database import VectorDatabase +from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer +from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import 
TFIDFVectorizer +from lollmsvectordb.text_document_loader import TextDocumentsLoader +import gc import json import shutil from lollms.tasks import TasksLibrary @@ -651,26 +656,28 @@ class Discussion: # Initialize the file lists self.update_file_lists() - - self.vectorizer = TextVectorizer( - self.lollms.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer" - model=self.lollms.model, #needed in case of using model_embedding - database_path=self.discussion_rag_folder/"db.json", - save_db=self.lollms.config.data_vectorization_save_db, - data_visualization_method=VisualizationMethod.PCA, - database_dict=None) - - if len(self.vectorizer.chunks)==0 and len(self.text_files)>0: - for path in self.text_files: - data = GenericDataLoader.read_file(path) + if len(self.text_files)>0: + self.vectorizer = VectorDatabase( + self.discussion_rag_folder/"db.sqli", + BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(), + self.lollms.model, + chunk_size=self.lollms.config.rag_chunk_size, + overlap=self.lollms.config.rag_overlap + ) + + if len(self.vectorizer.list_documents())==0 and len(self.text_files)>0: + for path in self.text_files: + data = GenericDataLoader.read_file(path) + try: + self.vectorizer.add_document(path.stem, data, path, True) + except Exception as ex: + trace_exception(ex) try: - self.vectorizer.add_document(path, data, self.lollms.config.data_vectorization_chunk_size, self.lollms.config.data_vectorization_overlap_size, add_first_line_to_all_chunks=True if path.suffix==".csv" else False) + self.vectorizer.build_index() except Exception as ex: - trace_exception(ex) - try: - self.vectorizer.index() - except Exception as ex: - trace_exception(ex) + trace_exception(ex) + else: + self.vectorizer = None def update_file_lists(self): self.text_files = [Path(file) for file in self.discussion_text_folder.glob('*')] @@ -685,6 +692,13 @@ class Discussion: if any(file_name == 
entry.name for entry in self.text_files): fn = [entry for entry in self.text_files if entry.name == file_name][0] self.text_files = [entry for entry in self.text_files if entry.name != file_name] + try: + text = TextDocumentsLoader.read_file(fn) + hash = self.vectorizer._hash_document(text) + self.vectorizer.remove_document(hash) + except Exception as ex: + trace_exception(ex) + Path(fn).unlink() if len(self.text_files)>0: try: @@ -713,14 +727,28 @@ class Discussion: def remove_all_files(self): # Iterate over each directory and remove all files for path in [self.discussion_images_folder, self.discussion_rag_folder, self.discussion_audio_folder, self.discussion_text_folder]: + for file in path.glob('*'): - if file.is_file(): # Ensure it's a file, not a directory + if file.is_file() and file.suffix!=".sqli": # Ensure it's a file, not a directory + try: + text = TextDocumentsLoader.read_file(file) + hash = self.vectorizer._hash_document(text) + self.vectorizer.remove_document(hash) + except Exception as ex: + trace_exception(ex) file.unlink() # Delete the file # Clear the lists to reflect the current state (empty directories) self.text_files.clear() self.image_files.clear() self.audio_files.clear() + self.vectorizer = None + gc.collect() + fn = self.discussion_rag_folder/"db.sqli" + try: + fn.unlink() + except Exception as ex: + trace_exception(ex) def add_file(self, path, client, tasks_library:TasksLibrary, callback=None, process=True): output = "" @@ -787,16 +815,14 @@ class Discussion: self.lollms.ShowBlockingMessage("Processing file\nPlease wait ...") if process: if self.vectorizer is None: - self.vectorizer = TextVectorizer( - self.lollms.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer" - model=self.lollms.model, #needed in case of using model_embedding - database_path=self.discussion_rag_folder/"db.json", - save_db=self.lollms.config.data_vectorization_save_db, - data_visualization_method=VisualizationMethod.PCA, - 
database_dict=None) - data = GenericDataLoader.read_file(path) - self.vectorizer.add_document(path, data, self.lollms.config.data_vectorization_chunk_size, self.lollms.config.data_vectorization_overlap_size, add_first_line_to_all_chunks=True if path.suffix==".csv" else False) - self.vectorizer.index() + self.vectorizer = VectorDatabase( + self.discussion_rag_folder/"db.sqli", + BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(), + self.lollms.model, chunk_size=self.lollms.config.rag_chunk_size, overlap=self.lollms.config.rag_overlap + ) + data = TextDocumentsLoader.read_file(path) + self.vectorizer.add_document(path.stem, data, path, True) + self.vectorizer.build_index() if callback is not None: callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO) self.lollms.HideBlockingMessage(client.client_id) diff --git a/lollms/personality.py b/lollms/personality.py index 1f70f3e..6e7c631 100644 --- a/lollms/personality.py +++ b/lollms/personality.py @@ -16,6 +16,10 @@ from lollms.binding import LLMBinding, BindingType from lollms.utilities import PromptReshaper, PackageManager, discussion_path_to_url, process_ai_output, remove_text_from_string from lollms.com import NotificationType, NotificationDisplayType from lollms.client_session import Session, Client +from lollmsvectordb.vector_database import VectorDatabase +from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer +from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer +from lollmsvectordb.text_document_loader import TextDocumentsLoader import pkg_resources from pathlib import Path @@ -1081,16 +1085,16 @@ class AIPersonality: self.ShowBlockingMessage("Processing file\nPlease wait ...") if process: if self.vectorizer is None: - self.vectorizer = TextVectorizer( - self.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer" - model=self.model, #needed in case of using model_embedding - database_path=client.discussion.discussion_rag_folder/"db.json", - 
save_db=self.config.data_vectorization_save_db, - data_visualization_method=VisualizationMethod.PCA, - database_dict=None) - data = GenericDataLoader.read_file(path) - self.vectorizer.add_document(path, data, self.config.data_vectorization_chunk_size, self.config.data_vectorization_overlap_size, add_first_line_to_all_chunks=True if path.suffix==".csv" else False) - self.vectorizer.index() + self.vectorizer = VectorDatabase( + client.discussion.discussion_rag_folder/"db.sqli", + BERTVectorizer(self.config.rag_vectorizer_model) if self.config.rag_vectorizer=="bert" else TFIDFVectorizer(), + self.model, + chunk_size=self.config.rag_chunk_size, + overlap=self.config.rag_overlap + ) + data = TextDocumentsLoader.read_file(path) + self.vectorizer.add_document(path.stem, data, path, True) + self.vectorizer.build_index() if callback is not None: callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO) self.HideBlockingMessage(client.client_id)