From 3b808e65e61133116eba90eb17e19b7f5af79f88 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Tue, 3 Dec 2024 00:39:08 +0100 Subject: [PATCH] Moved to new vectordb --- configs/config.yaml | 6 ++--- lollms/app.py | 4 +-- lollms/configs/config.yaml | 6 ++--- lollms/databases/skills_database.py | 42 +++++++++++++++++++++++++---- 4 files changed, 45 insertions(+), 13 deletions(-) diff --git a/configs/config.yaml b/configs/config.yaml index 86f1caf..80f5ae7 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 142 +version: 143 # video viewing and news recovering last_viewed_video: null @@ -280,7 +280,7 @@ audio_silenceTimer: 5000 # Data vectorization rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data -rag_vectorizer: tfidf # possible values semantic, tfidf, openai +rag_vectorizer: semantic # possible values semantic, tfidf, openai rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable rag_vectorizer_parameters: null # Parameters of the model in json format rag_chunk_size: 512 # number of tokens per chunk @@ -309,7 +309,7 @@ data_vectorization_visualize_on_vectorization: false data_vectorization_activate: true # To activate/deactivate data vectorization data_vectorization_method: "tfidf_vectorizer" #"model_embedding" or "tfidf_vectorizer" data_visualization_method: "PCA" #"PCA" or "TSNE" -data_vectorization_sentense_transformer_model: "all-MiniLM-L6-v2" # you can use another model by setting its name here or its path +data_vectorization_sentense_transformer_model: "BAAI/bge-m3" # you can use another model by setting its name here or its path data_vectorization_save_db: true # For each new session, new files data_vectorization_chunk_size: 512 # chunk size diff --git a/lollms/app.py b/lollms/app.py index ccefc6c..1abe9ad 
100644 --- a/lollms/app.py +++ b/lollms/app.py @@ -1271,8 +1271,8 @@ The reformulation must be placed inside a json markdown tag like this: self.personality.step_start("Adding skills") if self.config.debug: ASCIIColors.info(f"Query : {query}") - skill_titles, skills = self.skills_library.query_vector_db(query, top_k=3, min_dist=self.config.rag_min_correspondance)#query_entry_fts(query) - knowledge_infos={"titles":skill_titles,"contents":skills} + skill_titles, skills, similarities = self.skills_library.query_vector_db(query, top_k=3, min_similarity=self.config.rag_min_correspondance)#query_entry_fts(query) + knowledge_infos={"titles":skill_titles,"contents":skills, "similarities":similarities} if len(skills)>0: if knowledge=="": knowledge=f"{self.system_custom_header(knowledge)}\n" diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml index 86f1caf..80f5ae7 100644 --- a/lollms/configs/config.yaml +++ b/lollms/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 142 +version: 143 # video viewing and news recovering last_viewed_video: null @@ -280,7 +280,7 @@ audio_silenceTimer: 5000 # Data vectorization rag_databases: [] # This is the list of paths to database sources. 
Each database is a folder containing data -rag_vectorizer: tfidf # possible values semantic, tfidf, openai +rag_vectorizer: semantic # possible values semantic, tfidf, openai rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable rag_vectorizer_parameters: null # Parameters of the model in json format rag_chunk_size: 512 # number of tokens per chunk @@ -309,7 +309,7 @@ data_vectorization_visualize_on_vectorization: false data_vectorization_activate: true # To activate/deactivate data vectorization data_vectorization_method: "tfidf_vectorizer" #"model_embedding" or "tfidf_vectorizer" data_visualization_method: "PCA" #"PCA" or "TSNE" -data_vectorization_sentense_transformer_model: "all-MiniLM-L6-v2" # you can use another model by setting its name here or its path +data_vectorization_sentense_transformer_model: "BAAI/bge-m3" # you can use another model by setting its name here or its path data_vectorization_save_db: true # For each new session, new files data_vectorization_chunk_size: 512 # chunk size diff --git a/lollms/databases/skills_database.py b/lollms/databases/skills_database.py index ffcc465..82c93c2 100644 --- a/lollms/databases/skills_database.py +++ b/lollms/databases/skills_database.py @@ -5,11 +5,37 @@ import numpy as np from ascii_colors import ASCIIColors class SkillsLibrary: - def __init__(self, db_path, chunk_size:int=512, overlap:int=0, n_neighbors:int=5): + def __init__(self, db_path, chunk_size:int=512, overlap:int=0, n_neighbors:int=5, config=None): self.db_path =db_path + self.config = config self._initialize_db() - self.vectorizer = VectorDatabase(db_path, TFIDFVectorizer(), TikTokenTokenizer(),chunk_size, overlap, n_neighbors) + from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer + if config is not None: + vectorizer = self.config.rag_vectorizer + if vectorizer == "semantic": + from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import
SemanticVectorizer + v = SemanticVectorizer(self.config.rag_vectorizer_model) + elif vectorizer == "tfidf": + from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer + v = TFIDFVectorizer() + elif vectorizer == "openai": + from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer + v = OpenAIVectorizer() + else: + from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer + v = SemanticVectorizer("BAAI/bge-m3") + + self.vectorizer = VectorDatabase("", v, TikTokenTokenizer(),chunk_size, overlap, n_neighbors) ASCIIColors.green("Vecorizer ready") + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + cursor.execute("SELECT * FROM skills_library") + res = cursor.fetchall() + for entry in res: + self.vectorizer.add_document(entry[3], entry[4], "",True, category_id=entry[2]) + self.vectorizer.build_index() + cursor.close() + conn.close() def _initialize_db(self): @@ -90,6 +117,8 @@ class SkillsLibrary: conn.commit() cursor.close() conn.close() + self.vectorizer.add_document(title, content, "",True) + self.vectorizer.build_index() def list_entries(self): conn = sqlite3.connect(self.db_path) @@ -124,18 +153,20 @@ class SkillsLibrary: conn.close() return res - def query_vector_db(self, query_, top_k=3, min_dist=0): + def query_vector_db(self, query_, top_k=3, min_similarity=0): # Use direct string concatenation for the MATCH expression. # Ensure text is safely escaped to avoid SQL injection. 
skills = [] + similarities = [] skill_titles = [] chunks = self.vectorizer.search(query_, top_k) for chunk in chunks: - if chunk.distance>min_dist: + if 1-chunk.distance>min_similarity: skills.append(chunk.text) + similarities.append(1-chunk.distance) skill_titles.append(chunk.doc.title) - return skill_titles, skills + return skill_titles, skills, similarities def dump(self): @@ -231,6 +262,7 @@ class SkillsLibrary: conn.commit() cursor.close() conn.close() + self.vectorizer.remove_document_by_id(id) def export_entries(self, file_path): with open(file_path, 'w') as f: