From 488229b9128126f5f5d99d4ac301eec214fb0ff9 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 10 Jul 2024 01:34:23 +0200 Subject: [PATCH] moved skills database to the new system --- lollms/app.py | 2 +- lollms/databases/skills_database.py | 43 ++++++++++++++++------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/lollms/app.py b/lollms/app.py index 4c70099..92e2936 100644 --- a/lollms/app.py +++ b/lollms/app.py @@ -263,7 +263,7 @@ class LollmsApplication(LoLLMsCom): def _generate_text(self, prompt): - max_tokens = self.config.ctx_size - self.model.get_nb_tokens(prompt) + max_tokens = min(self.config.ctx_size - self.model.get_nb_tokens(prompt),self.config.max_n_predict) generated_text = self.model.generate(prompt, max_tokens) return generated_text.strip() diff --git a/lollms/databases/skills_database.py b/lollms/databases/skills_database.py index 93e9dd3..ec689ac 100644 --- a/lollms/databases/skills_database.py +++ b/lollms/databases/skills_database.py @@ -1,11 +1,15 @@ import sqlite3 -from safe_store.text_vectorizer import TextVectorizer, VectorizationMethod, VisualizationMethod +from lollmsvectordb import VectorDatabase, BERTVectorizer +from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer import numpy as np +from ascii_colors import ASCIIColors class SkillsLibrary: - def __init__(self, db_path): + def __init__(self, db_path, model_name: str = 'bert-base-nli-mean-tokens', chunk_size:int=512, overlap:int=0, n_neighbors:int=5): self.db_path =db_path self._initialize_db() + self.vectorizer = VectorDatabase("", BERTVectorizer(), TikTokenTokenizer(),chunk_size, overlap, n_neighbors) + ASCIIColors.green("Vecorizer ready") def _initialize_db(self): @@ -121,37 +125,38 @@ class SkillsLibrary: return res def query_vector_db(self, query_, top_k=3, max_dist=1000): - vectorizer = TextVectorizer(VectorizationMethod.TFIDF_VECTORIZER) conn = sqlite3.connect(self.db_path) cursor = conn.cursor() # Use direct string concatenation for the MATCH expression. # Ensure text is safely escaped to avoid SQL injection. - query = "SELECT id, title FROM skills_library" + query = "SELECT id, title, content FROM skills_library" cursor.execute(query) res = cursor.fetchall() cursor.close() conn.close() skills = [] + skill_titles = [] if len(res)>0: for entry in res: - vectorizer.add_document(entry[0],entry[1]) - vectorizer.index() + self.vectorizer.add_document(entry[0],"Title:"+entry[1]+"\n"+entry[2]) + self.vectorizer.build_index() - skill_titles, sorted_similarities, document_ids = vectorizer.recover_text(query_, top_k) - for skill_title, sim, id in zip(skill_titles, sorted_similarities, document_ids): - if np.linalg.norm(sim[1])