diff --git a/lollms/app.py b/lollms/app.py index 4c70099..92e2936 100644 --- a/lollms/app.py +++ b/lollms/app.py @@ -263,7 +263,7 @@ class LollmsApplication(LoLLMsCom): def _generate_text(self, prompt): - max_tokens = self.config.ctx_size - self.model.get_nb_tokens(prompt) + max_tokens = min(self.config.ctx_size - self.model.get_nb_tokens(prompt),self.config.max_n_predict) generated_text = self.model.generate(prompt, max_tokens) return generated_text.strip() diff --git a/lollms/databases/skills_database.py b/lollms/databases/skills_database.py index 93e9dd3..ec689ac 100644 --- a/lollms/databases/skills_database.py +++ b/lollms/databases/skills_database.py @@ -1,11 +1,15 @@ import sqlite3 -from safe_store.text_vectorizer import TextVectorizer, VectorizationMethod, VisualizationMethod +from lollmsvectordb import VectorDatabase, BERTVectorizer +from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer import numpy as np +from ascii_colors import ASCIIColors class SkillsLibrary: - def __init__(self, db_path): + def __init__(self, db_path, model_name: str = 'bert-base-nli-mean-tokens', chunk_size:int=512, overlap:int=0, n_neighbors:int=5): self.db_path =db_path self._initialize_db() + self.vectorizer = VectorDatabase("", BERTVectorizer(model_name), TikTokenTokenizer(),chunk_size, overlap, n_neighbors) + ASCIIColors.green("Vectorizer ready") def _initialize_db(self): @@ -121,37 +125,38 @@ class SkillsLibrary: return res def query_vector_db(self, query_, top_k=3, max_dist=1000): - vectorizer = TextVectorizer(VectorizationMethod.TFIDF_VECTORIZER) conn = sqlite3.connect(self.db_path) cursor = conn.cursor() # Use direct string concatenation for the MATCH expression. # Ensure text is safely escaped to avoid SQL injection. 
- query = "SELECT id, title FROM skills_library" + query = "SELECT id, title, content FROM skills_library" cursor.execute(query) res = cursor.fetchall() cursor.close() conn.close() skills = [] + skill_titles = [] if len(res)>0: for entry in res: - vectorizer.add_document(entry[0],entry[1]) - vectorizer.index() + self.vectorizer.add_document(entry[0],"Title:"+entry[1]+"\n"+entry[2]) + self.vectorizer.build_index() - skill_titles, sorted_similarities, document_ids = vectorizer.recover_text(query_, top_k) - for skill_title, sim, id in zip(skill_titles, sorted_similarities, document_ids): - if np.linalg.norm(sim[1])<max_dist: - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() + chunks = self.vectorizer.search(query_, top_k) + for chunk in chunks: + if chunk.distance<max_dist: + skills.append(chunk.text) + skill_titles.append(chunk.doc.title) + # conn = sqlite3.connect(self.db_path) + # cursor = conn.cursor() # Use direct string concatenation for the MATCH expression. # Ensure text is safely escaped to avoid SQL injection. - query = "SELECT content FROM skills_library WHERE id = ?" - cursor.execute(query, (id,)) - res = cursor.fetchall() - skills.append(res[0]) - cursor.close() - conn.close() - else: - skill_titles = [] + #query = "SELECT content FROM skills_library WHERE id = ?" + #cursor.execute(query, (chunk.chunk_id,)) + #res = cursor.fetchall() + #skills.append(res[0]) + #cursor.close() + #conn.close() + return skill_titles, skills