Mirror of https://github.com/ParisNeo/lollms.git, synced 2024-12-19 20:57:58 +00:00

Commit 3b808e65e6 (parent 1e963ab4d4): Moved to new vectordb
```diff
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 142
+version: 143
 
 # video viewing and news recovering
 last_viewed_video: null
```
```diff
@@ -280,7 +280,7 @@ audio_silenceTimer: 5000
 
 # Data vectorization
 rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
-rag_vectorizer: tfidf # possible values semantic, tfidf, openai
+rag_vectorizer: semantic # possible values semantic, tfidf, openai
 rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
 rag_vectorizer_parameters: null # Parameters of the model in json format
 rag_chunk_size: 512 # number of tokens per chunk
```
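For orientation, a minimal sketch of reading the `rag_*` keys above with plain pyyaml; lollms actually loads this file through its own configuration class, so the file path and access pattern here are illustrative assumptions:

```python
# Minimal sketch, assuming plain pyyaml; lollms wraps this file in its own
# configuration object, so treat this as illustration only.
import yaml

with open("config.yaml") as f:        # path is illustrative
    cfg = yaml.safe_load(f)

print(cfg["rag_vectorizer"])          # "semantic" after this commit
print(cfg["rag_vectorizer_model"])    # model used when the vectorizer is semantic
print(cfg["rag_chunk_size"])          # 512 tokens per chunk
```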
```diff
@@ -309,7 +309,7 @@ data_vectorization_visualize_on_vectorization: false
 data_vectorization_activate: true # To activate/deactivate data vectorization
 data_vectorization_method: "tfidf_vectorizer" #"model_embedding" or "tfidf_vectorizer"
 data_visualization_method: "PCA" #"PCA" or "TSNE"
-data_vectorization_sentense_transformer_model: "all-MiniLM-L6-v2" # you can use another model by setting its name here or its path
+data_vectorization_sentense_transformer_model: "BAAI/bge-m3" # you can use another model by setting its name here or its path
 
 data_vectorization_save_db: true # For each new session, new files
 data_vectorization_chunk_size: 512 # chunk size
```
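The default embedding model moves from all-MiniLM-L6-v2 to BAAI/bge-m3. A hedged sketch of loading it directly with the sentence-transformers package (lollms may load it through lollmsvectordb instead; the input sentences are illustrative):

```python
# Sketch only: loading the new default embedding model by name.
# Assumes the sentence-transformers package; BAAI/bge-m3 is fetched from
# the Hugging Face hub on first use.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-m3")
vectors = model.encode(["a skill description", "a user query"])
print(vectors.shape)  # (2, embedding_dim)
```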
```diff
@@ -1271,8 +1271,8 @@ The reformulation must be placed inside a json markdown tag like this:
         self.personality.step_start("Adding skills")
         if self.config.debug:
             ASCIIColors.info(f"Query : {query}")
-        skill_titles, skills = self.skills_library.query_vector_db(query, top_k=3, min_dist=self.config.rag_min_correspondance)#query_entry_fts(query)
-        knowledge_infos={"titles":skill_titles,"contents":skills}
+        skill_titles, skills, similarities = self.skills_library.query_vector_db(query, top_k=3, min_similarity=self.config.rag_min_correspondance)#query_entry_fts(query)
+        knowledge_infos={"titles":skill_titles,"contents":skills, "similarities":similarities}
         if len(skills)>0:
             if knowledge=="":
                 knowledge=f"{self.system_custom_header(knowledge)}\n"
```
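The call site now unpacks three values and passes min_similarity instead of min_dist. A usage sketch of the updated contract, assuming an existing SkillsLibrary instance (the query string and threshold are illustrative):

```python
# Usage sketch for the updated signature: titles, contents and similarity
# scores come back in parallel lists; chunks below min_similarity are dropped.
titles, skills, similarities = skills_library.query_vector_db(
    "how do I chunk documents for RAG?",  # illustrative query
    top_k=3,
    min_similarity=0.5,                   # similarity = 1 - distance
)
for title, text, sim in zip(titles, skills, similarities):
    print(f"{sim:.2f}  {title}")
```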
(The same three configuration hunks above are repeated verbatim in this commit for a second, identical copy of the configuration file; they are not reproduced again here.)
```diff
@@ -5,11 +5,38 @@ import numpy as np
 from ascii_colors import ASCIIColors
 class SkillsLibrary:
 
-    def __init__(self, db_path, chunk_size:int=512, overlap:int=0, n_neighbors:int=5):
+    def __init__(self, db_path, chunk_size:int=512, overlap:int=0, n_neighbors:int=5, config=None):
         self.db_path =db_path
+        self.config = config
         self._initialize_db()
-        self.vectorizer = VectorDatabase(db_path, TFIDFVectorizer(), TikTokenTokenizer(),chunk_size, overlap, n_neighbors)
+        from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
+        self.config = config
+        if config is not None:
+            vectorizer = self.config.rag_vectorizer
+            if vectorizer == "semantic":
+                from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
+                v = SemanticVectorizer(self.config.rag_vectorizer_model)
+            elif vectorizer == "tfidf":
+                from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
+                v = TFIDFVectorizer()
+            elif vectorizer == "openai":
+                from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
+                v = OpenAIVectorizer()
+        else:
+            from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
+            v = SemanticVectorizer("BAAI/bge-m3")
+
+        self.vectorizer = VectorDatabase("", v, TikTokenTokenizer(),chunk_size, overlap, n_neighbors)
         ASCIIColors.green("Vecorizer ready")
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        cursor.execute("SELECT * FROM skills_library")
+        res = cursor.fetchall()
+        for entry in res:
+            self.vectorizer.add_document(entry[3], entry[4], "",True, category_id=entry[2])
+        self.vectorizer.build_index()
+        cursor.close()
+        conn.close()
 
 
     def _initialize_db(self):
```
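With the new config parameter, the vectorizer is chosen at construction time. A minimal sketch, using SimpleNamespace as a stand-in for the real lollms configuration object (the attribute names come from the diff above; the database path is illustrative):

```python
# Sketch: any object exposing rag_vectorizer and rag_vectorizer_model works;
# SimpleNamespace stands in for the real lollms config here.
from types import SimpleNamespace

cfg = SimpleNamespace(
    rag_vectorizer="semantic",
    rag_vectorizer_model="BAAI/bge-m3",
)
library = SkillsLibrary("skills.db", chunk_size=512, overlap=0,
                        n_neighbors=5, config=cfg)
```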
```diff
@@ -90,6 +117,8 @@ class SkillsLibrary:
         conn.commit()
         cursor.close()
         conn.close()
+        self.vectorizer.add_document(title, content, "",True)
+        self.vectorizer.build_index()
 
     def list_entries(self):
         conn = sqlite3.connect(self.db_path)
```
```diff
@@ -124,18 +153,20 @@ class SkillsLibrary:
         conn.close()
         return res
 
-    def query_vector_db(self, query_, top_k=3, min_dist=0):
+    def query_vector_db(self, query_, top_k=3, min_similarity=0):
         # Use direct string concatenation for the MATCH expression.
         # Ensure text is safely escaped to avoid SQL injection.
         skills = []
+        similarities = []
         skill_titles = []
         chunks = self.vectorizer.search(query_, top_k)
         for chunk in chunks:
-            if chunk.distance>min_dist:
+            if 1-chunk.distance>min_similarity:
                 skills.append(chunk.text)
+                similarities.append(1-chunk.distance)
                 skill_titles.append(chunk.doc.title)
 
-        return skill_titles, skills
+        return skill_titles, skills, similarities
 
 
     def dump(self):
```
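The filter now works in similarity space: similarity = 1 - distance, which assumes the search returns a distance normalized to [0, 1], as with cosine distance. A standalone illustration of the conversion and thresholding:

```python
# Assumes distances normalized to [0, 1] (e.g. cosine distance),
# so similarity = 1 - distance.
def passes(distance: float, min_similarity: float) -> bool:
    similarity = 1.0 - distance
    return similarity > min_similarity

assert passes(0.1, 0.5)       # close chunk (similarity 0.9) is kept
assert not passes(0.8, 0.5)   # far chunk (similarity 0.2) is dropped
```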
```diff
@@ -231,6 +262,7 @@ class SkillsLibrary:
         conn.commit()
         cursor.close()
         conn.close()
+        self.vectorizer.remove_document_by_id(id)
 
     def export_entries(self, file_path):
         with open(file_path, 'w') as f:
```