Moved to new RAG system

Saifeddine ALOUI 2024-06-28 02:45:15 +02:00
parent 9cbcfbb282
commit d670770464
6 changed files with 84 additions and 60 deletions

View File

@@ -1,5 +1,5 @@
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
version: 122
version: 123
binding_name: null
model_name: null
model_variant: null
@@ -247,6 +247,8 @@ rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_overlap: 0 # number of tokens of overlap
rag_n_chunks: 4 #Number of chunks to recover from the database
rag_clean_chunks: true #Removes all unnecessary spaces and line returns
rag_follow_subfolders: true #if true the vectorizer will vectorize the content of subfolders too
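
To make the mapping concrete, here is a minimal sketch of how these new rag_* settings feed the VectorDatabase introduced by this commit. The dictionary, the db path, and the `model = None` placeholder are illustrative assumptions; the classes and constructor arguments are the ones imported and used elsewhere in this diff.

```python
# Minimal sketch: wiring the new rag_* settings into lollmsvectordb.
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer

config = {
    "rag_vectorizer": "bert",                          # bert or tfidf
    "rag_vectorizer_model": "bert-base-nli-mean-tokens",
    "rag_chunk_size": 512,                             # tokens per chunk
    "rag_overlap": 0,                                  # tokens shared between consecutive chunks
    "rag_n_chunks": 4,                                 # chunks returned per query
}

# Pick the vectorizer exactly as the new Discussion/AIPersonality code does.
vectorizer = (
    BERTVectorizer(config["rag_vectorizer_model"])
    if config["rag_vectorizer"] == "bert"
    else TFIDFVectorizer()
)

model = None  # the commit passes the loaded LLM here (used for tokenization)

db = VectorDatabase(
    "db.sqli",  # SQLite-backed store, the file name used throughout this commit
    vectorizer,
    model,
    chunk_size=config["rag_chunk_size"],
    overlap=config["rag_overlap"],
)
```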

View File

@@ -14,6 +14,8 @@ from lollms.client_session import Client, Session
from lollms.databases.skills_database import SkillsLibrary
from lollms.tasks import TasksLibrary
from safe_store import TextVectorizer, VectorizationMethod, VisualizationMethod
from lollmsvectordb.database_elements.chunk import Chunk
from typing import Callable
from pathlib import Path
from datetime import datetime
@@ -905,7 +907,7 @@ class LollmsApplication(LoLLMsCom):
if len(conditionning)>0:
conditionning = self.start_header_id_template + system_message_template + self.end_header_id_template + self.personality.replace_keys(conditionning, self.personality.conditionning_commands) + ("" if conditionning[-1]==self.separator_template else self.separator_template)
# Check if there are document files to add to the prompt
internet_search_results = ""
internet_search_infos = []
@@ -1093,25 +1095,12 @@ class LollmsApplication(LoLLMsCom):
query = current_message.content
try:
if self.config.data_vectorization_force_first_chunk and len(client.discussion.vectorizer.chunks)>0:
doc_index = list(client.discussion.vectorizer.chunks.keys())[0]
doc_id = client.discussion.vectorizer.chunks[doc_index]['document_id']
content = client.discussion.vectorizer.chunks[doc_index]['chunk_text']
chunks:List[Chunk] = client.discussion.vectorizer.search(query, int(self.config.rag_n_chunks))
for chunk in chunks:
if self.config.data_vectorization_put_chunk_informations_into_context:
documentation += f"{self.start_header_id_template}document chunk{self.end_header_id_template}\nchunk_infos:{doc_id}\ncontent:{content}\n"
documentation += f"{self.start_header_id_template}document chunk{self.end_header_id_template}\ndocument title: {chunk.doc.title}\nchunk content:\n{chunk.text}\n"
else:
documentation += f"{self.start_header_id_template}chunk{self.end_header_id_template}\n{content}\n"
docs, sorted_similarities, document_ids = client.discussion.vectorizer.recover_text(query, top_k=int(self.config.data_vectorization_nb_chunks))
for doc, infos in zip(docs, sorted_similarities):
if self.config.data_vectorization_force_first_chunk and len(client.discussion.vectorizer.chunks)>0 and infos[0]==doc_id:
continue
if self.config.data_vectorization_put_chunk_informations_into_context:
documentation += f"{self.start_header_id_template}document chunk{self.end_header_id_template}\nchunk path: {infos[0]}\nchunk content:\n{doc}\n"
else:
documentation += f"{self.start_header_id_template}chunk{self.end_header_id_template}\n{doc}\n"
documentation += f"{self.start_header_id_template}chunk{self.end_header_id_template}\n{chunk.text}\n"
documentation += f"{self.separator_template}{self.start_header_id_template}important information: Use the documentation data to answer the user questions. If the data is not present in the documentation, please tell the user that the information he is asking for does not exist in the documentation section. It is strictly forbidden to give the user an answer without having actual proof from the documentation.\n"
except Exception as ex:
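
In place of the old recover_text() loop, the retrieval path now boils down to a single search() call that returns Chunk objects carrying both their text and their parent document. A sketch of the new flow shown above, with the configurable start/end header templates replaced by a plain placeholder:

```python
# Sketch of the new retrieval loop: VectorDatabase.search() returns Chunk
# objects; each chunk exposes its text and the document it came from.
from typing import List
from lollmsvectordb.database_elements.chunk import Chunk

def build_documentation(vectorizer, query: str, n_chunks: int, with_infos: bool) -> str:
    documentation = ""
    chunks: List[Chunk] = vectorizer.search(query, int(n_chunks))
    for chunk in chunks:
        if with_infos:
            # Keep the document title alongside the chunk content.
            documentation += f"[document chunk]\ndocument title: {chunk.doc.title}\nchunk content:\n{chunk.text}\n"
        else:
            documentation += f"[chunk]\n{chunk.text}\n"
    return documentation
```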

View File

@@ -48,7 +48,8 @@ class LoLLMsCom:
self.rt_com = None
self.model = None
def InfoMessage(self, content, client_id=None, verbose:bool=None):
self.notify(
content,

View File

@@ -1,5 +1,5 @@
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
version: 122
version: 123
binding_name: null
model_name: null
model_variant: null
@@ -247,6 +247,8 @@ rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_overlap: 0 # number of tokens of overlap
rag_n_chunks: 4 #Number of chunks to recover from the database
rag_clean_chunks: true #Removes all unnecessary spaces and line returns
rag_follow_subfolders: true #if true the vectorizer will vectorize the content of subfolders too

View File

@@ -10,6 +10,11 @@ from lollms.paths import LollmsPaths
from lollms.databases.skills_database import SkillsLibrary
from lollms.com import LoLLMsCom
from safe_store import TextVectorizer, VisualizationMethod, GenericDataLoader
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
from lollmsvectordb.text_document_loader import TextDocumentsLoader
import gc
import json
import shutil
from lollms.tasks import TasksLibrary
@@ -651,26 +656,28 @@ class Discussion:
# Initialize the file lists
self.update_file_lists()
self.vectorizer = TextVectorizer(
self.lollms.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer"
model=self.lollms.model, #needed in case of using model_embedding
database_path=self.discussion_rag_folder/"db.json",
save_db=self.lollms.config.data_vectorization_save_db,
data_visualization_method=VisualizationMethod.PCA,
database_dict=None)
if len(self.vectorizer.chunks)==0 and len(self.text_files)>0:
for path in self.text_files:
data = GenericDataLoader.read_file(path)
if len(self.text_files)>0:
self.vectorizer = VectorDatabase(
self.discussion_rag_folder/"db.sqli",
BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
self.lollms.model,
chunk_size=self.lollms.config.rag_chunk_size,
overlap=self.lollms.config.rag_overlap
)
if len(self.vectorizer.list_documents())==0 and len(self.text_files)>0:
for path in self.text_files:
data = GenericDataLoader.read_file(path)
try:
self.vectorizer.add_document(path.stem, data, path, True)
except Exception as ex:
trace_exception(ex)
try:
self.vectorizer.add_document(path, data, self.lollms.config.data_vectorization_chunk_size, self.lollms.config.data_vectorization_overlap_size, add_first_line_to_all_chunks=True if path.suffix==".csv" else False)
self.vectorizer.index()
except Exception as ex:
trace_exception(ex)
try:
self.vectorizer.index()
except Exception as ex:
trace_exception(ex)
trace_exception(ex)
else:
self.vectorizer = None
def update_file_lists(self):
self.text_files = [Path(file) for file in self.discussion_text_folder.glob('*')]
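
Because the hunk above interleaves the removed TextVectorizer code with its replacement, here is the new initialization path in isolation. It is a sketch: the arguments mirror the Discussion attributes used above, the hunk itself still reads files with GenericDataLoader.read_file while later hunks switch to TextDocumentsLoader.read_file, and index() is the call used in this hunk (add_file below uses build_index()).

```python
# Sketch of the new per-discussion index setup (new code only).
from pathlib import Path
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
from lollmsvectordb.text_document_loader import TextDocumentsLoader

def init_discussion_vectorizer(rag_folder: Path, text_files: list, config, model):
    if len(text_files) == 0:
        return None  # no documents attached to the discussion, no index
    vectorizer = VectorDatabase(
        rag_folder / "db.sqli",
        BERTVectorizer(config.rag_vectorizer_model) if config.rag_vectorizer == "bert" else TFIDFVectorizer(),
        model,
        chunk_size=config.rag_chunk_size,
        overlap=config.rag_overlap,
    )
    # Only (re)index when the SQLite store is still empty.
    if len(vectorizer.list_documents()) == 0:
        for path in text_files:
            data = TextDocumentsLoader.read_file(path)
            # title, content, path, and the flag as passed in this commit
            vectorizer.add_document(path.stem, data, path, True)
        vectorizer.index()
    return vectorizer
```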
@@ -685,6 +692,13 @@ class Discussion:
if any(file_name == entry.name for entry in self.text_files):
fn = [entry for entry in self.text_files if entry.name == file_name][0]
self.text_files = [entry for entry in self.text_files if entry.name != file_name]
try:
text = TextDocumentsLoader.read_file(fn)
hash = self.vectorizer._hash_document(text)
self.vectorizer.remove_document(hash)
except Exception as ex:
trace_exception(ex)
Path(fn).unlink()
if len(self.text_files)>0:
try:
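
Documents are now identified in the database by a hash of their content rather than by path, so removing a file means re-reading it, hashing it, and dropping that hash before unlinking the file. A compact sketch of the same steps (trace_exception is the error-reporting helper lollms imports from ascii_colors):

```python
# Sketch: remove one attached file from both the index and the disk.
from pathlib import Path
from ascii_colors import trace_exception  # error-reporting helper used across lollms
from lollmsvectordb.text_document_loader import TextDocumentsLoader

def remove_file(vectorizer, fn: Path) -> None:
    try:
        text = TextDocumentsLoader.read_file(fn)
        # Documents are keyed by a content hash in the new database.
        doc_hash = vectorizer._hash_document(text)
        vectorizer.remove_document(doc_hash)
    except Exception as ex:
        trace_exception(ex)  # still delete the file even if it was never indexed
    fn.unlink()
```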
@@ -713,14 +727,28 @@ class Discussion:
def remove_all_files(self):
# Iterate over each directory and remove all files
for path in [self.discussion_images_folder, self.discussion_rag_folder, self.discussion_audio_folder, self.discussion_text_folder]:
for file in path.glob('*'):
if file.is_file(): # Ensure it's a file, not a directory
if file.is_file() and file.suffix!=".sqli": # Ensure it's a file (and not the RAG database), not a directory
try:
text = TextDocumentsLoader.read_file(file)
hash = self.vectorizer._hash_document(text)
self.vectorizer.remove_document(hash)
except Exception as ex:
trace_exception(ex)
file.unlink() # Delete the file
# Clear the lists to reflect the current state (empty directories)
self.text_files.clear()
self.image_files.clear()
self.audio_files.clear()
self.vectorizer = None
gc.collect()
fn = self.discussion_rag_folder/"db.sqli"
try:
fn.unlink()
except Exception as ex:
trace_exception(ex)
def add_file(self, path, client, tasks_library:TasksLibrary, callback=None, process=True):
output = ""
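
The tail of remove_all_files also has to let go of the SQLite file itself: db.sqli is skipped during the per-file loop, the VectorDatabase reference is dropped so its handle is released, and only then is the file unlinked. A short sketch of that teardown, where `discussion` stands in for the Discussion instance and its attributes used above:

```python
# Sketch: release the VectorDatabase before deleting its backing db.sqli file.
import gc
from ascii_colors import trace_exception  # error-reporting helper used across lollms

def drop_rag_database(discussion) -> None:
    # Drop the last reference so the SQLite handle is released before unlinking.
    discussion.vectorizer = None
    gc.collect()
    db_file = discussion.discussion_rag_folder / "db.sqli"
    try:
        db_file.unlink()
    except Exception as ex:
        trace_exception(ex)  # e.g. the database file was never created
```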
@@ -787,16 +815,14 @@ class Discussion:
self.lollms.ShowBlockingMessage("Processing file\nPlease wait ...")
if process:
if self.vectorizer is None:
self.vectorizer = TextVectorizer(
self.lollms.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer"
model=self.lollms.model, #needed in case of using model_embedding
database_path=self.discussion_rag_folder/"db.json",
save_db=self.lollms.config.data_vectorization_save_db,
data_visualization_method=VisualizationMethod.PCA,
database_dict=None)
data = GenericDataLoader.read_file(path)
self.vectorizer.add_document(path, data, self.lollms.config.data_vectorization_chunk_size, self.lollms.config.data_vectorization_overlap_size, add_first_line_to_all_chunks=True if path.suffix==".csv" else False)
self.vectorizer.index()
self.vectorizer = VectorDatabase(
self.discussion_rag_folder/"db.sqli",
BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
self.lollms.model,
)
data = TextDocumentsLoader.read_file(path)
self.vectorizer.add_document(path.stem, data, path, True)
self.vectorizer.build_index()
if callback is not None:
callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
self.lollms.HideBlockingMessage(client.client_id)

View File

@@ -16,6 +16,10 @@ from lollms.binding import LLMBinding, BindingType
from lollms.utilities import PromptReshaper, PackageManager, discussion_path_to_url, process_ai_output, remove_text_from_string
from lollms.com import NotificationType, NotificationDisplayType
from lollms.client_session import Session, Client
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
from lollmsvectordb.text_document_loader import TextDocumentsLoader
import pkg_resources
from pathlib import Path
@@ -1081,16 +1085,16 @@ class AIPersonality:
self.ShowBlockingMessage("Processing file\nPlease wait ...")
if process:
if self.vectorizer is None:
self.vectorizer = TextVectorizer(
self.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer"
model=self.model, #needed in case of using model_embedding
database_path=client.discussion.discussion_rag_folder/"db.json",
save_db=self.config.data_vectorization_save_db,
data_visualization_method=VisualizationMethod.PCA,
database_dict=None)
data = GenericDataLoader.read_file(path)
self.vectorizer.add_document(path, data, self.config.data_vectorization_chunk_size, self.config.data_vectorization_overlap_size, add_first_line_to_all_chunks=True if path.suffix==".csv" else False)
self.vectorizer.index()
self.vectorizer = VectorDatabase(
client.discussion.discussion_rag_folder/"db.sqli",
BERTVectorizer(self.config.rag_vectorizer_model) if self.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
self.model,
chunk_size=self.config.rag_chunk_size,
overlap=self.config.rag_overlap
)
data = TextDocumentsLoader.read_file(path)
self.vectorizer.add_document(path.stem, data, path, True)
self.vectorizer.build_index()
if callback is not None:
callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
self.HideBlockingMessage(client.client_id)
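
Read end to end, the personality-level flow is now: lazily create the VectorDatabase next to the discussion's RAG folder, load the file with TextDocumentsLoader, register it, build the index, and answer queries through search(). A condensed sketch with the client, callback, and message handling omitted:

```python
# Condensed end-to-end sketch of the new add-then-query flow.
from pathlib import Path
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
from lollmsvectordb.text_document_loader import TextDocumentsLoader

def add_and_query(rag_folder: Path, path: Path, config, model, query: str):
    vectorizer = VectorDatabase(
        rag_folder / "db.sqli",
        BERTVectorizer(config.rag_vectorizer_model) if config.rag_vectorizer == "bert" else TFIDFVectorizer(),
        model,
        chunk_size=config.rag_chunk_size,
        overlap=config.rag_overlap,
    )
    data = TextDocumentsLoader.read_file(path)
    vectorizer.add_document(path.stem, data, path, True)
    vectorizer.build_index()
    # Retrieval then goes through search(), as wired into the prompt builder above.
    return vectorizer.search(query, int(config.rag_n_chunks))
```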