mirror of
https://github.com/ParisNeo/lollms.git
synced 2024-12-18 20:27:58 +00:00
moved to new rag system
This commit is contained in:
parent
9cbcfbb282
commit
d670770464
@ -1,5 +1,5 @@
|
||||
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
|
||||
version: 122
|
||||
version: 123
|
||||
binding_name: null
|
||||
model_name: null
|
||||
model_variant: null
|
||||
@ -247,6 +247,8 @@ rag_vectorizer: bert # possible values bert, tfidf, word2vec
|
||||
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer_parameters: null # Parameters of the model in json format
|
||||
rag_chunk_size: 512 # number of tokens per chunk
|
||||
rag_overlap: 0 # number of tokens of overlap
|
||||
|
||||
rag_n_chunks: 4 #Number of chunks to recover from the database
|
||||
rag_clean_chunks: true # Remove all unnecessary spaces and line breaks
|
||||
rag_follow_subfolders: true #if true the vectorizer will vectorize the content of subfolders too
|
||||
|
@ -14,6 +14,8 @@ from lollms.client_session import Client, Session
|
||||
from lollms.databases.skills_database import SkillsLibrary
|
||||
from lollms.tasks import TasksLibrary
|
||||
from safe_store import TextVectorizer, VectorizationMethod, VisualizationMethod
|
||||
|
||||
from lollmsvectordb.database_elements.chunk import Chunk
|
||||
from typing import Callable
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
@ -905,7 +907,7 @@ class LollmsApplication(LoLLMsCom):
|
||||
|
||||
if len(conditionning)>0:
|
||||
conditionning = self.start_header_id_template + system_message_template + self.end_header_id_template + self.personality.replace_keys(conditionning, self.personality.conditionning_commands) + ("" if conditionning[-1]==self.separator_template else self.separator_template)
|
||||
|
||||
|
||||
# Check if there are document files to add to the prompt
|
||||
internet_search_results = ""
|
||||
internet_search_infos = []
|
||||
@ -1093,25 +1095,12 @@ class LollmsApplication(LoLLMsCom):
|
||||
query = current_message.content
|
||||
|
||||
try:
|
||||
if self.config.data_vectorization_force_first_chunk and len(client.discussion.vectorizer.chunks)>0:
|
||||
doc_index = list(client.discussion.vectorizer.chunks.keys())[0]
|
||||
|
||||
doc_id = client.discussion.vectorizer.chunks[doc_index]['document_id']
|
||||
content = client.discussion.vectorizer.chunks[doc_index]['chunk_text']
|
||||
|
||||
chunks:List[Chunk] = client.discussion.vectorizer.search(query, int(self.config.rag_n_chunks))
|
||||
for chunk in chunks:
|
||||
if self.config.data_vectorization_put_chunk_informations_into_context:
|
||||
documentation += f"{self.start_header_id_template}document chunk{self.end_header_id_template}\nchunk_infos:{doc_id}\ncontent:{content}\n"
|
||||
documentation += f"{self.start_header_id_template}document chunk{self.end_header_id_template}\ndocument title: {chunk.doc.title}\nchunk content:\n{chunk.text}\n"
|
||||
else:
|
||||
documentation += f"{self.start_header_id_template}chunk{self.end_header_id_template}\n{content}\n"
|
||||
|
||||
docs, sorted_similarities, document_ids = client.discussion.vectorizer.recover_text(query, top_k=int(self.config.data_vectorization_nb_chunks))
|
||||
for doc, infos in zip(docs, sorted_similarities):
|
||||
if self.config.data_vectorization_force_first_chunk and len(client.discussion.vectorizer.chunks)>0 and infos[0]==doc_id:
|
||||
continue
|
||||
if self.config.data_vectorization_put_chunk_informations_into_context:
|
||||
documentation += f"{self.start_header_id_template}document chunk{self.end_header_id_template}\nchunk path: {infos[0]}\nchunk content:\n{doc}\n"
|
||||
else:
|
||||
documentation += f"{self.start_header_id_template}chunk{self.end_header_id_template}\n{doc}\n"
|
||||
documentation += f"{self.start_header_id_template}chunk{self.end_header_id_template}\n{chunk.text}\n"
|
||||
|
||||
documentation += f"{self.separator_template}{self.start_header_id_template}important information: Use the documentation data to answer the user questions. If the data is not present in the documentation, please tell the user that the information he is asking for does not exist in the documentation section. It is strictly forbidden to give the user an answer without having actual proof from the documentation.\n"
|
||||
except Exception as ex:
|
||||
|
@ -48,7 +48,8 @@ class LoLLMsCom:
|
||||
|
||||
self.rt_com = None
|
||||
|
||||
|
||||
self.model = None
|
||||
|
||||
def InfoMessage(self, content, client_id=None, verbose:bool=None):
|
||||
self.notify(
|
||||
content,
|
||||
|
@ -1,5 +1,5 @@
|
||||
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
|
||||
version: 122
|
||||
version: 123
|
||||
binding_name: null
|
||||
model_name: null
|
||||
model_variant: null
|
||||
@ -247,6 +247,8 @@ rag_vectorizer: bert # possible values bert, tfidf, word2vec
|
||||
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer_parameters: null # Parameters of the model in json format
|
||||
rag_chunk_size: 512 # number of tokens per chunk
|
||||
rag_overlap: 0 # number of tokens of overlap
|
||||
|
||||
rag_n_chunks: 4 #Number of chunks to recover from the database
|
||||
rag_clean_chunks: true # Remove all unnecessary spaces and line breaks
|
||||
rag_follow_subfolders: true #if true the vectorizer will vectorize the content of subfolders too
|
||||
|
@ -10,6 +10,11 @@ from lollms.paths import LollmsPaths
|
||||
from lollms.databases.skills_database import SkillsLibrary
|
||||
from lollms.com import LoLLMsCom
|
||||
from safe_store import TextVectorizer, VisualizationMethod, GenericDataLoader
|
||||
from lollmsvectordb.vector_database import VectorDatabase
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
from lollmsvectordb.text_document_loader import TextDocumentsLoader
|
||||
import gc
|
||||
import json
|
||||
import shutil
|
||||
from lollms.tasks import TasksLibrary
|
||||
@ -651,26 +656,28 @@ class Discussion:
|
||||
# Initialize the file lists
|
||||
self.update_file_lists()
|
||||
|
||||
|
||||
self.vectorizer = TextVectorizer(
|
||||
self.lollms.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer"
|
||||
model=self.lollms.model, #needed in case of using model_embedding
|
||||
database_path=self.discussion_rag_folder/"db.json",
|
||||
save_db=self.lollms.config.data_vectorization_save_db,
|
||||
data_visualization_method=VisualizationMethod.PCA,
|
||||
database_dict=None)
|
||||
|
||||
if len(self.vectorizer.chunks)==0 and len(self.text_files)>0:
|
||||
for path in self.text_files:
|
||||
data = GenericDataLoader.read_file(path)
|
||||
if len(self.text_files)>0:
|
||||
self.vectorizer = VectorDatabase(
|
||||
self.discussion_rag_folder/"db.sqli",
|
||||
BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
|
||||
self.lollms.model,
|
||||
chunk_size=self.lollms.config.rag_chunk_size,
|
||||
overlap=self.lollms.config.rag_overlap
|
||||
)
|
||||
|
||||
if len(self.vectorizer.list_documents())==0 and len(self.text_files)>0:
|
||||
for path in self.text_files:
|
||||
data = GenericDataLoader.read_file(path)
|
||||
try:
|
||||
self.vectorizer.add_document(path.stem, data, path, True)
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
try:
|
||||
self.vectorizer.add_document(path, data, self.lollms.config.data_vectorization_chunk_size, self.lollms.config.data_vectorization_overlap_size, add_first_line_to_all_chunks=True if path.suffix==".csv" else False)
|
||||
self.vectorizer.index()
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
try:
|
||||
self.vectorizer.index()
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
trace_exception(ex)
|
||||
else:
|
||||
self.vectorizer = None
|
||||
|
||||
def update_file_lists(self):
|
||||
self.text_files = [Path(file) for file in self.discussion_text_folder.glob('*')]
|
||||
@ -685,6 +692,13 @@ class Discussion:
|
||||
if any(file_name == entry.name for entry in self.text_files):
|
||||
fn = [entry for entry in self.text_files if entry.name == file_name][0]
|
||||
self.text_files = [entry for entry in self.text_files if entry.name != file_name]
|
||||
try:
|
||||
text = TextDocumentsLoader.read_file(fn)
|
||||
hash = self.vectorizer._hash_document(text)
|
||||
self.vectorizer.remove_document(hash)
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
|
||||
Path(fn).unlink()
|
||||
if len(self.text_files)>0:
|
||||
try:
|
||||
@ -713,14 +727,28 @@ class Discussion:
|
||||
def remove_all_files(self):
|
||||
# Iterate over each directory and remove all files
|
||||
for path in [self.discussion_images_folder, self.discussion_rag_folder, self.discussion_audio_folder, self.discussion_text_folder]:
|
||||
|
||||
for file in path.glob('*'):
|
||||
if file.is_file(): # Ensure it's a file, not a directory
|
||||
if file.is_file() and file.suffix!=".sqli": # Ensure it's a file, not a directory
|
||||
try:
|
||||
text = TextDocumentsLoader.read_file(file)
|
||||
hash = self.vectorizer._hash_document(text)
|
||||
self.vectorizer.remove_document(hash)
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
file.unlink() # Delete the file
|
||||
|
||||
# Clear the lists to reflect the current state (empty directories)
|
||||
self.text_files.clear()
|
||||
self.image_files.clear()
|
||||
self.audio_files.clear()
|
||||
self.vectorizer = None
|
||||
gc.collect()
|
||||
fn = self.discussion_rag_folder/"db.sqli"
|
||||
try:
|
||||
fn.unlink()
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
|
||||
def add_file(self, path, client, tasks_library:TasksLibrary, callback=None, process=True):
|
||||
output = ""
|
||||
@ -787,16 +815,14 @@ class Discussion:
|
||||
self.lollms.ShowBlockingMessage("Processing file\nPlease wait ...")
|
||||
if process:
|
||||
if self.vectorizer is None:
|
||||
self.vectorizer = TextVectorizer(
|
||||
self.lollms.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer"
|
||||
model=self.lollms.model, #needed in case of using model_embedding
|
||||
database_path=self.discussion_rag_folder/"db.json",
|
||||
save_db=self.lollms.config.data_vectorization_save_db,
|
||||
data_visualization_method=VisualizationMethod.PCA,
|
||||
database_dict=None)
|
||||
data = GenericDataLoader.read_file(path)
|
||||
self.vectorizer.add_document(path, data, self.lollms.config.data_vectorization_chunk_size, self.lollms.config.data_vectorization_overlap_size, add_first_line_to_all_chunks=True if path.suffix==".csv" else False)
|
||||
self.vectorizer.index()
|
||||
self.vectorizer = VectorDatabase(
|
||||
self.discussion_rag_folder/"db.sqli",
|
||||
BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
|
||||
self.lollms.model,
|
||||
)
|
||||
data = TextDocumentsLoader.read_file(path)
|
||||
self.vectorizer.add_document(path.stem, data, path, True)
|
||||
self.vectorizer.build_index()
|
||||
if callback is not None:
|
||||
callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
|
||||
self.lollms.HideBlockingMessage(client.client_id)
|
||||
|
@ -16,6 +16,10 @@ from lollms.binding import LLMBinding, BindingType
|
||||
from lollms.utilities import PromptReshaper, PackageManager, discussion_path_to_url, process_ai_output, remove_text_from_string
|
||||
from lollms.com import NotificationType, NotificationDisplayType
|
||||
from lollms.client_session import Session, Client
|
||||
from lollmsvectordb.vector_database import VectorDatabase
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
from lollmsvectordb.text_document_loader import TextDocumentsLoader
|
||||
|
||||
import pkg_resources
|
||||
from pathlib import Path
|
||||
@ -1081,16 +1085,16 @@ class AIPersonality:
|
||||
self.ShowBlockingMessage("Processing file\nPlease wait ...")
|
||||
if process:
|
||||
if self.vectorizer is None:
|
||||
self.vectorizer = TextVectorizer(
|
||||
self.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer"
|
||||
model=self.model, #needed in case of using model_embedding
|
||||
database_path=client.discussion.discussion_rag_folder/"db.json",
|
||||
save_db=self.config.data_vectorization_save_db,
|
||||
data_visualization_method=VisualizationMethod.PCA,
|
||||
database_dict=None)
|
||||
data = GenericDataLoader.read_file(path)
|
||||
self.vectorizer.add_document(path, data, self.config.data_vectorization_chunk_size, self.config.data_vectorization_overlap_size, add_first_line_to_all_chunks=True if path.suffix==".csv" else False)
|
||||
self.vectorizer.index()
|
||||
self.vectorizer = VectorDatabase(
|
||||
client.discussion.discussion_rag_folder/"db.sqli",
|
||||
BERTVectorizer(self.config.rag_vectorizer_model) if self.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
|
||||
self.model,
|
||||
chunk_size=self.config.rag_chunk_size,
|
||||
overlap=self.config.rag_overlap
|
||||
)
|
||||
data = TextDocumentsLoader.read_file(path)
|
||||
self.vectorizer.add_document(path.stem, data, path, True)
|
||||
self.vectorizer.build_index()
|
||||
if callback is not None:
|
||||
callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
|
||||
self.HideBlockingMessage(client.client_id)
|
||||
|
Loading…
Reference in New Issue
Block a user