Moved to the new lollms vector database

This commit is contained in:
Saifeddine ALOUI 2024-09-01 02:06:47 +02:00
parent 767644c305
commit 9aaa79e4a1
17 changed files with 133 additions and 92 deletions

View File

@ -1,5 +1,5 @@
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
version: 135
version: 136
binding_name: null
model_name: null
model_variant: null
@ -269,8 +269,8 @@ audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer: semantic # possible values semantic, tfidf, openai
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_overlap: 0 # number of tokens of overlap
@ -285,6 +285,7 @@ rag_min_nb_tokens_in_chunk: 10 #this removes any useless junk with less than x to
rag_max_n_hops: 3 #We set the maximum number of hop in multi hops rag
rag_deactivate: false # if you have a large context model, you can activate this to use your document as a whole
rag_vectorizer_openai_key: "" # The open ai key (if not provided, this will use the environment variable OPENAI_API_KEY)
contextual_summary: false #If activated this will completely replace the rag and instead will use contextual summary

View File

@ -248,8 +248,8 @@ audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer: semantic # possible values semantic, tfidf, openai
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_overlap: 0 # number of tokens of overlap

View File

@ -248,8 +248,8 @@ audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer: semantic # possible values semantic, tfidf, openai
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_overlap: 0 # number of tokens of overlap

View File

@ -248,8 +248,8 @@ audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer: semantic # possible values semantic, tfidf, openai
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_overlap: 0 # number of tokens of overlap

View File

@ -68,7 +68,7 @@ class LollmsApplication(LoLLMsCom):
self.tts = None
self.session = Session(lollms_paths)
self.skills_library = SkillsLibrary(self.lollms_paths.personal_skills_path/(self.config.skills_lib_database_name+".db"))
self.skills_library = SkillsLibrary(self.lollms_paths.personal_skills_path/(self.config.skills_lib_database_name+".sqlite"))
self.tasks_library = TasksLibrary(self)
self.handle_generate_msg: Callable[[str, Dict], None] = None
@ -314,19 +314,17 @@ class LollmsApplication(LoLLMsCom):
from lollmsvectordb import VectorDatabase
from lollmsvectordb.text_document_loader import TextDocumentsLoader
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
if self.config.rag_vectorizer == "bert":
self.backup_trust_store()
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
v = BERTVectorizer()
self.restore_trust_store()
elif self.config.rag_vectorizer == "tfidf":
if self.lollms.config.rag_vectorizer=="semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
vectorizer = SemanticVectorizer(self.lollms.config.rag_vectorizer_model)
elif self.lollms.config.rag_vectorizer=="tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif self.config.rag_vectorizer == "word2vec":
from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
v = Word2VecVectorizer()
vectorizer = TFIDFVectorizer()
elif self.lollms.config.rag_vectorizer=="openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
vectorizer = OpenAIVectorizer(self.lollms.config.rag_vectorizer_model, self.lollms.config.rag_vectorizer_openai_key)
vdb = VectorDatabase(Path(parts[1])/f"{db_name}.sqlite", v, self.model if self.model else TikTokenTokenizer(), n_neighbors=self.config.rag_n_chunks)
vdb = VectorDatabase(Path(parts[1])/f"{db_name}.sqlite", vectorizer, None if self.lollms.config.rag_vectorizer=="semantic" else self.model if self.model else TikTokenTokenizer(), n_neighbors=self.config.rag_n_chunks)
self.active_rag_dbs.append({"name":parts[0],"path":parts[1],"vectorizer":vdb})
except:
ASCIIColors.error(f"Couldn't load "+str(Path(parts[1])/f"{db_name}.sqlite")+" consider revectorizing it")
@ -355,7 +353,7 @@ class LollmsApplication(LoLLMsCom):
except Exception as ex:
trace_exception(ex)
self.warning(f"Couldn't load vllm")
ASCIIColors.execute_with_animation("Loading local TTT services", start_ttt,ASCIIColors.color_blue)
ASCIIColors.execute_with_animation("Loading TTT services", start_ttt,ASCIIColors.color_blue)
print("OK")
def start_stt(*args, **kwargs):
if self.config.whisper_activate or self.config.active_stt_service == "whisper":
@ -372,7 +370,7 @@ class LollmsApplication(LoLLMsCom):
from lollms.services.stt.whisper.lollms_whisper import LollmsWhisper
self.stt = LollmsWhisper(self, self.config.whisper_model)
ASCIIColors.execute_with_animation("Loading loacal STT services", start_stt, ASCIIColors.color_blue)
ASCIIColors.execute_with_animation("Loading STT services", start_stt, ASCIIColors.color_blue)
print("OK")
def start_tts(*args, **kwargs):
@ -403,7 +401,7 @@ class LollmsApplication(LoLLMsCom):
elif self.config.active_tts_service == "xtts" and self.xtts:
self.tts = self.xtts
ASCIIColors.execute_with_animation("Loading loacal TTS services", start_tts, ASCIIColors.color_blue)
ASCIIColors.execute_with_animation("Loading TTS services", start_tts, ASCIIColors.color_blue)
print("OK")
def start_tti(*args, **kwargs):

View File

@ -1,5 +1,5 @@
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
version: 135
version: 136
binding_name: null
model_name: null
model_variant: null
@ -269,8 +269,8 @@ audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer: semantic # possible values semantic, tfidf, openai
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_overlap: 0 # number of tokens of overlap
@ -285,6 +285,7 @@ rag_min_nb_tokens_in_chunk: 10 #this removes any useless junk with less than x to
rag_max_n_hops: 3 #We set the maximum number of hop in multi hops rag
rag_deactivate: false # if you have a large context model, you can activate this to use your document as a whole
rag_vectorizer_openai_key: "" # The open ai key (if not provided, this will use the environment variable OPENAI_API_KEY)
contextual_summary: false #If activated this will completely replace the rag and instead will use contextual summary

View File

@ -10,8 +10,6 @@ from lollms.paths import LollmsPaths
from lollms.com import LoLLMsCom
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
from lollmsvectordb.text_document_loader import TextDocumentsLoader
import gc
import json
@ -776,9 +774,19 @@ class Discussion:
self.update_file_lists()
if len(self.text_files)>0:
if self.lollms.config.rag_vectorizer=="semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
vectorizer = SemanticVectorizer(self.lollms.config.rag_vectorizer_model)
elif self.lollms.config.rag_vectorizer=="tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
vectorizer = TFIDFVectorizer()
elif self.lollms.config.rag_vectorizer=="openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
vectorizer = OpenAIVectorizer(self.lollms.config.rag_vectorizer_model, self.lollms.config.rag_vectorizer_openai_key)
self.vectorizer = VectorDatabase(
self.discussion_rag_folder/"db.sqli",
BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
vectorizer,
self.lollms.model,
chunk_size=self.lollms.config.rag_chunk_size,
overlap=self.lollms.config.rag_overlap
@ -795,17 +803,17 @@ class Discussion:
except Exception as ex:
trace_exception(ex)
try:
self.vectorizer.index()
self.vectorizer.build_index()
except Exception as ex:
trace_exception(ex)
else:
self.vectorizer = None
def update_file_lists(self):
self.text_files = [Path(file) for file in self.discussion_text_folder.glob('*')]
self.image_files = [Path(file) for file in self.discussion_images_folder.glob('*')]
self.audio_files = [Path(file) for file in self.discussion_audio_folder.glob('*')]
self.rag_db = [Path(file) for file in self.discussion_rag_folder.glob('*')]
self.text_files = [Path(file) for file in self.discussion_text_folder.glob('*') if not file.is_dir()]
self.image_files = [Path(file) for file in self.discussion_images_folder.glob('*') if not file.is_dir()]
self.audio_files = [Path(file) for file in self.discussion_audio_folder.glob('*') if not file.is_dir()]
self.rag_db = [Path(file) for file in self.discussion_rag_folder.glob('*') if not file.is_dir()]
def remove_file(self, file_name, callback=None):
@ -937,9 +945,18 @@ class Discussion:
self.lollms.ShowBlockingMessage("Processing file\nPlease wait ...")
if process:
if self.vectorizer is None:
if self.lollms.config.rag_vectorizer == "semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
v = SemanticVectorizer(self.lollms.config.rag_vectorizer_model)
elif self.lollms.config.rag_vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif self.lollms.config.rag_vectorizer == "openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
v = OpenAIVectorizer(self.lollms.config.rag_vectorizer_openai_key)
self.vectorizer = VectorDatabase(
self.discussion_rag_folder/"db.sqli",
BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
v,
self.lollms.model,
)
data = TextDocumentsLoader.read_file(path)

View File

@ -1,14 +1,14 @@
import sqlite3
from lollmsvectordb import VectorDatabase, BERTVectorizer
from lollmsvectordb import VectorDatabase, SemanticVectorizer
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
import numpy as np
from ascii_colors import ASCIIColors
class SkillsLibrary:
def __init__(self, db_path, model_name: str = 'bert-base-nli-mean-tokens', chunk_size:int=512, overlap:int=0, n_neighbors:int=5):
def __init__(self, db_path, model_name: str = 'sentence-transformers/bert-base-nli-mean-tokens', chunk_size:int=512, overlap:int=0, n_neighbors:int=5):
self.db_path =db_path
self._initialize_db()
self.vectorizer = VectorDatabase("", BERTVectorizer(), TikTokenTokenizer(),chunk_size, overlap, n_neighbors)
self.vectorizer = VectorDatabase(db_path, SemanticVectorizer(model_name), TikTokenTokenizer(),chunk_size, overlap, n_neighbors)
ASCIIColors.green("Vecorizer ready")

View File

@ -337,9 +337,9 @@ def get_image_gen_prompt(agent_name, number_of_entries=5) -> Tuple[str, str]:
"""
try:
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
db = VectorDatabase("", BERTVectorizer(), TikTokenTokenizer(), number_of_entries)
db = VectorDatabase("", SemanticVectorizer(), TikTokenTokenizer(), number_of_entries)
image_gen_prompts = get_prompts_list()
for entry in image_gen_prompts:

View File

@ -134,9 +134,9 @@ def get_system_prompt(agent_name, number_of_entries=5) -> Tuple[str, str]:
"""
try:
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
db = VectorDatabase("", BERTVectorizer(), TikTokenTokenizer(), number_of_entries)
db = VectorDatabase("", SemanticVectorizer(), TikTokenTokenizer(), number_of_entries)
system_prompts = get_prompts()

View File

@ -317,7 +317,7 @@ def internet_search(query, internet_nb_search_pages, chromedriver_path=None, qui
return search_results
def internet_search_with_vectorization(query, chromedriver_path=None, internet_nb_search_pages=5, internet_vectorization_chunk_size=512, internet_vectorization_overlap_size=20, internet_vectorization_nb_chunks=4, model = None, quick_search:bool=False, vectorizer = "bert", vectorize=True, asses_using_llm=True, yes_no=None):
def internet_search_with_vectorization(query, chromedriver_path=None, internet_nb_search_pages=5, internet_vectorization_chunk_size=512, internet_vectorization_overlap_size=20, internet_vectorization_nb_chunks=4, model = None, quick_search:bool=False, vectorizer = "semantic", vectorize=True, asses_using_llm=True, yes_no=None):
"""
"""
@ -328,15 +328,15 @@ def internet_search_with_vectorization(query, chromedriver_path=None, internet_n
from lollmsvectordb.text_document_loader import TextDocumentsLoader
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
if vectorizer == "bert":
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
v = BERTVectorizer()
if vectorizer == "semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
v = SemanticVectorizer()
elif vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif vectorizer == "word2vec":
from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
v = Word2VecVectorizer()
elif vectorizer == "openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
v = OpenAIVectorizer()
vectorizer = VectorDatabase("", v, TikTokenTokenizer(), internet_vectorization_chunk_size, internet_vectorization_overlap_size)

View File

@ -17,7 +17,7 @@ from lollms.utilities import PromptReshaper, PackageManager, discussion_path_to_
from lollms.com import NotificationType, NotificationDisplayType
from lollms.client_session import Session, Client
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
from lollmsvectordb.text_document_loader import TextDocumentsLoader
from lollmsvectordb.database_elements.document import Document
@ -898,16 +898,16 @@ class AIPersonality:
if self.data_path.exists():
self.database_path = self.data_path / "db.sqlite"
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
vectorizer = self.config.rag_vectorizer
if vectorizer == "bert":
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
v = BERTVectorizer()
if vectorizer == "semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
v = SemanticVectorizer()
elif vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif vectorizer == "word2vec":
from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
v = Word2VecVectorizer()
elif vectorizer == "openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
v = OpenAIVectorizer()
self.persona_data_vectorizer = VectorDatabase(self.database_path, v, TikTokenTokenizer(), self.config.rag_chunk_size, self.config.rag_overlap)
@ -1063,9 +1063,19 @@ class AIPersonality:
self.ShowBlockingMessage("Processing file\nPlease wait ...")
if process:
if self.vectorizer is None:
if self.config.rag_vectorizer == "semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
v = SemanticVectorizer(self.config.rag_vectorizer_model)
elif self.config.rag_vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif self.config.rag_vectorizer == "openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
v = OpenAIVectorizer()
self.vectorizer = VectorDatabase(
client.discussion.discussion_rag_folder/"db.sqli",
BERTVectorizer(self.config.rag_vectorizer_model) if self.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
v,
self.model,
chunk_size=self.config.rag_chunk_size,
overlap=self.config.rag_overlap
@ -2900,15 +2910,15 @@ class APScript(StateMachine):
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
vectorizer = self.config.rag_vectorizer
if vectorizer == "bert":
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
v = BERTVectorizer()
if vectorizer == "semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
v = SemanticVectorizer(self.config.rag_vectorizer_model)
elif vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif vectorizer == "word2vec":
from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
v = Word2VecVectorizer()
elif vectorizer == "openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
v = OpenAIVectorizer()
vectorizer = VectorDatabase("", v, TikTokenTokenizer(), self.config.rag_chunk_size, self.config.rag_overlap)
vectorizer.add_document(title, text, url)

View File

@ -248,8 +248,8 @@ audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer: semantic # possible values semantic, tfidf, openai
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_overlap: 0 # number of tokens of overlap

View File

@ -128,21 +128,21 @@ def select_rag_database(client) -> Optional[Dict[str, Path]]:
if not PackageManager.check_package_installed_with_version("lollmsvectordb","0.6.0"):
PackageManager.install_or_update("lollmsvectordb")
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
from lollmsvectordb import VectorDatabase
from lollmsvectordb.text_document_loader import TextDocumentsLoader
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
if lollmsElfServer.config.rag_vectorizer == "bert":
lollmsElfServer.backup_trust_store()
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
v = BERTVectorizer()
lollmsElfServer.restore_trust_store()
if lollmsElfServer.config.rag_vectorizer == "semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
v = SemanticVectorizer(lollmsElfServer.config.rag_vectorizer_model)
elif lollmsElfServer.config.rag_vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif lollmsElfServer.config.rag_vectorizer == "openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
v = OpenAIVectorizer(lollmsElfServer.config.rag_vectorizer_openai_key)
vdb = VectorDatabase(Path(folder_path)/f"{db_name}.sqlite", v, lollmsElfServer.model if lollmsElfServer.model else TikTokenTokenizer())
# Get all files in the folder
@ -269,14 +269,16 @@ def toggle_mount_rag_database(database_infos: MountDatabase):
from lollmsvectordb.text_document_loader import TextDocumentsLoader
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
if lollmsElfServer.config.rag_vectorizer == "bert":
lollmsElfServer.backup_trust_store()
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
v = BERTVectorizer()
lollmsElfServer.restore_trust_store()
if lollmsElfServer.config.rag_vectorizer == "semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
v = SemanticVectorizer(lollmsElfServer.config.rag_vectorizer_model)
elif lollmsElfServer.config.rag_vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif lollmsElfServer.config.rag_vectorizer == "openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
v = OpenAIVectorizer(lollmsElfServer.config.rag_vectorizer_openai_key)
vdb = VectorDatabase(Path(path)/f"{database_infos.database_name}.sqlite", v, lollmsElfServer.model if lollmsElfServer.model else TikTokenTokenizer(), chunk_size=lollmsElfServer.config.rag_chunk_size, clean_chunks=lollmsElfServer.config.rag_clean_chunks, n_neighbors=lollmsElfServer.config.rag_n_chunks)
lollmsElfServer.active_rag_dbs.append({"name":database_infos.database_name,"path":path,"vectorizer":vdb})
@ -328,21 +330,22 @@ async def vectorize_folder(database_infos: FolderInfos):
if not PackageManager.check_package_installed_with_version("lollmsvectordb","0.6.0"):
PackageManager.install_or_update("lollmsvectordb")
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
from lollmsvectordb import VectorDatabase
from lollmsvectordb.text_document_loader import TextDocumentsLoader
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
if lollmsElfServer.config.rag_vectorizer == "bert":
lollmsElfServer.backup_trust_store()
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
v = BERTVectorizer()
lollmsElfServer.restore_trust_store()
if lollmsElfServer.config.rag_vectorizer == "semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
v = SemanticVectorizer(lollmsElfServer.config.rag_vectorizer_model)
elif lollmsElfServer.config.rag_vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif lollmsElfServer.config.rag_vectorizer == "openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
v = OpenAIVectorizer(lollmsElfServer.config.rag_vectorizer_openai_key)
vector_db_path = Path(folder_path)/f"{db_name}.sqlite"
vdb = VectorDatabase(vector_db_path, v, lollmsElfServer.model if lollmsElfServer.model else TikTokenTokenizer(), reset=True)

View File

@ -13,7 +13,7 @@ from typing import List, Optional, Union
from pathlib import Path
from lollmsvectordb.database_elements.chunk import Chunk
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
import sqlite3
import secrets
@ -67,9 +67,20 @@ def get_user_vectorizer(user_key: str):
small_key = hashlib.md5(user_key.encode()).hexdigest()[:8]
user_folder = lollmsElfServer.lollms_paths.personal_outputs_path / str(user_key)
user_folder.mkdir(parents=True, exist_ok=True)
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
if lollmsElfServer.config.rag_vectorizer == "semantic":
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
v = SemanticVectorizer(lollmsElfServer.config.rag_vectorizer_model)
elif lollmsElfServer.config.rag_vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif lollmsElfServer.config.rag_vectorizer == "openai":
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
v = OpenAIVectorizer(lollmsElfServer.config.rag_vectorizer_openai_key)
return VectorDatabase(
str(user_folder / f"rag_db_{small_key}.sqlite"),
BERTVectorizer(lollmsElfServer.config.rag_vectorizer_model) if lollmsElfServer.config.rag_vectorizer == "bert" else TFIDFVectorizer(),
v, TikTokenTokenizer(),
lollmsElfServer.model,
chunk_size=lollmsElfServer.config.rag_chunk_size,
overlap=lollmsElfServer.config.rag_overlap

View File

@ -32,7 +32,7 @@ import shutil
from tqdm import tqdm
import threading
from io import BytesIO
import os
class LollmsDalle(LollmsTTI):
@ -44,7 +44,7 @@ class LollmsDalle(LollmsTTI):
output_path=None
):
super().__init__(generation_engine,app)
self.key = key
self.key = key or os.getenv('OPENAI_API_KEY')
self.generation_engine = generation_engine
self.output_path = output_path

View File

@ -248,8 +248,8 @@ audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer: semantic # possible values semantic, tfidf, openai
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_overlap: 0 # number of tokens of overlap