mirror of
https://github.com/ParisNeo/lollms.git
synced 2025-04-15 14:36:34 +00:00
Moved to the new lollms vector database
This commit is contained in:
parent
767644c305
commit
9aaa79e4a1
@ -1,5 +1,5 @@
|
||||
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
|
||||
version: 135
|
||||
version: 136
|
||||
binding_name: null
|
||||
model_name: null
|
||||
model_variant: null
|
||||
@ -269,8 +269,8 @@ audio_silenceTimer: 5000
|
||||
|
||||
# Data vectorization
|
||||
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
|
||||
rag_vectorizer: bert # possible values bert, tfidf, word2vec
|
||||
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer: semantic # possible values semantic, tfidf, openai
|
||||
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer_parameters: null # Parameters of the model in json format
|
||||
rag_chunk_size: 512 # number of tokens per chunk
|
||||
rag_overlap: 0 # number of tokens of overlap
|
||||
@ -285,6 +285,7 @@ rag_min_nb_tokens_in_chunk: 10 #this removed any useless junk ith less than x to
|
||||
rag_max_n_hops: 3 # We set the maximum number of hops in multi-hop RAG
|
||||
|
||||
rag_deactivate: false # if you have a large context model, you can activate this to use your document as a whole
|
||||
rag_vectorizer_openai_key: "" # The OpenAI key (if not provided, this will use the environment variable OPENAI_API_KEY)
|
||||
|
||||
contextual_summary: false # If activated, this completely replaces the RAG and uses contextual summary instead
|
||||
|
||||
|
@ -248,8 +248,8 @@ audio_silenceTimer: 5000
|
||||
|
||||
# Data vectorization
|
||||
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
|
||||
rag_vectorizer: bert # possible values bert, tfidf, word2vec
|
||||
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer: semantic # possible values semantic, tfidf, openai
|
||||
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer_parameters: null # Parameters of the model in json format
|
||||
rag_chunk_size: 512 # number of tokens per chunk
|
||||
rag_overlap: 0 # number of tokens of overlap
|
||||
|
@ -248,8 +248,8 @@ audio_silenceTimer: 5000
|
||||
|
||||
# Data vectorization
|
||||
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
|
||||
rag_vectorizer: bert # possible values bert, tfidf, word2vec
|
||||
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer: semantic # possible values semantic, tfidf, openai
|
||||
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer_parameters: null # Parameters of the model in json format
|
||||
rag_chunk_size: 512 # number of tokens per chunk
|
||||
rag_overlap: 0 # number of tokens of overlap
|
||||
|
@ -248,8 +248,8 @@ audio_silenceTimer: 5000
|
||||
|
||||
# Data vectorization
|
||||
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
|
||||
rag_vectorizer: bert # possible values bert, tfidf, word2vec
|
||||
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer: semantic # possible values semantic, tfidf, openai
|
||||
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer_parameters: null # Parameters of the model in json format
|
||||
rag_chunk_size: 512 # number of tokens per chunk
|
||||
rag_overlap: 0 # number of tokens of overlap
|
||||
|
@ -68,7 +68,7 @@ class LollmsApplication(LoLLMsCom):
|
||||
|
||||
self.tts = None
|
||||
self.session = Session(lollms_paths)
|
||||
self.skills_library = SkillsLibrary(self.lollms_paths.personal_skills_path/(self.config.skills_lib_database_name+".db"))
|
||||
self.skills_library = SkillsLibrary(self.lollms_paths.personal_skills_path/(self.config.skills_lib_database_name+".sqlite"))
|
||||
self.tasks_library = TasksLibrary(self)
|
||||
|
||||
self.handle_generate_msg: Callable[[str, Dict], None] = None
|
||||
@ -314,19 +314,17 @@ class LollmsApplication(LoLLMsCom):
|
||||
from lollmsvectordb import VectorDatabase
|
||||
from lollmsvectordb.text_document_loader import TextDocumentsLoader
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
if self.config.rag_vectorizer == "bert":
|
||||
self.backup_trust_store()
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
v = BERTVectorizer()
|
||||
self.restore_trust_store()
|
||||
elif self.config.rag_vectorizer == "tfidf":
|
||||
if self.lollms.config.rag_vectorizer=="semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
vectorizer = SemanticVectorizer(self.lollms.config.rag_vectorizer_model)
|
||||
elif self.lollms.config.rag_vectorizer=="tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif self.config.rag_vectorizer == "word2vec":
|
||||
from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
|
||||
v = Word2VecVectorizer()
|
||||
vectorizer = TFIDFVectorizer()
|
||||
elif self.lollms.config.rag_vectorizer=="openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
vectorizer = OpenAIVectorizer(self.lollms.config.rag_vectorizer_model, self.lollms.config.rag_vectorizer_openai_key)
|
||||
|
||||
vdb = VectorDatabase(Path(parts[1])/f"{db_name}.sqlite", v, self.model if self.model else TikTokenTokenizer(), n_neighbors=self.config.rag_n_chunks)
|
||||
vdb = VectorDatabase(Path(parts[1])/f"{db_name}.sqlite", vectorizer, None if self.lollms.config.rag_vectorizer=="semantic" else self.model if self.model else TikTokenTokenizer(), n_neighbors=self.config.rag_n_chunks)
|
||||
self.active_rag_dbs.append({"name":parts[0],"path":parts[1],"vectorizer":vdb})
|
||||
except:
|
||||
ASCIIColors.error(f"Couldn't load "+str(Path(parts[1])/f"{db_name}.sqlite")+" consider revectorizing it")
|
||||
@ -355,7 +353,7 @@ class LollmsApplication(LoLLMsCom):
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
self.warning(f"Couldn't load vllm")
|
||||
ASCIIColors.execute_with_animation("Loading local TTT services", start_ttt,ASCIIColors.color_blue)
|
||||
ASCIIColors.execute_with_animation("Loading TTT services", start_ttt,ASCIIColors.color_blue)
|
||||
print("OK")
|
||||
def start_stt(*args, **kwargs):
|
||||
if self.config.whisper_activate or self.config.active_stt_service == "whisper":
|
||||
@ -372,7 +370,7 @@ class LollmsApplication(LoLLMsCom):
|
||||
from lollms.services.stt.whisper.lollms_whisper import LollmsWhisper
|
||||
self.stt = LollmsWhisper(self, self.config.whisper_model)
|
||||
|
||||
ASCIIColors.execute_with_animation("Loading loacal STT services", start_stt, ASCIIColors.color_blue)
|
||||
ASCIIColors.execute_with_animation("Loading STT services", start_stt, ASCIIColors.color_blue)
|
||||
print("OK")
|
||||
|
||||
def start_tts(*args, **kwargs):
|
||||
@ -403,7 +401,7 @@ class LollmsApplication(LoLLMsCom):
|
||||
elif self.config.active_tts_service == "xtts" and self.xtts:
|
||||
self.tts = self.xtts
|
||||
|
||||
ASCIIColors.execute_with_animation("Loading loacal TTS services", start_tts, ASCIIColors.color_blue)
|
||||
ASCIIColors.execute_with_animation("Loading TTS services", start_tts, ASCIIColors.color_blue)
|
||||
print("OK")
|
||||
|
||||
def start_tti(*args, **kwargs):
|
||||
|
@ -1,5 +1,5 @@
|
||||
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
|
||||
version: 135
|
||||
version: 136
|
||||
binding_name: null
|
||||
model_name: null
|
||||
model_variant: null
|
||||
@ -269,8 +269,8 @@ audio_silenceTimer: 5000
|
||||
|
||||
# Data vectorization
|
||||
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
|
||||
rag_vectorizer: bert # possible values bert, tfidf, word2vec
|
||||
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer: semantic # possible values semantic, tfidf, openai
|
||||
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer_parameters: null # Parameters of the model in json format
|
||||
rag_chunk_size: 512 # number of tokens per chunk
|
||||
rag_overlap: 0 # number of tokens of overlap
|
||||
@ -285,6 +285,7 @@ rag_min_nb_tokens_in_chunk: 10 #this removed any useless junk ith less than x to
|
||||
rag_max_n_hops: 3 # We set the maximum number of hops in multi-hop RAG
|
||||
|
||||
rag_deactivate: false # if you have a large context model, you can activate this to use your document as a whole
|
||||
rag_vectorizer_openai_key: "" # The OpenAI key (if not provided, this will use the environment variable OPENAI_API_KEY)
|
||||
|
||||
contextual_summary: false # If activated, this completely replaces the RAG and uses contextual summary instead
|
||||
|
||||
|
@ -10,8 +10,6 @@ from lollms.paths import LollmsPaths
|
||||
from lollms.com import LoLLMsCom
|
||||
|
||||
from lollmsvectordb.vector_database import VectorDatabase
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
from lollmsvectordb.text_document_loader import TextDocumentsLoader
|
||||
import gc
|
||||
import json
|
||||
@ -776,9 +774,19 @@ class Discussion:
|
||||
self.update_file_lists()
|
||||
|
||||
if len(self.text_files)>0:
|
||||
if self.lollms.config.rag_vectorizer=="semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
vectorizer = SemanticVectorizer(self.lollms.config.rag_vectorizer_model)
|
||||
elif self.lollms.config.rag_vectorizer=="tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
vectorizer = TFIDFVectorizer()
|
||||
elif self.lollms.config.rag_vectorizer=="openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
vectorizer = OpenAIVectorizer(self.lollms.config.rag_vectorizer_model, self.lollms.config.rag_vectorizer_openai_key)
|
||||
|
||||
self.vectorizer = VectorDatabase(
|
||||
self.discussion_rag_folder/"db.sqli",
|
||||
BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
|
||||
vectorizer,
|
||||
self.lollms.model,
|
||||
chunk_size=self.lollms.config.rag_chunk_size,
|
||||
overlap=self.lollms.config.rag_overlap
|
||||
@ -795,17 +803,17 @@ class Discussion:
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
try:
|
||||
self.vectorizer.index()
|
||||
self.vectorizer.build_index()
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
else:
|
||||
self.vectorizer = None
|
||||
|
||||
def update_file_lists(self):
|
||||
self.text_files = [Path(file) for file in self.discussion_text_folder.glob('*')]
|
||||
self.image_files = [Path(file) for file in self.discussion_images_folder.glob('*')]
|
||||
self.audio_files = [Path(file) for file in self.discussion_audio_folder.glob('*')]
|
||||
self.rag_db = [Path(file) for file in self.discussion_rag_folder.glob('*')]
|
||||
self.text_files = [Path(file) for file in self.discussion_text_folder.glob('*') if not file.is_dir()]
|
||||
self.image_files = [Path(file) for file in self.discussion_images_folder.glob('*') if not file.is_dir()]
|
||||
self.audio_files = [Path(file) for file in self.discussion_audio_folder.glob('*') if not file.is_dir()]
|
||||
self.rag_db = [Path(file) for file in self.discussion_rag_folder.glob('*') if not file.is_dir()]
|
||||
|
||||
|
||||
def remove_file(self, file_name, callback=None):
|
||||
@ -937,9 +945,18 @@ class Discussion:
|
||||
self.lollms.ShowBlockingMessage("Processing file\nPlease wait ...")
|
||||
if process:
|
||||
if self.vectorizer is None:
|
||||
if self.lollms.config.rag_vectorizer == "semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
v = SemanticVectorizer(self.lollms.config.rag_vectorizer_model)
|
||||
elif self.lollms.config.rag_vectorizer == "tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif self.lollms.config.rag_vectorizer == "openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
v = OpenAIVectorizer(self.lollms.config.rag_vectorizer_openai_key)
|
||||
self.vectorizer = VectorDatabase(
|
||||
self.discussion_rag_folder/"db.sqli",
|
||||
BERTVectorizer(self.lollms.config.rag_vectorizer_model) if self.lollms.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
|
||||
v,
|
||||
self.lollms.model,
|
||||
)
|
||||
data = TextDocumentsLoader.read_file(path)
|
||||
|
@ -1,14 +1,14 @@
|
||||
import sqlite3
|
||||
from lollmsvectordb import VectorDatabase, BERTVectorizer
|
||||
from lollmsvectordb import VectorDatabase, SemanticVectorizer
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
import numpy as np
|
||||
from ascii_colors import ASCIIColors
|
||||
class SkillsLibrary:
|
||||
|
||||
def __init__(self, db_path, model_name: str = 'bert-base-nli-mean-tokens', chunk_size:int=512, overlap:int=0, n_neighbors:int=5):
|
||||
def __init__(self, db_path, model_name: str = 'sentence-transformers/bert-base-nli-mean-tokens', chunk_size:int=512, overlap:int=0, n_neighbors:int=5):
|
||||
self.db_path =db_path
|
||||
self._initialize_db()
|
||||
self.vectorizer = VectorDatabase("", BERTVectorizer(), TikTokenTokenizer(),chunk_size, overlap, n_neighbors)
|
||||
self.vectorizer = VectorDatabase(db_path, SemanticVectorizer(model_name), TikTokenTokenizer(),chunk_size, overlap, n_neighbors)
|
||||
ASCIIColors.green("Vecorizer ready")
|
||||
|
||||
|
||||
|
@ -337,9 +337,9 @@ def get_image_gen_prompt(agent_name, number_of_entries=5) -> Tuple[str, str]:
|
||||
"""
|
||||
try:
|
||||
from lollmsvectordb.vector_database import VectorDatabase
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
db = VectorDatabase("", BERTVectorizer(), TikTokenTokenizer(), number_of_entries)
|
||||
db = VectorDatabase("", SemanticVectorizer(), TikTokenTokenizer(), number_of_entries)
|
||||
|
||||
image_gen_prompts = get_prompts_list()
|
||||
for entry in image_gen_prompts:
|
||||
|
@ -134,9 +134,9 @@ def get_system_prompt(agent_name, number_of_entries=5) -> Tuple[str, str]:
|
||||
"""
|
||||
try:
|
||||
from lollmsvectordb.vector_database import VectorDatabase
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
db = VectorDatabase("", BERTVectorizer(), TikTokenTokenizer(), number_of_entries)
|
||||
db = VectorDatabase("", SemanticVectorizer(), TikTokenTokenizer(), number_of_entries)
|
||||
|
||||
system_prompts = get_prompts()
|
||||
|
||||
|
@ -317,7 +317,7 @@ def internet_search(query, internet_nb_search_pages, chromedriver_path=None, qui
|
||||
|
||||
return search_results
|
||||
|
||||
def internet_search_with_vectorization(query, chromedriver_path=None, internet_nb_search_pages=5, internet_vectorization_chunk_size=512, internet_vectorization_overlap_size=20, internet_vectorization_nb_chunks=4, model = None, quick_search:bool=False, vectorizer = "bert", vectorize=True, asses_using_llm=True, yes_no=None):
|
||||
def internet_search_with_vectorization(query, chromedriver_path=None, internet_nb_search_pages=5, internet_vectorization_chunk_size=512, internet_vectorization_overlap_size=20, internet_vectorization_nb_chunks=4, model = None, quick_search:bool=False, vectorizer = "semantic", vectorize=True, asses_using_llm=True, yes_no=None):
|
||||
"""
|
||||
"""
|
||||
|
||||
@ -328,15 +328,15 @@ def internet_search_with_vectorization(query, chromedriver_path=None, internet_n
|
||||
from lollmsvectordb.text_document_loader import TextDocumentsLoader
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
|
||||
if vectorizer == "bert":
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
v = BERTVectorizer()
|
||||
if vectorizer == "semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
v = SemanticVectorizer()
|
||||
elif vectorizer == "tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif vectorizer == "word2vec":
|
||||
from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
|
||||
v = Word2VecVectorizer()
|
||||
elif vectorizer == "openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
v = OpenAIVectorizer()
|
||||
|
||||
vectorizer = VectorDatabase("", v, TikTokenTokenizer(), internet_vectorization_chunk_size, internet_vectorization_overlap_size)
|
||||
|
||||
|
@ -17,7 +17,7 @@ from lollms.utilities import PromptReshaper, PackageManager, discussion_path_to_
|
||||
from lollms.com import NotificationType, NotificationDisplayType
|
||||
from lollms.client_session import Session, Client
|
||||
from lollmsvectordb.vector_database import VectorDatabase
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
from lollmsvectordb.text_document_loader import TextDocumentsLoader
|
||||
from lollmsvectordb.database_elements.document import Document
|
||||
@ -898,16 +898,16 @@ class AIPersonality:
|
||||
if self.data_path.exists():
|
||||
self.database_path = self.data_path / "db.sqlite"
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
vectorizer = self.config.rag_vectorizer
|
||||
if vectorizer == "bert":
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
v = BERTVectorizer()
|
||||
|
||||
if vectorizer == "semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
v = SemanticVectorizer()
|
||||
elif vectorizer == "tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif vectorizer == "word2vec":
|
||||
from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
|
||||
v = Word2VecVectorizer()
|
||||
elif vectorizer == "openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
v = OpenAIVectorizer()
|
||||
|
||||
self.persona_data_vectorizer = VectorDatabase(self.database_path, v, TikTokenTokenizer(), self.config.rag_chunk_size, self.config.rag_overlap)
|
||||
|
||||
@ -1063,9 +1063,19 @@ class AIPersonality:
|
||||
self.ShowBlockingMessage("Processing file\nPlease wait ...")
|
||||
if process:
|
||||
if self.vectorizer is None:
|
||||
if self.config.rag_vectorizer == "semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
v = SemanticVectorizer(self.config.rag_vectorizer_model)
|
||||
elif self.config.rag_vectorizer == "tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif self.config.rag_vectorizer == "openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
v = OpenAIVectorizer()
|
||||
|
||||
self.vectorizer = VectorDatabase(
|
||||
client.discussion.discussion_rag_folder/"db.sqli",
|
||||
BERTVectorizer(self.config.rag_vectorizer_model) if self.config.rag_vectorizer=="bert" else TFIDFVectorizer(),
|
||||
v,
|
||||
self.model,
|
||||
chunk_size=self.config.rag_chunk_size,
|
||||
overlap=self.config.rag_overlap
|
||||
@ -2900,15 +2910,15 @@ class APScript(StateMachine):
|
||||
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
vectorizer = self.config.rag_vectorizer
|
||||
if vectorizer == "bert":
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
v = BERTVectorizer()
|
||||
if vectorizer == "semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
v = SemanticVectorizer(self.config.rag_vectorizer_model)
|
||||
elif vectorizer == "tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif vectorizer == "word2vec":
|
||||
from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
|
||||
v = Word2VecVectorizer()
|
||||
elif vectorizer == "openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
v = OpenAIVectorizer()
|
||||
|
||||
vectorizer = VectorDatabase("", v, TikTokenTokenizer(), self.config.rag_chunk_size, self.config.rag_overlap)
|
||||
vectorizer.add_document(title, text, url)
|
||||
|
@ -248,8 +248,8 @@ audio_silenceTimer: 5000
|
||||
|
||||
# Data vectorization
|
||||
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
|
||||
rag_vectorizer: bert # possible values bert, tfidf, word2vec
|
||||
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer: semantic # possible values semantic, tfidf, openai
|
||||
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer_parameters: null # Parameters of the model in json format
|
||||
rag_chunk_size: 512 # number of tokens per chunk
|
||||
rag_overlap: 0 # number of tokens of overlap
|
||||
|
@ -128,21 +128,21 @@ def select_rag_database(client) -> Optional[Dict[str, Path]]:
|
||||
if not PackageManager.check_package_installed_with_version("lollmsvectordb","0.6.0"):
|
||||
PackageManager.install_or_update("lollmsvectordb")
|
||||
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
from lollmsvectordb import VectorDatabase
|
||||
from lollmsvectordb.text_document_loader import TextDocumentsLoader
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
|
||||
|
||||
if lollmsElfServer.config.rag_vectorizer == "bert":
|
||||
lollmsElfServer.backup_trust_store()
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
v = BERTVectorizer()
|
||||
lollmsElfServer.restore_trust_store()
|
||||
|
||||
if lollmsElfServer.config.rag_vectorizer == "semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
v = SemanticVectorizer(lollmsElfServer.config.rag_vectorizer_model)
|
||||
elif lollmsElfServer.config.rag_vectorizer == "tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif lollmsElfServer.config.rag_vectorizer == "openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
v = OpenAIVectorizer(lollmsElfServer.config.rag_vectorizer_openai_key)
|
||||
|
||||
vdb = VectorDatabase(Path(folder_path)/f"{db_name}.sqlite", v, lollmsElfServer.model if lollmsElfServer.model else TikTokenTokenizer())
|
||||
# Get all files in the folder
|
||||
@ -269,14 +269,16 @@ def toggle_mount_rag_database(database_infos: MountDatabase):
|
||||
from lollmsvectordb.text_document_loader import TextDocumentsLoader
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
|
||||
if lollmsElfServer.config.rag_vectorizer == "bert":
|
||||
lollmsElfServer.backup_trust_store()
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
v = BERTVectorizer()
|
||||
lollmsElfServer.restore_trust_store()
|
||||
if lollmsElfServer.config.rag_vectorizer == "semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
v = SemanticVectorizer(lollmsElfServer.config.rag_vectorizer_model)
|
||||
elif lollmsElfServer.config.rag_vectorizer == "tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif lollmsElfServer.config.rag_vectorizer == "openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
v = OpenAIVectorizer(lollmsElfServer.config.rag_vectorizer_openai_key)
|
||||
|
||||
|
||||
vdb = VectorDatabase(Path(path)/f"{database_infos.database_name}.sqlite", v, lollmsElfServer.model if lollmsElfServer.model else TikTokenTokenizer(), chunk_size=lollmsElfServer.config.rag_chunk_size, clean_chunks=lollmsElfServer.config.rag_clean_chunks, n_neighbors=lollmsElfServer.config.rag_n_chunks)
|
||||
lollmsElfServer.active_rag_dbs.append({"name":database_infos.database_name,"path":path,"vectorizer":vdb})
|
||||
@ -328,21 +330,22 @@ async def vectorize_folder(database_infos: FolderInfos):
|
||||
if not PackageManager.check_package_installed_with_version("lollmsvectordb","0.6.0"):
|
||||
PackageManager.install_or_update("lollmsvectordb")
|
||||
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
from lollmsvectordb import VectorDatabase
|
||||
from lollmsvectordb.text_document_loader import TextDocumentsLoader
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
|
||||
|
||||
if lollmsElfServer.config.rag_vectorizer == "bert":
|
||||
lollmsElfServer.backup_trust_store()
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
v = BERTVectorizer()
|
||||
lollmsElfServer.restore_trust_store()
|
||||
|
||||
if lollmsElfServer.config.rag_vectorizer == "semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
v = SemanticVectorizer(lollmsElfServer.config.rag_vectorizer_model)
|
||||
elif lollmsElfServer.config.rag_vectorizer == "tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif lollmsElfServer.config.rag_vectorizer == "openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
v = OpenAIVectorizer(lollmsElfServer.config.rag_vectorizer_openai_key)
|
||||
|
||||
vector_db_path = Path(folder_path)/f"{db_name}.sqlite"
|
||||
|
||||
vdb = VectorDatabase(vector_db_path, v, lollmsElfServer.model if lollmsElfServer.model else TikTokenTokenizer(), reset=True)
|
||||
|
@ -13,7 +13,7 @@ from typing import List, Optional, Union
|
||||
from pathlib import Path
|
||||
from lollmsvectordb.database_elements.chunk import Chunk
|
||||
from lollmsvectordb.vector_database import VectorDatabase
|
||||
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
import sqlite3
|
||||
import secrets
|
||||
@ -67,9 +67,20 @@ def get_user_vectorizer(user_key: str):
|
||||
small_key = hashlib.md5(user_key.encode()).hexdigest()[:8]
|
||||
user_folder = lollmsElfServer.lollms_paths.personal_outputs_path / str(user_key)
|
||||
user_folder.mkdir(parents=True, exist_ok=True)
|
||||
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
|
||||
if lollmsElfServer.config.rag_vectorizer == "semantic":
|
||||
from lollmsvectordb.lollms_vectorizers.semantic_vectorizer import SemanticVectorizer
|
||||
v = SemanticVectorizer(lollmsElfServer.config.rag_vectorizer_model)
|
||||
elif lollmsElfServer.config.rag_vectorizer == "tfidf":
|
||||
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
|
||||
v = TFIDFVectorizer()
|
||||
elif lollmsElfServer.config.rag_vectorizer == "openai":
|
||||
from lollmsvectordb.lollms_vectorizers.openai_vectorizer import OpenAIVectorizer
|
||||
v = OpenAIVectorizer(lollmsElfServer.config.rag_vectorizer_openai_key)
|
||||
|
||||
return VectorDatabase(
|
||||
str(user_folder / f"rag_db_{small_key}.sqlite"),
|
||||
BERTVectorizer(lollmsElfServer.config.rag_vectorizer_model) if lollmsElfServer.config.rag_vectorizer == "bert" else TFIDFVectorizer(),
|
||||
v, TikTokenTokenizer(),
|
||||
lollmsElfServer.model,
|
||||
chunk_size=lollmsElfServer.config.rag_chunk_size,
|
||||
overlap=lollmsElfServer.config.rag_overlap
|
||||
|
@ -32,7 +32,7 @@ import shutil
|
||||
from tqdm import tqdm
|
||||
import threading
|
||||
from io import BytesIO
|
||||
|
||||
import os
|
||||
|
||||
|
||||
class LollmsDalle(LollmsTTI):
|
||||
@ -44,7 +44,7 @@ class LollmsDalle(LollmsTTI):
|
||||
output_path=None
|
||||
):
|
||||
super().__init__(generation_engine,app)
|
||||
self.key = key
|
||||
self.key = key or os.getenv('OPENAI_API_KEY')
|
||||
self.generation_engine = generation_engine
|
||||
self.output_path = output_path
|
||||
|
||||
|
@ -248,8 +248,8 @@ audio_silenceTimer: 5000
|
||||
|
||||
# Data vectorization
|
||||
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
|
||||
rag_vectorizer: bert # possible values bert, tfidf, word2vec
|
||||
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer: semantic # possible values semantic, tfidf, openai
|
||||
rag_vectorizer_model: sentence-transformers/bert-base-nli-mean-tokens # The model name if applicable
|
||||
rag_vectorizer_parameters: null # Parameters of the model in json format
|
||||
rag_chunk_size: 512 # number of tokens per chunk
|
||||
rag_overlap: 0 # number of tokens of overlap
|
||||
|
Loading…
x
Reference in New Issue
Block a user