upgraded vector db

This commit is contained in:
Saifeddine ALOUI 2024-07-18 01:32:11 +02:00
parent 6f40981651
commit 1437b2c40d
20 changed files with 513 additions and 175 deletions

View File

@ -1,5 +1,5 @@
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
version: 125
version: 127
binding_name: null
model_name: null
model_variant: null
@ -153,6 +153,7 @@ xtts_top_k: 50
xtts_top_p: 0.85
xtts_speed: 1
xtts_enable_text_splitting: true
xtts_freq: 22050
# openai_tts configuration
openai_tts_key: ""

View File

@ -1,35 +1,53 @@
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
version: 81
version: 118
binding_name: null
model_name: null
model_variant: null
model_type: null
show_news_panel: True
show_news_panel: true
# Security measures
turn_on_setting_update_validation: True
turn_on_code_execution: True
turn_on_code_validation: True
turn_on_open_file_validation: False
turn_on_send_file_validation: False
turn_on_setting_update_validation: true
turn_on_code_execution: true
turn_on_code_validation: true
turn_on_open_file_validation: true
turn_on_send_file_validation: true
turn_on_language_validation: true
force_accept_remote_access: false
# Server information
headless_server_mode: False
headless_server_mode: false
allowed_origins: []
# Host information
host: localhost
port: 9600
app_custom_logo: ""
# Generation parameters
discussion_prompt_separator: "!@>"
start_header_id_template: "!@>"
end_header_id_template: ": "
separator_template: "\n"
start_user_header_id_template: "!@>"
end_user_header_id_template: ": "
end_user_message_id_template: ""
start_ai_header_id_template: "!@>"
end_ai_header_id_template: ": "
end_ai_message_id_template: ""
system_message_template: "system"
seed: -1
ctx_size: 4084
max_n_predict: 4096
min_n_predict: 512
min_n_predict: 1024
temperature: 0.9
top_k: 50
top_p: 0.95
@ -50,14 +68,14 @@ user_name: user
user_description: ""
use_user_name_in_discussions: false
use_model_name_in_discussions: false
user_avatar: default_user.svg
user_avatar: null
use_user_informations_in_discussion: false
# UI parameters
discussion_db_name: default
# Automatic updates
debug: False
debug: false
debug_log_file_path: ""
auto_update: true
auto_sync_personalities: true
@ -77,23 +95,104 @@ auto_show_browser: true
# copy to clipboard
copy_to_clipboard_add_all_details: false
# -------------------- Services global configurations --------------------------
# Select the active text to speech, text to image, speech to text and text to music services
active_tts_service: "None" # xtts (offline), openai_tts (API key required)
active_tti_service: "None" # autosd (offline), dall-e (online)
active_stt_service: "None" # whisper (offline), asr (offline or online), openai_whisper (API key required)
active_ttm_service: "None" # musicgen (offline)
# -------------------- Services --------------------------
# ***************** STT *****************
stt_input_device: 0
# STT service
stt_listening_threshold: 1000
stt_silence_duration: 2
stt_sound_threshold_percentage: 10
stt_gain: 1.0
stt_rate: 44100
stt_channels: 1
stt_buffer_size: 10
stt_activate_word_detection: false
stt_word_detection_file: null
# ASR STT service
asr_enable: false
asr_base_url: http://localhost:9000
# openai_whisper configuration
openai_whisper_key: ""
openai_whisper_model: "whisper-1"
# whisper configuration
whisper_activate: false
whisper_model: base
# ***************** TTS *****************
tts_output_device: 0
# Voice service
auto_read: false
xtts_current_voice: null
xtts_current_language: en
xtts_stream_chunk_size: 100
xtts_temperature: 0.75
xtts_length_penalty: 1.0
xtts_repetition_penalty: 5.0
xtts_top_k: 50
xtts_top_p: 0.85
xtts_speed: 1
xtts_enable_text_splitting: true
# openai_tts configuration
openai_tts_key: ""
openai_tts_model: "tts-1"
openai_tts_voice: "alloy"
# ***************** TTI *****************
use_negative_prompt: true
use_ai_generated_negative_prompt: false
negative_prompt_generation_prompt: Generate negative prompt for the following prompt. negative prompt is a set of words that describe things we do not want to have in the generated image.
default_negative_prompt: (((text))), (((ugly))), (((duplicate))), ((morbid)), ((mutilated)), out of frame, extra fingers, mutated hands, ((poorly drawn hands)), ((poorly drawn face)), (((mutation))), (((deformed))), blurry, ((bad anatomy)), (((bad proportions))), ((extra limbs)), cloned face, (((disfigured))), ((extra arms)), (((extra legs))), mutated hands, (fused fingers), (too many fingers), (((long neck))), ((watermark)), ((robot eyes))
# Image generation service
enable_sd_service: false
sd_base_url: http://localhost:7860
# Fooocus image generation service
enable_fooocus_service: false
fooocus_base_url: http://localhost:7860
# diffuser
diffusers_offloading_mode: sequential_cpu_offload # sequential_cpu_offload
diffusers_model: PixArt-alpha/PixArt-Sigma-XL-2-1024-MS
# Dall e service key
dall_e_key: ""
dall_e_generation_engine: "dall-e-3"
# Midjourney service key
midjourney_key: ""
# Image generation service comfyui
enable_comfyui_service: false
comfyui_base_url: http://127.0.0.1:8188/
comfyui_model: v1-5-pruned-emaonly.ckpt
# Motion control service
enable_motion_ctrl_service: false
motion_ctrl_base_url: http://localhost:7861
# ***************** TTT *****************
# ollama service
enable_ollama_service: false
ollama_base_url: http://localhost:11434
@ -107,6 +206,11 @@ petals_device: cuda
# lollms service
enable_lollms_service: false
lollms_base_url: http://localhost:1234
lollms_access_keys : "" # set a list of keys separated by comma to restrict access
activate_lollms_server: true
activate_ollama_emulator: true
activate_openai_emulator: true
activate_mistralai_emulator: true
# elastic search service
elastic_search_service: false
@ -131,13 +235,22 @@ audio_auto_send_input: true
audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_n_chunks: 4 #Number of chunks to recover from the database
rag_clean_chunks: true #Removes all unnecessary spaces and line returns
rag_follow_subfolders: true #if true the vectorizer will vectorize the content of subfolders too
rag_check_new_files_at_startup: false #if true, the vectorizer will automatically check for any new files in the folder and adds it to the database
rag_preprocess_chunks: false #if true, an LLM will preprocess the content of the chunk before writing it in a simple format
activate_skills_lib: false # Activate vectorizing previous conversations
skills_lib_database_name: "default" # Default skills database
summarize_discussion: false # activate discussion summary (better but adds computation time)
max_summary_size: 512 # in tokens
data_vectorization_visualize_on_vectorization: false
use_files: true # Activate using files
data_vectorization_activate: true # To activate/deactivate data vectorization
data_vectorization_method: "tfidf_vectorizer" #"model_embedding" or "tfidf_vectorizer"
data_visualization_method: "PCA" #"PCA" or "TSNE"
@ -154,12 +267,13 @@ data_vectorization_make_persistance: false # If true, the data will be persistan
# Activate internet search
activate_internet_search: false
activate_internet_pages_judgement: true
internet_vectorization_chunk_size: 512 # chunk size
internet_vectorization_overlap_size: 128 # overlap between chunks size
internet_vectorization_nb_chunks: 2 # number of chunks to use
internet_nb_search_pages: 3 # number of pages to select
internet_quick_search: False # If active the search engine will not load and read the webpages
internet_activate_search_decision: False # If active the ai decides by itself if it needs to do search
internet_vectorization_overlap_size: 0 # overlap between chunks size
internet_vectorization_nb_chunks: 4 # number of chunks to use
internet_nb_search_pages: 8 # number of pages to select
internet_quick_search: false # If active the search engine will not load and read the webpages
internet_activate_search_decision: false # If active the ai decides by itself if it needs to do search
# Helpers
pdf_latex_path: null
@ -167,7 +281,7 @@ pdf_latex_path: null
positive_boost: null
negative_boost: null
current_language: english
fun_mode: False
fun_mode: false
# webui configurations
@ -175,5 +289,3 @@ show_code_of_conduct: true
activate_audio_infos: true
# whisper configuration
whisper_model: base

View File

@ -1,35 +1,53 @@
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
version: 81
version: 118
binding_name: null
model_name: null
model_variant: null
model_type: null
show_news_panel: True
show_news_panel: true
# Security measures
turn_on_setting_update_validation: True
turn_on_code_execution: True
turn_on_code_validation: True
turn_on_open_file_validation: False
turn_on_send_file_validation: False
turn_on_setting_update_validation: true
turn_on_code_execution: true
turn_on_code_validation: true
turn_on_open_file_validation: true
turn_on_send_file_validation: true
turn_on_language_validation: true
force_accept_remote_access: false
# Server information
headless_server_mode: False
headless_server_mode: false
allowed_origins: []
# Host information
host: localhost
port: 9600
app_custom_logo: ""
# Generation parameters
discussion_prompt_separator: "!@>"
start_header_id_template: "!@>"
end_header_id_template: ": "
separator_template: "\n"
start_user_header_id_template: "!@>"
end_user_header_id_template: ": "
end_user_message_id_template: ""
start_ai_header_id_template: "!@>"
end_ai_header_id_template: ": "
end_ai_message_id_template: ""
system_message_template: "system"
seed: -1
ctx_size: 4084
max_n_predict: 4096
min_n_predict: 512
min_n_predict: 1024
temperature: 0.9
top_k: 50
top_p: 0.95
@ -50,14 +68,14 @@ user_name: user
user_description: ""
use_user_name_in_discussions: false
use_model_name_in_discussions: false
user_avatar: default_user.svg
user_avatar: null
use_user_informations_in_discussion: false
# UI parameters
discussion_db_name: default
# Automatic updates
debug: False
debug: false
debug_log_file_path: ""
auto_update: true
auto_sync_personalities: true
@ -77,23 +95,104 @@ auto_show_browser: true
# copy to clipboard
copy_to_clipboard_add_all_details: false
# -------------------- Services global configurations --------------------------
# Select the active text to speech, text to image, speech to text and text to music services
active_tts_service: "None" # xtts (offline), openai_tts (API key required)
active_tti_service: "None" # autosd (offline), dall-e (online)
active_stt_service: "None" # whisper (offline), asr (offline or online), openai_whisper (API key required)
active_ttm_service: "None" # musicgen (offline)
# -------------------- Services --------------------------
# ***************** STT *****************
stt_input_device: 0
# STT service
stt_listening_threshold: 1000
stt_silence_duration: 2
stt_sound_threshold_percentage: 10
stt_gain: 1.0
stt_rate: 44100
stt_channels: 1
stt_buffer_size: 10
stt_activate_word_detection: false
stt_word_detection_file: null
# ASR STT service
asr_enable: false
asr_base_url: http://localhost:9000
# openai_whisper configuration
openai_whisper_key: ""
openai_whisper_model: "whisper-1"
# whisper configuration
whisper_activate: false
whisper_model: base
# ***************** TTS *****************
tts_output_device: 0
# Voice service
auto_read: false
xtts_current_voice: null
xtts_current_language: en
xtts_stream_chunk_size: 100
xtts_temperature: 0.75
xtts_length_penalty: 1.0
xtts_repetition_penalty: 5.0
xtts_top_k: 50
xtts_top_p: 0.85
xtts_speed: 1
xtts_enable_text_splitting: true
# openai_tts configuration
openai_tts_key: ""
openai_tts_model: "tts-1"
openai_tts_voice: "alloy"
# ***************** TTI *****************
use_negative_prompt: true
use_ai_generated_negative_prompt: false
negative_prompt_generation_prompt: Generate negative prompt for the following prompt. negative prompt is a set of words that describe things we do not want to have in the generated image.
default_negative_prompt: (((text))), (((ugly))), (((duplicate))), ((morbid)), ((mutilated)), out of frame, extra fingers, mutated hands, ((poorly drawn hands)), ((poorly drawn face)), (((mutation))), (((deformed))), blurry, ((bad anatomy)), (((bad proportions))), ((extra limbs)), cloned face, (((disfigured))), ((extra arms)), (((extra legs))), mutated hands, (fused fingers), (too many fingers), (((long neck))), ((watermark)), ((robot eyes))
# Image generation service
enable_sd_service: false
sd_base_url: http://localhost:7860
# Fooocus image generation service
enable_fooocus_service: false
fooocus_base_url: http://localhost:7860
# diffuser
diffusers_offloading_mode: sequential_cpu_offload # sequential_cpu_offload
diffusers_model: PixArt-alpha/PixArt-Sigma-XL-2-1024-MS
# Dall e service key
dall_e_key: ""
dall_e_generation_engine: "dall-e-3"
# Midjourney service key
midjourney_key: ""
# Image generation service comfyui
enable_comfyui_service: false
comfyui_base_url: http://127.0.0.1:8188/
comfyui_model: v1-5-pruned-emaonly.ckpt
# Motion control service
enable_motion_ctrl_service: false
motion_ctrl_base_url: http://localhost:7861
# ***************** TTT *****************
# ollama service
enable_ollama_service: false
ollama_base_url: http://localhost:11434
@ -107,6 +206,11 @@ petals_device: cuda
# lollms service
enable_lollms_service: false
lollms_base_url: http://localhost:1234
lollms_access_keys : "" # set a list of keys separated by comma to restrict access
activate_lollms_server: true
activate_ollama_emulator: true
activate_openai_emulator: true
activate_mistralai_emulator: true
# elastic search service
elastic_search_service: false
@ -131,13 +235,22 @@ audio_auto_send_input: true
audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_n_chunks: 4 #Number of chunks to recover from the database
rag_clean_chunks: true #Removes all unnecessary spaces and line returns
rag_follow_subfolders: true #if true the vectorizer will vectorize the content of subfolders too
rag_check_new_files_at_startup: false #if true, the vectorizer will automatically check for any new files in the folder and adds it to the database
rag_preprocess_chunks: false #if true, an LLM will preprocess the content of the chunk before writing it in a simple format
activate_skills_lib: false # Activate vectorizing previous conversations
skills_lib_database_name: "default" # Default skills database
summarize_discussion: false # activate discussion summary (better but adds computation time)
max_summary_size: 512 # in tokens
data_vectorization_visualize_on_vectorization: false
use_files: true # Activate using files
data_vectorization_activate: true # To activate/deactivate data vectorization
data_vectorization_method: "tfidf_vectorizer" #"model_embedding" or "tfidf_vectorizer"
data_visualization_method: "PCA" #"PCA" or "TSNE"
@ -154,20 +267,21 @@ data_vectorization_make_persistance: false # If true, the data will be persistan
# Activate internet search
activate_internet_search: false
activate_internet_pages_judgement: true
internet_vectorization_chunk_size: 512 # chunk size
internet_vectorization_overlap_size: 128 # overlap between chunks size
internet_vectorization_nb_chunks: 2 # number of chunks to use
internet_nb_search_pages: 3 # number of pages to select
internet_quick_search: False # If active the search engine will not load and read the webpages
internet_activate_search_decision: False # If active the ai decides by itself if it needs to do search
internet_vectorization_overlap_size: 0 # overlap between chunks size
internet_vectorization_nb_chunks: 4 # number of chunks to use
internet_nb_search_pages: 8 # number of pages to select
internet_quick_search: false # If active the search engine will not load and read the webpages
internet_activate_search_decision: false # If active the ai decides by itself if it needs to do search
# Helpers
pdf_latex_path: null
# boosting information
positive_boost: null
negative_boost: null
current_language: null
fun_mode: False
current_language: english
fun_mode: false
# webui configurations
@ -175,5 +289,3 @@ show_code_of_conduct: true
activate_audio_infos: true
# whisper configuration
whisper_model: base

View File

@ -13,7 +13,6 @@ from lollms.utilities import PromptReshaper
from lollms.client_session import Client, Session
from lollms.databases.skills_database import SkillsLibrary
from lollms.tasks import TasksLibrary
from safe_store import TextVectorizer, VectorizationMethod, VisualizationMethod
from lollmsvectordb.database_elements.chunk import Chunk
from lollmsvectordb.vector_database import VectorDatabase
@ -335,7 +334,7 @@ class LollmsApplication(LoLLMsCom):
trace_exception(ex)
ASCIIColors.blue("Loading local TTS services")
if self.config.xtts_enable or self.config.active_tts_service == "xtts":
if self.config.active_tts_service == "xtts":
ASCIIColors.yellow("Loading XTTS")
try:
from lollms.services.xtts.lollms_xtts import LollmsXTTS
@ -348,6 +347,7 @@ class LollmsApplication(LoLLMsCom):
self.xtts = LollmsXTTS(
self,
voices_folders=[voices_folder, self.lollms_paths.custom_voices_path],
freq=self.config.xtts_freq
)
except Exception as ex:
trace_exception(ex)
@ -448,7 +448,7 @@ class LollmsApplication(LoLLMsCom):
trace_exception(ex)
ASCIIColors.blue("Loading loacal TTS services")
if (self.config.xtts_enable or self.config.active_tts_service == "xtts") and self.xtts is None:
if self.config.active_tts_service == "xtts" and self.xtts is None:
ASCIIColors.yellow("Loading XTTS")
try:
from lollms.services.xtts.lollms_xtts import LollmsXTTS
@ -461,6 +461,7 @@ class LollmsApplication(LoLLMsCom):
self.xtts = LollmsXTTS(
self,
voices_folders=[voices_folder, self.lollms_paths.custom_voices_path],
freq=self.config.xtts_freq
)
except Exception as ex:
trace_exception(ex)
@ -532,17 +533,6 @@ class LollmsApplication(LoLLMsCom):
trace_exception(ex)
def build_long_term_skills_memory(self):
discussion_db_name:Path = self.lollms_paths.personal_discussions_path/self.config.discussion_db_name.split(".")[0]
discussion_db_name.mkdir(exist_ok=True, parents=True)
self.long_term_memory = TextVectorizer(
vectorization_method=VectorizationMethod.TFIDF_VECTORIZER,
model=self.model,
database_path=discussion_db_name/"skills_memory.json",
save_db=True,
data_visualization_method=VisualizationMethod.PCA,
)
return self.long_term_memory
def process_chunk(
self,
@ -969,6 +959,7 @@ class LollmsApplication(LoLLMsCom):
f"{self.start_header_id_template}websearch query{self.end_header_id_template}"
])
query = self.personality.fast_gen(q, max_generation_size=256, show_progress=True, callback=self.personality.sink)
query = query.replace("\"","")
self.personality.step_end("Crafting internet search query")
self.personality.step(f"web search query: {query}")
@ -979,12 +970,12 @@ class LollmsApplication(LoLLMsCom):
internet_search_results=f"{self.system_full_header}Use the web search results data to answer {self.config.user_name}. Try to extract information from the web search and use it to perform the requested task or answer the question. Do not come up with information that is not in the websearch results. Try to stick to the websearch results and clarify if your answer was based on the resuts or on your own culture. If you don't know how to perform the task, then tell the user politely that you need more data inputs.{self.separator_template}{self.start_header_id_template}Web search results{self.end_header_id_template}\n"
docs, sorted_similarities, document_ids = self.personality.internet_search_with_vectorization(query, self.config.internet_quick_search, asses_using_llm=self.config.activate_internet_pages_judgement)
chunks:List[Chunk] = self.personality.internet_search_with_vectorization(query, self.config.internet_quick_search, asses_using_llm=self.config.activate_internet_pages_judgement)
if len(docs)>0:
for doc, infos,document_id in zip(docs, sorted_similarities, document_ids):
internet_search_infos.append(document_id)
internet_search_results += f"{self.start_header_id_template}search result chunk{self.end_header_id_template}\nchunk_infos:{document_id['url']}\nchunk_title:{document_id['title']}\ncontent:{doc}\n"
if len(chunks)>0:
for chunk in chunks:
internet_search_infos.append(chunk.doc.title)
internet_search_results += f"{self.start_header_id_template}search result chunk{self.end_header_id_template}\nchunk_infos:{chunk.doc.path}\nchunk_title:{chunk.doc.title}\ncontent:{doc}\n"
else:
internet_search_results += "The search response was empty!\nFailed to recover useful information from the search engine.\n"
if self.config.internet_quick_search:
@ -1051,9 +1042,12 @@ class LollmsApplication(LoLLMsCom):
docs = v.list_documents()
for doc in docs:
document=v.get_document(document_path = doc["path"])
self.personality.step_start(f"Summeryzing document {doc['path']}")
summary = self.personality.summarize_text(document, f"Extract information from the following text chunk to answer this request. If there is no information about the query, just return an empty string.\n{self.system_custom_header('query')}{query}", callback=self.personality.sink)
self.personality.step_end(f"Summeryzing document {doc['path']}")
self.personality.step_start(f"Summaryzing document {doc['path']}")
def post_process(summary):
return summary
summary = self.personality.summarize_text(document,
f"Extract information from the following text chunk to answer this request.\n{self.system_custom_header('query')}{query}", chunk_summary_post_processing=post_process, callback=self.personality.sink)
self.personality.step_end(f"Summaryzing document {doc['path']}")
document_infos = f"{self.separator_template}".join([
self.system_custom_header('document contextual summary'),
f"source_document_title:{doc['title']}",

View File

@ -1,5 +1,5 @@
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
version: 125
version: 127
binding_name: null
model_name: null
model_variant: null
@ -153,6 +153,7 @@ xtts_top_k: 50
xtts_top_p: 0.85
xtts_speed: 1
xtts_enable_text_splitting: true
xtts_freq: 22050
# openai_tts configuration
openai_tts_key: ""

View File

@ -7,9 +7,8 @@ from lollms.types import MSG_TYPE
from lollms.types import BindingType
from lollms.utilities import PackageManager, discussion_path_to_url
from lollms.paths import LollmsPaths
from lollms.databases.skills_database import SkillsLibrary
from lollms.com import LoLLMsCom
from safe_store import TextVectorizer, VisualizationMethod, GenericDataLoader
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
@ -671,7 +670,7 @@ class Discussion:
if len(self.vectorizer.list_documents())==0 and len(self.text_files)>0:
for path in self.text_files:
data = GenericDataLoader.read_file(path)
data = TextDocumentsLoader.read_file(path)
try:
self.vectorizer.add_document(path.stem, data, path, True)
except Exception as ex:
@ -833,7 +832,7 @@ class Discussion:
return True
except Exception as e:
trace_exception(e)
self.lollms.InfoMessage(f"Unsupported file format or empty file.\nSupported formats are {GenericDataLoader.get_supported_file_types()}",client_id=client.client_id)
self.lollms.InfoMessage(f"Unsupported file format or empty file.\nSupported formats are {TextDocumentsLoader.get_supported_file_types()}",client_id=client.client_id)
return False
def load_message(self, id):

View File

@ -1,6 +1,6 @@
from pathlib import Path
from lollms.personality import APScript
from safe_store.generic_data_loader import GenericDataLoader
from lollmsvectordb.text_document_loader import TextDocumentsLoader
from safe_store.text_vectorizer import TextVectorizer
import json
import re

View File

@ -7,7 +7,6 @@ from typing import Union
from lollms.utilities import PackageManager
from lollms.personality import APScript
from lollms.tts import LollmsTTS
from safe_store import GenericDataLoader
from ascii_colors import trace_exception
# Here is the core of the function to be built

View File

@ -7,7 +7,7 @@ from typing import Union
from lollms.utilities import PackageManager
from lollms.personality import APScript
from lollms.tts import LollmsTTS
from safe_store import GenericDataLoader
from lollmsvectordb import TextDocumentsLoader
from ascii_colors import trace_exception
# Here is the core of the function to be built
@ -28,7 +28,7 @@ def read_text_from_file(file_path: Union[Path, str], tts_module:LollmsTTS, llm:A
file_path = Path(file_path)
# Read the text from the file
text = GenericDataLoader.read_file(file_path)
text = TextDocumentsLoader.read_file(file_path)
# Generate audio from the text
audio_file_path = tts_module.tts_audio(text,use_threading=True)

View File

@ -29,7 +29,7 @@ def get_root_url(url):
def format_url_parameter(value:str):
encoded_value = value.strip().replace("\"","")
encoded_value = value.strip().replace("\"","").replace(" ","+")
return encoded_value
@ -294,7 +294,6 @@ def internet_search(query, internet_nb_search_pages, chromedriver_path=None, qui
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from safe_store.text_vectorizer import TextVectorizer, VectorizationMethod
search_results = []
@ -349,9 +348,10 @@ def internet_search_with_vectorization(query, chromedriver_path=None, internet_n
nb_non_empty = 0
# Configure Chrome options
driver = prepare_chrome_driver(chromedriver_path)
qquery = format_url_parameter(query)
url = f"https://duckduckgo.com/?q={qquery}&t=h_&ia=web"
results = extract_results(
f"https://duckduckgo.com/?q={format_url_parameter(query)}&t=h_&ia=web",
url,
internet_nb_search_pages,
driver
)
@ -369,13 +369,11 @@ def internet_search_with_vectorization(query, chromedriver_path=None, internet_n
nb_non_empty += 1
if nb_non_empty>=internet_nb_search_pages:
break
docs, sorted_similarities, document_ids = vectorizer.recover_text(query, internet_vectorization_nb_chunks)
vectorizer.build_index()
chunks = vectorizer.search(query, internet_vectorization_nb_chunks)
else:
docs = ["The web search has failed. Try using another query"]
sorted_similarities = [0]
document_ids = ["duckduckgo.com"]
chunks = []
# Close the browser
driver.quit()
return docs, sorted_similarities, document_ids
return chunks

View File

@ -20,7 +20,7 @@ from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
from lollmsvectordb.text_document_loader import TextDocumentsLoader
from lollmsvectordb.database_elements.document import Document
import pkg_resources
from pathlib import Path
from PIL import Image
@ -37,7 +37,11 @@ from lollms.types import MSG_TYPE, SUMMARY_MODE
import json
from typing import Any, List, Optional, Type, Callable, Dict, Any, Union
import json
from safe_store import TextVectorizer, GenericDataLoader, VisualizationMethod, VectorizationMethod, DocumentDecomposer
from lollmsvectordb.vector_database import VectorDatabase
from lollmsvectordb.text_document_loader import TextDocumentsLoader
from lollmsvectordb.text_chunker import TextChunker
import hashlib
from functools import partial
import sys
from lollms.com import LoLLMsCom
@ -910,42 +914,34 @@ class AIPersonality:
# Verify if the persona has a data folder
if self.data_path.exists():
self.database_path = self.data_path / "db.json"
if self.database_path.exists():
ASCIIColors.info("Loading database ...",end="")
self.persona_data_vectorizer = TextVectorizer(
"tfidf_vectorizer", # self.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer"
model=self.model, #needed in case of using model_embedding
save_db=True,
database_path=self.database_path,
data_visualization_method=VisualizationMethod.PCA,
database_dict=None)
ASCIIColors.green("Ok")
else:
files = [f for f in self.data_path.iterdir() if f.suffix.lower() in ['.asm', '.bat', '.c', '.cpp', '.cs', '.csproj', '.css',
'.csv', '.docx', '.h', '.hh', '.hpp', '.html', '.inc', '.ini', '.java', '.js', '.json', '.log',
'.lua', '.map', '.md', '.pas', '.pdf', '.php', '.pptx', '.ps1', '.py', '.rb', '.rtf', '.s', '.se', '.sh', '.sln',
'.snippet', '.snippets', '.sql', '.sym', '.ts', '.txt', '.xlsx', '.xml', '.yaml', '.yml', '.msg'] ]
if len(files)>0:
dl = GenericDataLoader()
self.persona_data_vectorizer = TextVectorizer(
"tfidf_vectorizer", # self.config.data_vectorization_method, # supported "model_embedding" or "tfidf_vectorizer"
model=self.model, #needed in case of using model_embedding
save_db=True,
database_path=self.database_path,
data_visualization_method=VisualizationMethod.PCA,
database_dict=None)
for f in files:
text = dl.read_file(f)
self.persona_data_vectorizer.add_document(f.name,text,self.config.data_vectorization_chunk_size, self.config.data_vectorization_overlap_size)
# data_vectorization_chunk_size: 512 # chunk size
# data_vectorization_overlap_size: 128 # overlap between chunks size
# data_vectorization_nb_chunks: 2 # number of chunks to use
self.persona_data_vectorizer.index()
self.persona_data_vectorizer.save_db()
else:
self.persona_data_vectorizer = None
self._data = None
self.database_path = self.data_path / "db.sqlite"
from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
vectorizer = self.config.rag_vectorizer
if vectorizer == "bert":
from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
v = BERTVectorizer()
elif vectorizer == "tfidf":
from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
v = TFIDFVectorizer()
elif vectorizer == "word2vec":
from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
v = Word2VecVectorizer()
self.persona_data_vectorizer = VectorDatabase(self.database_path, v, TikTokenTokenizer(), self.config.rag_chunk_size, self.config.rag_overlap)
files = [f for f in self.data_path.iterdir() if f.suffix.lower() in ['.asm', '.bat', '.c', '.cpp', '.cs', '.csproj', '.css',
'.csv', '.docx', '.h', '.hh', '.hpp', '.html', '.inc', '.ini', '.java', '.js', '.json', '.log',
'.lua', '.map', '.md', '.pas', '.pdf', '.php', '.pptx', '.ps1', '.py', '.rb', '.rtf', '.s', '.se', '.sh', '.sln',
'.snippet', '.snippets', '.sql', '.sym', '.ts', '.txt', '.xlsx', '.xml', '.yaml', '.yml', '.msg'] ]
dl = TextDocumentsLoader()
for f in files:
text = dl.read_file(f)
self.persona_data_vectorizer.add_document(f.name, text, f)
# data_vectorization_chunk_size: 512 # chunk size
# data_vectorization_overlap_size: 128 # overlap between chunks size
# data_vectorization_nb_chunks: 2 # number of chunks to use
self.persona_data_vectorizer.build_index()
else:
self.persona_data_vectorizer = None
@ -1820,7 +1816,7 @@ class AIPersonality:
while len(tk)>max_summary_size and (document_chunks is None or len(document_chunks)>1):
self.step_start(f"Comprerssing {doc_name}...")
chunk_size = int(self.config.ctx_size*0.6)
document_chunks = DocumentDecomposer.decompose_document(text, chunk_size, 0, self.model.tokenize, self.model.detokenize, True)
document_chunks =TextChunker.chunk_text(text, self.model, chunk_size, 0, True)
text = self.summarize_chunks(
document_chunks,
summary_instruction,
@ -1831,7 +1827,6 @@ class AIPersonality:
chunk_summary_post_processing=chunk_summary_post_processing,
summary_mode=summary_mode)
tk = self.model.tokenize(text)
tk = self.model.tokenize(text)
dtk_ln=prev_len-len(tk)
prev_len = len(tk)
self.step(f"Current text size : {prev_len}, max summary size : {max_summary_size}")
@ -1857,7 +1852,7 @@ class AIPersonality:
prev_len = len(tk)
while len(tk)>max_summary_size:
chunk_size = int(self.config.ctx_size*0.6)
document_chunks = DocumentDecomposer.decompose_document(text, chunk_size, 0, self.model.tokenize, self.model.detokenize, True)
document_chunks = TextChunker.chunk_text(text, self.model, chunk_size, 0, True)
text = self.summarize_chunks(
document_chunks,
data_extraction_instruction,
@ -2548,7 +2543,7 @@ class APScript(StateMachine):
while len(tk)>max_summary_size and (document_chunks is None or len(document_chunks)>1):
self.step_start(f"Comprerssing {doc_name}...")
chunk_size = int(self.personality.config.ctx_size*0.6)
document_chunks = DocumentDecomposer.decompose_document(text, chunk_size, 0, self.personality.model.tokenize, self.personality.model.detokenize, True)
document_chunks = TextChunker.chunk_text(text, self.model, chunk_size, 0, True)
text = self.summarize_chunks(
document_chunks,
summary_instruction,
@ -2585,7 +2580,7 @@ class APScript(StateMachine):
prev_len = len(tk)
while len(tk)>max_summary_size:
chunk_size = int(self.personality.config.ctx_size*0.6)
document_chunks = DocumentDecomposer.decompose_document(text, chunk_size, 0, self.personality.model.tokenize, self.personality.model.detokenize, True)
document_chunks = TextChunker.chunk_text(text, self.model, chunk_size, 0, True)
text = self.summarize_chunks(
document_chunks,
data_extraction_instruction,
@ -2893,15 +2888,25 @@ class APScript(StateMachine):
return self.personality.internet_search_with_vectorization(query, quick_search=quick_search)
def vectorize_and_query(self, text, query, max_chunk_size=512, overlap_size=20, internet_vectorization_nb_chunks=3):
    """
    Chunk a text, index the chunks with a TF-IDF TextVectorizer and query them.

    Args:
        text: The raw text to decompose and index.
        query: The query passed to the vectorizer's recover_text.
        max_chunk_size: Chunk size handed to DocumentDecomposer.decompose_document.
        overlap_size: Overlap handed to DocumentDecomposer.decompose_document.
        internet_vectorization_nb_chunks: Number of results requested from recover_text.

    Returns:
        The (docs, sorted_similarities) pair taken from recover_text; the
        third returned element (document ids) is discarded.
    """
    model = self.personality.model
    vectorizer = TextVectorizer(VectorizationMethod.TFIDF_VECTORIZER, model = model)
    # The decomposer yields token chunks; each one is turned back into text before indexing.
    token_chunks = DocumentDecomposer().decompose_document(text, max_chunk_size, overlap_size, model.tokenize, model.detokenize)
    for index, token_chunk in enumerate(token_chunks):
        vectorizer.add_document(f"chunk_{index}", model.detokenize(token_chunk))
    vectorizer.index()
    docs, sorted_similarities, _ = vectorizer.recover_text(query, internet_vectorization_nb_chunks)
    return docs, sorted_similarities
def vectorize_and_query(self, title, url, text, query, max_chunk_size=512, overlap_size=20, internet_vectorization_nb_chunks=3):
    """
    Index one document in a fresh VectorDatabase and search it with a query.

    Args:
        title: Document title, passed as the first argument of add_document.
        url: Document source, passed as the third argument of add_document.
        text: The document content to index.
        query: The search query.
        max_chunk_size: Kept for interface compatibility; currently unused,
            the chunk size comes from self.config.rag_chunk_size.
        overlap_size: Kept for interface compatibility; currently unused,
            the overlap comes from self.config.rag_overlap.
        internet_vectorization_nb_chunks: Number of results requested from search.

    Returns:
        Whatever VectorDatabase.search returns for the query.

    Raises:
        ValueError: If self.config.rag_vectorizer is not "bert", "tfidf" or
            "word2vec" (previously this surfaced as an UnboundLocalError).
    """
    vectorizer_name = self.config.rag_vectorizer
    # Validate before importing anything heavy so a bad configuration fails fast and clearly.
    if vectorizer_name not in ("bert", "tfidf", "word2vec"):
        raise ValueError(
            f"Unsupported rag_vectorizer '{vectorizer_name}'. Possible values: bert, tfidf, word2vec"
        )

    from lollmsvectordb.lollms_tokenizers.tiktoken_tokenizer import TikTokenTokenizer
    if vectorizer_name == "bert":
        from lollmsvectordb.lollms_vectorizers.bert_vectorizer import BERTVectorizer
        v = BERTVectorizer()
    elif vectorizer_name == "tfidf":
        from lollmsvectordb.lollms_vectorizers.tfidf_vectorizer import TFIDFVectorizer
        v = TFIDFVectorizer()
    else:  # "word2vec"
        from lollmsvectordb.lollms_vectorizers.word2vec_vectorizer import Word2VecVectorizer
        v = Word2VecVectorizer()

    # An empty path is passed as the database location - presumably a non-persistent
    # database; TODO confirm against VectorDatabase.
    # NOTE(review): confirm that rag_overlap is defined in the active configuration.
    db = VectorDatabase("", v, TikTokenTokenizer(), self.config.rag_chunk_size, self.config.rag_overlap)
    db.add_document(title, text, url)
    db.build_index()
    chunks = db.search(query, internet_vectorization_nb_chunks)
    return chunks
def step_start(self, step_text, callback: Callable[[str, MSG_TYPE, dict, list], bool]=None):

View File

@ -18,7 +18,6 @@ from ascii_colors import ASCIIColors
from lollms.databases.discussions_database import DiscussionsDB, Discussion
from typing import List
import shutil
from safe_store.text_vectorizer import TextVectorizer, VectorizationMethod, VisualizationMethod
import tqdm
from pathlib import Path
class GenerateRequest(BaseModel):

View File

@ -14,7 +14,7 @@ from pydantic import BaseModel
from starlette.responses import StreamingResponse
from lollms.types import MSG_TYPE
from lollms.main_config import BaseConfig
from lollms.utilities import output_file_path_to_url, detect_antiprompt, remove_text_from_string, trace_exception, find_first_available_file_index, add_period, PackageManager
from lollms.utilities import find_next_available_filename, output_file_path_to_url, detect_antiprompt, remove_text_from_string, trace_exception, find_first_available_file_index, add_period, PackageManager
from lollms.security import sanitize_path, validate_path, check_access
from pathlib import Path
from ascii_colors import ASCIIColors
@ -176,8 +176,7 @@ async def text2Wave(request: LollmsText2AudioRequest):
request.fn = (lollmsElfServer.lollms_paths.personal_outputs_path/"audio_out")/request.fn
validate_path(request.fn,[str(lollmsElfServer.lollms_paths.personal_outputs_path/"audio_out")])
else:
request.fn = lollmsElfServer.lollms_paths.personal_outputs_path/"audio_out"/"tts2audio.wav"
request.fn = find_next_available_filename(lollmsElfServer.lollms_paths.personal_outputs_path/"audio_out", "tts_out","wave")
# Verify the path exists
request.fn.parent.mkdir(exist_ok=True, parents=True)
@ -236,6 +235,7 @@ def start_xtts():
lollmsElfServer.tts = LollmsXTTS(
lollmsElfServer,
voices_folders=[voices_folder, lollmsElfServer.lollms_paths.custom_voices_path],
freq=lollmsElfServer.config.xtts_freq
)
lollmsElfServer.HideBlockingMessage()
except Exception as ex:

View File

@ -18,7 +18,6 @@ from ascii_colors import ASCIIColors
from lollms.databases.discussions_database import DiscussionsDB
from lollms.security import check_access
from pathlib import Path
from safe_store.text_vectorizer import TextVectorizer, VectorizationMethod, VisualizationMethod
import tqdm
from fastapi import FastAPI, UploadFile, File
import shutil

View File

@ -34,8 +34,9 @@ from queue import Queue
import re
class LollmsXTTS(LollmsTTS):
def __init__(self, app: LollmsApplication, voices_folders: List[str|Path]):
def __init__(self, app: LollmsApplication, voices_folders: List[str|Path], freq = 22050):
super().__init__("lollms_xtts", app)
self.freq = freq
self.generation_threads = {}
self.voices_folders = [Path(v) for v in voices_folders] + [Path(__file__).parent/"voices"]
self.stop_event = threading.Event()
@ -75,7 +76,7 @@ class LollmsXTTS(LollmsTTS):
def get(app: LollmsApplication) -> 'LollmsXTTS':
    """
    Build a LollmsXTTS instance for the given application.

    Args:
        app: The application whose lollms_paths and config are used.

    Returns:
        A LollmsXTTS using the application's custom voices folder and the
        configured xtts_freq.

    Raises:
        Exception: If LollmsXTTS.verify reports the service is not installed.
    """
    # Verify if the service is installed and if true then return an instance of LollmsXTTS
    if not LollmsXTTS.verify(app.lollms_paths):
        raise Exception("LollmsXTTS service is not installed properly.")
    # __init__ iterates over voices_folders, so the single custom voices path
    # must be wrapped in a list (a bare path would not be iterated as folders).
    return LollmsXTTS(app, [app.lollms_paths.custom_voices_path], freq=app.config.xtts_freq)
def get_speaker_wav(self, speaker) -> Path:
@ -147,7 +148,7 @@ class LollmsXTTS(LollmsTTS):
if wav is None:
# Play any remaining buffered sentences
for buffered_wav in buffer:
self.play_obj = sa.play_buffer(buffered_wav.tobytes(), 1, 2, 22050)
self.play_obj = sa.play_buffer(buffered_wav.tobytes(), 1, 2, self.freq)
self.play_obj.wait_done()
time.sleep(0.5) # Pause between sentences
ASCIIColors.green("Audio done")
@ -156,7 +157,7 @@ class LollmsXTTS(LollmsTTS):
buffered_sentences += 1
if buffered_sentences >= 2:
for buffered_wav in buffer:
self.play_obj = sa.play_buffer(buffered_wav.tobytes(), 1, 2, 22050)
self.play_obj = sa.play_buffer(buffered_wav.tobytes(), 1, 2, self.freq)
self.play_obj.wait_done()
time.sleep(0.5) # Pause between sentences
buffer = []
@ -166,7 +167,7 @@ class LollmsXTTS(LollmsTTS):
with wave.open(str(file_name_or_path), 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(22050)
wf.setframerate(self.freq)
for wav in wav_data:
wf.writeframes(wav.tobytes())

View File

@ -7,7 +7,10 @@ from ascii_colors import ASCIIColors
from lollms.types import MSG_TYPE, SUMMARY_MODE
from lollms.com import LoLLMsCom
from lollms.utilities import PromptReshaper, remove_text_from_string, process_ai_output
from safe_store import DocumentDecomposer
from lollmsvectordb.text_chunker import TextChunker
from lollmsvectordb.database_elements.document import Document
from lollmsvectordb.directory_binding import DirectoryBinding
import hashlib
import json
class TasksLibrary:
def __init__(self, lollms:LoLLMsCom, callback: Callable[[str, MSG_TYPE, dict, list], bool]=None) -> None:
@ -566,7 +569,11 @@ class TasksLibrary:
while len(tk)>max_summary_size and (document_chunks is None or len(document_chunks)>1):
self.step_start(f"Comprerssing {doc_name}... [depth {depth+1}]")
chunk_size = int(self.lollms.config.ctx_size*0.6)
document_chunks = DocumentDecomposer.decompose_document(text, chunk_size, 0, self.lollms.model.tokenize, self.lollms.model.detokenize, True)
tc = TextChunker(chunk_size, 0, model= self.lollms.model)
hasher = hashlib.md5()
hasher.update(text.encode("utf8"))
document_chunks = tc.get_text_chunks(text, Document(hasher.hexdigest(), doc_name ) )
text = self.summarize_chunks(
document_chunks,
summary_instruction,
@ -577,7 +584,6 @@ class TasksLibrary:
chunk_summary_post_processing=chunk_summary_post_processing,
summary_mode=summary_mode)
tk = self.lollms.model.tokenize(text)
tk = self.lollms.model.tokenize(text)
dtk_ln=prev_len-len(tk)
prev_len = len(tk)
self.step(f"Current text size : {prev_len}, max summary size : {max_summary_size}")

View File

@ -608,7 +608,7 @@ def add_period(text):
processed_text = '\n'.join(processed_lines)
return processed_text
def find_next_available_filename(folder_path, prefix):
def find_next_available_filename(folder_path, prefix, extension="png"):
folder = Path(folder_path)
if not folder.exists():
@ -616,7 +616,7 @@ def find_next_available_filename(folder_path, prefix):
index = 1
while True:
next_filename = f"{prefix}_{index}.png"
next_filename = f"{prefix}_{index}.{extension}"
potential_file = folder / next_filename
if not potential_file.exists():
return potential_file

View File

@ -1,35 +1,53 @@
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
version: 81
version: 118
binding_name: null
model_name: null
model_variant: null
model_type: null
show_news_panel: True
show_news_panel: true
# Security measures
turn_on_setting_update_validation: True
turn_on_code_execution: True
turn_on_code_validation: True
turn_on_open_file_validation: False
turn_on_send_file_validation: False
turn_on_setting_update_validation: true
turn_on_code_execution: true
turn_on_code_validation: true
turn_on_open_file_validation: true
turn_on_send_file_validation: true
turn_on_language_validation: true
force_accept_remote_access: false
# Server information
headless_server_mode: False
headless_server_mode: false
allowed_origins: []
# Host information
host: localhost
port: 9600
app_custom_logo: ""
# Generation parameters
discussion_prompt_separator: "!@>"
start_header_id_template: "!@>"
end_header_id_template: ": "
separator_template: "\n"
start_user_header_id_template: "!@>"
end_user_header_id_template: ": "
end_user_message_id_template: ""
start_ai_header_id_template: "!@>"
end_ai_header_id_template: ": "
end_ai_message_id_template: ""
system_message_template: "system"
seed: -1
ctx_size: 4084
max_n_predict: 4096
min_n_predict: 512
min_n_predict: 1024
temperature: 0.9
top_k: 50
top_p: 0.95
@ -50,14 +68,14 @@ user_name: user
user_description: ""
use_user_name_in_discussions: false
use_model_name_in_discussions: false
user_avatar: default_user.svg
user_avatar: null
use_user_informations_in_discussion: false
# UI parameters
discussion_db_name: default
# Automatic updates
debug: False
debug: false
debug_log_file_path: ""
auto_update: true
auto_sync_personalities: true
@ -77,23 +95,104 @@ auto_show_browser: true
# copy to clipboard
copy_to_clipboard_add_all_details: false
# -------------------- Services global configurations --------------------------
# Select the active text to speech, text to image and speech to text services
active_tts_service: "None" # xtts (offline), openai_tts (API key required)
active_tti_service: "None" # autosd (offline), dall-e (online)
active_stt_service: "None" # whisper (offline), asr (offline or online), openai_whisper (API key required)
active_ttm_service: "None" # musicgen (offline)
# -------------------- Services --------------------------
# ***************** STT *****************
stt_input_device: 0
# STT service
stt_listening_threshold: 1000
stt_silence_duration: 2
stt_sound_threshold_percentage: 10
stt_gain: 1.0
stt_rate: 44100
stt_channels: 1
stt_buffer_size: 10
stt_activate_word_detection: false
stt_word_detection_file: null
# ASR STT service
asr_enable: false
asr_base_url: http://localhost:9000
# openai_whisper configuration
openai_whisper_key: ""
openai_whisper_model: "whisper-1"
# whisper configuration
whisper_activate: false
whisper_model: base
# ***************** TTS *****************
tts_output_device: 0
# Voice service
auto_read: false
xtts_current_voice: null
xtts_current_language: en
xtts_stream_chunk_size: 100
xtts_temperature: 0.75
xtts_length_penalty: 1.0
xtts_repetition_penalty: 5.0
xtts_top_k: 50
xtts_top_p: 0.85
xtts_speed: 1
xtts_enable_text_splitting: true
# openai_tts configuration
openai_tts_key: ""
openai_tts_model: "tts-1"
openai_tts_voice: "alloy"
# ***************** TTI *****************
use_negative_prompt: true
use_ai_generated_negative_prompt: false
negative_prompt_generation_prompt: Generate negative prompt for the following prompt. negative prompt is a set of words that describe things we do not want to have in the generated image.
default_negative_prompt: (((text))), (((ugly))), (((duplicate))), ((morbid)), ((mutilated)), out of frame, extra fingers, mutated hands, ((poorly drawn hands)), ((poorly drawn face)), (((mutation))), (((deformed))), blurry, ((bad anatomy)), (((bad proportions))), ((extra limbs)), cloned face, (((disfigured))), ((extra arms)), (((extra legs))), mutated hands, (fused fingers), (too many fingers), (((long neck))), ((watermark)), ((robot eyes))
# Image generation service
enable_sd_service: false
sd_base_url: http://localhost:7860
# Image generation service
enable_fooocus_service: false
fooocus_base_url: http://localhost:7860
# diffuser
diffusers_offloading_mode: sequential_cpu_offload # sequential_cpu_offload
diffusers_model: PixArt-alpha/PixArt-Sigma-XL-2-1024-MS
# Dall e service key
dall_e_key: ""
dall_e_generation_engine: "dall-e-3"
# Midjourney service key
midjourney_key: ""
# Image generation service comfyui
enable_comfyui_service: false
comfyui_base_url: http://127.0.0.1:8188/
comfyui_model: v1-5-pruned-emaonly.ckpt
# Motion control service
enable_motion_ctrl_service: false
motion_ctrl_base_url: http://localhost:7861
# ***************** TTT *****************
# ollama service
enable_ollama_service: false
ollama_base_url: http://localhost:11434
@ -107,6 +206,11 @@ petals_device: cuda
# lollms service
enable_lollms_service: false
lollms_base_url: http://localhost:1234
lollms_access_keys : "" # set a list of keys separated by comma to restrict access
activate_lollms_server: true
activate_ollama_emulator: true
activate_openai_emulator: true
activate_mistralai_emulator: true
# elastic search service
elastic_search_service: false
@ -131,13 +235,22 @@ audio_auto_send_input: true
audio_silenceTimer: 5000
# Data vectorization
rag_databases: [] # This is the list of paths to database sources. Each database is a folder containing data
rag_vectorizer: bert # possible values bert, tfidf, word2vec
rag_vectorizer_model: bert-base-nli-mean-tokens # The model name if applicable
rag_vectorizer_parameters: null # Parameters of the model in json format
rag_chunk_size: 512 # number of tokens per chunk
rag_n_chunks: 4 #Number of chunks to recover from the database
rag_clean_chunks: true #Removes all unnecessary spaces and line returns
rag_follow_subfolders: true #if true the vectorizer will vectorize the content of subfolders too
rag_check_new_files_at_startup: false #if true, the vectorizer will automatically check for any new files in the folder and add them to the database
rag_preprocess_chunks: false #if true, an LLM will preprocess the content of the chunk before writing it in a simple format
activate_skills_lib: false # Activate vectorizing previous conversations
skills_lib_database_name: "default" # Default skills database
summarize_discussion: false # activate discussion summary (better but adds computation time)
max_summary_size: 512 # in tokens
data_vectorization_visualize_on_vectorization: false
use_files: true # Activate using files
data_vectorization_activate: true # To activate/deactivate data vectorization
data_vectorization_method: "tfidf_vectorizer" #"model_embedding" or "tfidf_vectorizer"
data_visualization_method: "PCA" #"PCA" or "TSNE"
@ -154,20 +267,21 @@ data_vectorization_make_persistance: false # If true, the data will be persistan
# Activate internet search
activate_internet_search: false
activate_internet_pages_judgement: true
internet_vectorization_chunk_size: 512 # chunk size
internet_vectorization_overlap_size: 128 # overlap between chunks size
internet_vectorization_nb_chunks: 2 # number of chunks to use
internet_nb_search_pages: 3 # number of pages to select
internet_quick_search: False # If active the search engine will not load and read the webpages
internet_activate_search_decision: False # If active the ai decides by itself if it needs to do search
internet_vectorization_overlap_size: 0 # overlap between chunks size
internet_vectorization_nb_chunks: 4 # number of chunks to use
internet_nb_search_pages: 8 # number of pages to select
internet_quick_search: false # If active the search engine will not load and read the webpages
internet_activate_search_decision: false # If active the ai decides by itself if it needs to do search
# Helpers
pdf_latex_path: null
# boosting information
positive_boost: null
negative_boost: null
current_language: null
fun_mode: False
current_language: english
fun_mode: false
# webui configurations
@ -175,5 +289,3 @@ show_code_of_conduct: true
activate_audio_infos: true
# whisper configuration
whisper_model: base

View File

@ -6,7 +6,7 @@ setuptools
requests
safe_store
lollmsvectordb
pipmaster
ascii_colors>=0.1.3
beautifulsoup4

View File

@ -5,8 +5,8 @@ wget
setuptools
requests
safe_store
ascii_colors>=0.1.3
lollmsvectordb
autopep8