saloui 2023-07-27 16:27:08 +02:00
parent d21ba70dff
commit 58a0c986c3
10 changed files with 281 additions and 107 deletions

View File

@@ -14,7 +14,7 @@ class MyConversation(Conversation):
         self.menu.main_menu()
         full_discussion += self.personality.user_message_prefix+prompt+self.personality.link_text
         full_discussion += self.personality.ai_message_prefix
-        def callback(text, type=None):
+        def callback(text, type=None, metadata:dict={}):
            print(text, end="")
            sys.stdout = sys.__stdout__
            sys.stdout.flush()

View File

@@ -7,7 +7,7 @@ class MyConversation(Conversation):
     def start_conversation(self):
         prompt = "Once apon a time"
-        def callback(text, type=None):
+        def callback(text, type=None, metadata:dict={}):
            print(text, end="")
            sys.stdout = sys.__stdout__
            sys.stdout.flush()

View File

@@ -7,6 +7,7 @@ from lollms.paths import LollmsPaths
 from lollms.app import LollmsApplication
 from lollms.terminal import MainMenu
+from typing import Callable
 from pathlib import Path
 import argparse
 import yaml
@@ -133,7 +134,7 @@ Participating personalities:
         full_discussion = ""
         return full_discussion
-    def safe_generate(self, full_discussion:str, n_predict=None, callback=None):
+    def safe_generate(self, full_discussion:str, n_predict=None, callback: Callable[[str, int, dict], bool]=None):
         """safe_generate
         Args:
@@ -236,7 +237,7 @@ Participating personalities:
             self.personality.ai_message_prefix
         )
-        def callback(text, type:MSG_TYPE=None):
+        def callback(text, type:MSG_TYPE=None, metadata:dict={}):
            if type == MSG_TYPE.MSG_TYPE_CHUNK:
                # Replace stdout with the default stdout
                sys.stdout = sys.__stdout__
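Across these files the callback contract changes from `Callable[[str], None]` to `Callable[[str, int, dict], bool]`: every callback now receives a `metadata` dict alongside the text and message type, and returns a bool. A minimal sketch of a compatible callback (the `MSG_TYPE` values come from `lollms.types`; the handler body is illustrative):

```python
from lollms.types import MSG_TYPE

def my_callback(text: str, msg_type: MSG_TYPE = None, metadata: dict = {}) -> bool:
    # Stream plain chunks to the console
    if msg_type == MSG_TYPE.MSG_TYPE_CHUNK:
        print(text, end="", flush=True)
    # Step-end notifications now carry a status flag in metadata
    elif msg_type == MSG_TYPE.MSG_TYPE_STEP_END:
        print(f"\n[step done, status={metadata.get('status', True)}]")
    return True  # returning False presumably asks the binding to stop generating
```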

View File

@@ -12,6 +12,7 @@ from lollms.apps.console import MainMenu
 from lollms.paths import LollmsPaths
 from lollms.apps.console import MainMenu
 from lollms.app import LollmsApplication
+from lollms.utilities import TextVectorizer
 from typing import List, Tuple
 import importlib
 from pathlib import Path
@@ -20,6 +21,7 @@ import logging
 import yaml
 import copy
 import gc
+import json
 def reset_all_installs(lollms_paths:LollmsPaths):
     ASCIIColors.info("Removeing all configuration files to force reinstall")
     ASCIIColors.info(f"Searching files from {lollms_paths.personal_configuration_path}")
@@ -322,6 +324,96 @@ class LoLLMsServer(LollmsApplication):
                 emit('personality_add_failed', {'success':False, 'error': error_message}, room=request.sid)
+        @self.socketio.on('vectorize_text')
+        def vectorize_text(parameters:dict):
+            """Vectorizes text.
+            Args:
+                parameters (dict): contains
+                    'chunk_size': the maximum size of a text chunk (512 by default)
+                    'vectorization_method': either "model_embedding" or "ftidf_vectorizer" (default is "ftidf_vectorizer")
+                    'payloads': a list of dicts. Each entry has the following format:
+                        {
+                            "path": the path to the document,
+                            "text": the text of the document
+                        }
+                    'return_database': if True, the vectorized database is sent back to the client (default is True)
+                    'database_path': the path where the database is stored (default is None)
+            Returns a dict:
+                status: True on success, False otherwise
+                If you asked for the database to be sent back, you will also have these fields:
+                    "embeddings": a dictionary containing the text chunks with their ids and embeddings
+                    "texts": a dictionary of text chunks for each embedding (use the index for correspondence)
+                    "infos": extra information
+                    "vectorizer": the vectorizer state if tfidf is used, or None if the model embedding is used
+            """
+            vectorization_method = parameters.get('vectorization_method',"ftidf_vectorizer")
+            chunk_size = parameters.get("chunk_size",512)
+            payloads = parameters["payloads"]
+            database_path = parameters.get("database_path",None)
+            return_database = parameters.get("return_database",True)
+            if database_path is None and not return_database:
+                ASCIIColors.warning("Vectorization should either ask to save the database or to recover it. You didn't ask for either!")
+                emit('vectorized_db',{"status":False, "error":"Vectorization should either ask to save the database or to recover it. You didn't ask for either!"})
+                return
+            tv = TextVectorizer(vectorization_method, self.model)
+            for payload in payloads:
+                tv.add_document(payload["path"],payload["text"],chunk_size=chunk_size)
+            json_db = tv.toJson()
+            if return_database:
+                emit('vectorized_db',{**{"status":True}, **json_db})
+            else:
+                emit('vectorized_db',{"status":True})
+                with open(database_path, "w") as file:
+                    json.dump(json_db, file, indent=4)
+        @self.socketio.on('query_database')
+        def query_database(parameters:dict):
+            """Queries a database.
+            Args:
+                parameters (dict): contains
+                    'vectorization_method': either "model_embedding" or "ftidf_vectorizer"
+                    'database': a dict with the following format:
+                        {
+                            "embeddings": a dictionary containing the text chunks with their ids and embeddings
+                            "texts": a dictionary of text chunks for each embedding (use the index for correspondence)
+                            "infos": extra information
+                            "vectorizer": the vectorizer state if tfidf is used, or None if the model embedding is used
+                        }
+                    'database_path': if supplied, the database is loaded from this path
+                    'query': a query to search for in the database
+            """
+            vectorization_method = parameters['vectorization_method']
+            database = parameters.get("database",None)
+            query = parameters.get("query",None)
+            if query is None:
+                ASCIIColors.error("No query given!")
+                emit('vector_db_query',{"status":False, "error":"Please supply a query"})
+                return
+            if database is None:
+                database_path = parameters.get("database_path",None)
+                if database_path is None:
+                    ASCIIColors.error("No database given!")
+                    emit('vector_db_query',{"status":False, "error":"You did not supply a database file nor a database content"})
+                    return
+                else:
+                    with open(database_path, "r") as file:
+                        database = json.load(file)
+            tv = TextVectorizer(vectorization_method, self.model, database_dict=database)
+            docs, sorted_similarities = tv.recover_text(tv.embed_query(query))
+            emit('vectorized_db',{
+                "chunks":docs,
+                "refs":sorted_similarities
+            })
         @self.socketio.on('list_active_personalities')
         def handle_list_active_personalities():
             personality_names = [p.name for p in self.personalities]
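A hedged sketch of a client driving these two endpoints with the python-socketio client; the server address, file names, and printing logic are assumptions, while the event names and payload keys mirror the handlers above. Note that query results come back on the same 'vectorized_db' event, carrying 'chunks' and 'refs':

```python
import socketio

sio = socketio.Client()
sio.connect("http://localhost:9600")  # assumed server address

@sio.on('vectorized_db')
def on_vectorized_db(data):
    if "chunks" in data:                      # answer to a 'query_database' emit
        for chunk, ref in zip(data["chunks"], data["refs"]):
            print(ref, "->", chunk[:80])
    else:                                     # answer to a 'vectorize_text' emit
        print("vectorization status:", data.get("status"), data.get("error", ""))

# Vectorize two documents and store the database server-side
sio.emit('vectorize_text', {
    "chunk_size": 512,
    "vectorization_method": "ftidf_vectorizer",
    "payloads": [
        {"path": "doc1.txt", "text": "First document text."},
        {"path": "doc2.txt", "text": "Second document text."},
    ],
    "return_database": False,
    "database_path": "my_db.json",
})

# Later, query the stored database
sio.emit('query_database', {
    "vectorization_method": "ftidf_vectorizer",
    "database_path": "my_db.json",
    "query": "What is the first document about?",
})
```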
@@ -394,7 +486,7 @@ class LoLLMsServer(LollmsApplication):
             if personality_id==-1:
                 # Raw text generation
                 self.answer = {"full_text":""}
-                def callback(text, message_type: MSG_TYPE):
+                def callback(text, message_type: MSG_TYPE, metadata:dict={}):
                    if message_type == MSG_TYPE.MSG_TYPE_CHUNK:
                        ASCIIColors.success(f"generated:{len(self.answer['full_text'].split())} words", end='\r')
                        self.answer["full_text"] = self.answer["full_text"] + text
@@ -467,7 +559,7 @@ class LoLLMsServer(LollmsApplication):
                 full_discussion = personality.personality_conditioning + ''.join(full_discussion_blocks)
-                def callback(text, message_type: MSG_TYPE):
+                def callback(text, message_type: MSG_TYPE, metadata:dict={}):
                    if message_type == MSG_TYPE.MSG_TYPE_CHUNK:
                        self.answer["full_text"] = self.answer["full_text"] + text
                        self.socketio.emit('text_chunk', {'chunk': text}, room=client_id)

View File

@@ -120,7 +120,7 @@ Participating personalities:
         full_discussion = ""
         return full_discussion
-    def safe_generate(self, full_discussion:str, n_predict=None, callback=None):
+    def safe_generate(self, full_discussion:str, n_predict=None, callback: Callable[[str, int, dict], bool]=None):
         """safe_generate
         Args:

View File

@@ -195,9 +195,9 @@ class LLMBinding:
     def generate(self,
                  prompt:str,
                  n_predict: int = 128,
-                 callback: Callable[[str], None] = None,
+                 callback: Callable[[str, int, dict], bool] = None,
                  verbose: bool = False,
                  **gpt_params ):
         """Generates text out of a prompt
@@ -206,7 +206,7 @@ class LLMBinding:
         Args:
             prompt (str): The prompt to use for generation
             n_predict (int, optional): Number of tokens to prodict. Defaults to 128.
-            callback (Callable[[str], None], optional): A callback function that is called everytime a new text element is generated. Defaults to None.
+            callback (Callable[[str, int, dict], None], optional): A callback function that is called everytime a new text element is generated. Defaults to None.
            verbose (bool, optional): If true, the code will spit many informations about the generation process. Defaults to False.
        """
        pass
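For binding authors, the updated hint means `generate` should hand each chunk to the callback together with a message type and a metadata dict, and treat a False return as a stop request (that is what the new `Callable[[str, int, dict], bool]` hint implies). A minimal sketch; `self.stream_tokens` is a hypothetical token source, not part of the LLMBinding API:

```python
from lollms.types import MSG_TYPE

def generate(self, prompt: str, n_predict: int = 128, callback=None, verbose: bool = False, **gpt_params):
    output = ""
    for chunk in self.stream_tokens(prompt, max_tokens=n_predict):  # hypothetical token source
        output += chunk
        if callback is not None:
            # new contract: (text, message type, metadata) -> keep going?
            if not callback(chunk, MSG_TYPE.MSG_TYPE_CHUNK, {}):
                break
    return output
```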

View File

@@ -17,6 +17,7 @@ import subprocess
 import yaml
 from lollms.helpers import ASCIIColors
 from lollms.types import MSG_TYPE
+from typing import Callable
 import json
@@ -54,7 +55,8 @@ class AIPersonality:
                  model:LLMBinding=None,
                  run_scripts=True,
                  is_relative_path=True,
-                 installation_option:InstallOption=InstallOption.INSTALL_IF_NECESSARY
+                 installation_option:InstallOption=InstallOption.INSTALL_IF_NECESSARY,
+                 callback: Callable[[str, int, dict], bool]=None
                  ):
         """
         Initialize an AIPersonality instance.
@@ -68,6 +70,7 @@ class AIPersonality:
         self.lollms_paths = lollms_paths
         self.model = model
         self.config = config
+        self.callback = callback
         self.files = []
@@ -248,7 +251,7 @@ Date: {{date}}
             module = importlib.util.module_from_spec(module_spec)
             module_spec.loader.exec_module(module)
             if hasattr(module, "Processor"):
-                self._processor = module.Processor(self)
+                self._processor = module.Processor(self, callback=self.callback)
             else:
                 self._processor = None
         else:
@@ -881,7 +884,7 @@ class StateMachine:
-    def process_state(self, command, full_context, callback=None):
+    def process_state(self, command, full_context, callback: Callable[[str, int, dict], bool]=None):
         """
         Process the given command based on the current state.
@@ -922,7 +925,8 @@ class APScript(StateMachine):
                  self,
                  personality :AIPersonality,
                  personality_config :TypedConfig,
-                 states_dict :dict = {}
+                 states_dict :dict = {},
+                 callback = None
                  ) -> None:
         super().__init__(states_dict)
         self.files=[]
@@ -932,6 +936,7 @@ class APScript(StateMachine):
         self.configuration_file_path = self.personality.lollms_paths.personal_configuration_path/f"personality_{self.personality.personality_folder_name}.yaml"
         self.personality_config.config.file_path = self.configuration_file_path
+        self.callback = callback
         # Installation
         if (not self.configuration_file_path.exists() or self.installation_option==InstallOption.FORCE_INSTALL) and self.installation_option!=InstallOption.NEVER_INSTALL:
             self.install()
@@ -990,8 +995,7 @@ class APScript(StateMachine):
         else:
             ASCIIColors.error("Pytorch installed successfully!!")
-    def add_file(self, path, callback=None):
-        self.callback=callback
+    def add_file(self, path):
         self.files.append(path)
         return True
@@ -1104,7 +1108,7 @@ class APScript(StateMachine):
         else:
             return False
-    def run_workflow(self, prompt:str, previous_discussion_text:str="", callback=None):
+    def run_workflow(self, prompt:str, previous_discussion_text:str="", callback: Callable[[str, int, dict], bool]=None):
         """
         Runs the workflow for processing the model input and output.
@@ -1121,7 +1125,7 @@ class APScript(StateMachine):
         """
         return None
-    def step_start(self, step_text, callback=None):
+    def step_start(self, step_text, callback: Callable[[str, int, dict], bool]=None):
         """This triggers a step start
         Args:
@@ -1131,7 +1135,7 @@ class APScript(StateMachine):
         if callback:
             callback(step_text, MSG_TYPE.MSG_TYPE_STEP_START)
-    def step_end(self, step_text, callback=None):
+    def step_end(self, step_text, status=True, callback: Callable[[str, int, dict], bool]=None):
         """This triggers a step end
         Args:
@@ -1139,9 +1143,9 @@ class APScript(StateMachine):
             callback (callable, optional): A callable with this signature (str, MSG_TYPE) to send the step end to. Defaults to None.
         """
         if callback:
-            callback(step_text, MSG_TYPE.MSG_TYPE_STEP_END)
+            callback(step_text, MSG_TYPE.MSG_TYPE_STEP_END, {'status':status})
-    def step(self, step_text, callback=None):
+    def step(self, step_text, callback: Callable[[str, int, dict], bool]=None):
         """This triggers a step information
         Args:
@@ -1151,7 +1155,7 @@ class APScript(StateMachine):
         if callback:
             callback(step_text, MSG_TYPE.MSG_TYPE_STEP)
-    def exception(self, ex, callback=None):
+    def exception(self, ex, callback: Callable[[str, int, dict], bool]=None):
         """This sends exception to the client
         Args:
@@ -1161,7 +1165,7 @@ class APScript(StateMachine):
         if callback:
             callback(str(ex), MSG_TYPE.MSG_TYPE_EXCEPTION)
-    def warning(self, warning:str, callback=None):
+    def warning(self, warning:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends exception to the client
         Args:
@@ -1171,7 +1175,7 @@ class APScript(StateMachine):
         if callback:
             callback(warning, MSG_TYPE.MSG_TYPE_EXCEPTION)
-    def info(self, info:str, callback=None):
+    def info(self, info:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends exception to the client
         Args:
@@ -1181,7 +1185,7 @@ class APScript(StateMachine):
         if callback:
             callback(info, MSG_TYPE.MSG_TYPE_INFO)
-    def json(self, json_infos:dict, callback=None):
+    def json(self, json_infos:dict, callback: Callable[[str, int, dict], bool]=None):
         """This sends json data to front end
         Args:
@@ -1191,7 +1195,7 @@ class APScript(StateMachine):
         if callback:
             callback(json.dumps(json_infos), MSG_TYPE.MSG_TYPE_JSON_INFOS)
-    def ui(self, html_ui:str, callback=None):
+    def ui(self, html_ui:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends ui elements to front end
         Args:
@@ -1201,7 +1205,7 @@ class APScript(StateMachine):
         if callback:
             callback(html_ui, MSG_TYPE.MSG_TYPE_UI)
-    def code(self, code:str, callback=None):
+    def code(self, code:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends code to front end
         Args:
@@ -1211,7 +1215,7 @@ class APScript(StateMachine):
         if callback:
             callback(code, MSG_TYPE.MSG_TYPE_CODE)
-    def full(self, full_text:str, callback=None):
+    def full(self, full_text:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends full text to front end
         Args:
@@ -1224,7 +1228,7 @@ class APScript(StateMachine):
         if callback:
             callback(full_text, MSG_TYPE.MSG_TYPE_FULL)
-    def full_invisible_to_ai(self, full_text:str, callback=None):
+    def full_invisible_to_ai(self, full_text:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends full text to front end (INVISIBLE to AI)
         Args:
@@ -1237,7 +1241,7 @@ class APScript(StateMachine):
         if callback:
             callback(full_text, MSG_TYPE.MSG_TYPE_FULL_INVISIBLE_TO_AI)
-    def full_invisible_to_user(self, full_text:str, callback=None):
+    def full_invisible_to_user(self, full_text:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends full text to front end (INVISIBLE to user)
         Args:
@@ -1251,7 +1255,7 @@ class APScript(StateMachine):
             callback(full_text, MSG_TYPE.MSG_TYPE_FULL_INVISIBLE_TO_USER)
-    def info(self, info_text:str, callback=None):
+    def info(self, info_text:str, callback: Callable[[str, int, dict], bool]=None):
         """This sends info text to front end
         Args:
@@ -1264,7 +1268,7 @@ class APScript(StateMachine):
         if callback:
             callback(info_text, MSG_TYPE.MSG_TYPE_FULL)
-    def step_progress(self, progress:float, callback=None):
+    def step_progress(self, step_text:str, progress:float, callback: Callable[[str, int, dict], bool]=None):
         """This sends step rogress to front end
         Args:
@@ -1275,8 +1279,16 @@ class APScript(StateMachine):
             callback = self.callback
         if callback:
-            callback(str(progress), MSG_TYPE.MSG_TYPE_STEP_PROGRESS)
+            callback(step_text, MSG_TYPE.MSG_TYPE_STEP_PROGRESS, {'progress':progress})
+    # Helper method to convert outputs path to url
+    def path2url(file):
+        file = str(file).replace("\\","/")
+        pth = file.split('/')
+        idx = pth.index("outputs")
+        pth = "/".join(pth[idx:])
+        file_path = f"![](/{pth})\n"
+        return file_path
 # ===========================================================
 class AIPersonalityInstaller:
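Taken together, the APScript changes let a personality script report structured progress. A hedged sketch of a `run_workflow` using the new helper signatures (the step name and loop are illustrative):

```python
def run_workflow(self, prompt: str, previous_discussion_text: str = "", callback=None):
    self.step_start("Analyzing the prompt", callback)
    for i in range(5):
        # progress now travels in the metadata dict as {'progress': value}
        self.step_progress("Analyzing the prompt", (i + 1) * 20.0, callback)
    # step_end gained a status flag, forwarded as {'status': status}
    self.step_end("Analyzing the prompt", status=True, callback=callback)
    answer = "done"
    self.full(answer, callback)
    return answer
```

Note that `path2url` as written takes the path as its only argument (no `self`) and returns a markdown image reference rooted at the `outputs` folder, e.g. `![](/outputs/imgs/render.png)`.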

View File

@@ -1,51 +1,85 @@
 from lollms.personality import APScript
 from lollms.helpers import ASCIIColors, trace_exception
+from lollms.paths import LollmsPaths
+from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
+import json
 from pathlib import Path
-import numpy as np
 import json
+class TFIDFLoader:
+    @staticmethod
+    def create_vectorizer_from_dict(tfidf_info):
+        vectorizer = TfidfVectorizer(**tfidf_info['params'])
+        vectorizer.vocabulary_ = tfidf_info['vocabulary']
+        vectorizer.idf_ = [tfidf_info['idf_values'][feature] for feature in vectorizer.get_feature_names()]
+        return vectorizer
+    @staticmethod
+    def create_dict_from_vectorizer(vectorizer):
+        tfidf_info = {
+            "vocabulary": vectorizer.vocabulary_,
+            "idf_values": dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)),
+            "params": vectorizer.get_params()
+        }
+        return tfidf_info
 class TextVectorizer:
-    def __init__(self, processor):
-        self.processor:APScript = processor
-        self.personality = self.processor.personality
-        self.model = self.personality.model
-        self.personality_config = self.processor.personality_config
-        self.lollms_paths = self.personality.lollms_paths
-        self.embeddings = {}
-        self.texts = {}
-        self.ready = False
-        self.vectorizer = None
-        self.database_file = Path(self.lollms_paths.personal_data_path/self.personality_config["database_path"])
-        self.visualize_data_at_startup=self.personality_config["visualize_data_at_startup"]
-        self.visualize_data_at_add_file=self.personality_config["visualize_data_at_add_file"]
-        self.visualize_data_at_generate=self.personality_config["visualize_data_at_generate"]
-        if self.personality_config.vectorization_method=="model_embedding":
-            try:
-                if self.model.embed("hi")==None:
-                    self.personality_config.vectorization_method="ftidf_vectorizer"
-                    self.infos={
-                        "vectorization_method":"ftidf_vectorizer"
-                    }
-                else:
-                    self.infos={
-                        "vectorization_method":"model_embedding"
-                    }
-            except Exception as ex:
-                ASCIIColors.error("Couldn't embed the text, so trying to use tfidf instead.")
-                trace_exception(ex)
-                self.infos={
-                    "vectorization_method":"ftidf_vectorizer"
-                }
+    def __init__(
+        self,
+        vectorization_method,  # supported: "model_embedding" or "ftidf_vectorizer"
+        model=None,            # needed when using model_embedding
+        database_path=None,
+        save_db=False,
+        visualize_data_at_startup=False,
+        visualize_data_at_add_file=False,
+        visualize_data_at_generate=False,
+        data_visualization_method="PCA",
+        database_dict=None
+    ):
+        self.vectorization_method = vectorization_method
+        self.save_db = save_db
+        self.model = model
+        self.database_file = database_path
+        self.visualize_data_at_startup=visualize_data_at_startup
+        self.visualize_data_at_add_file=visualize_data_at_add_file
+        self.visualize_data_at_generate=visualize_data_at_generate
+        self.data_visualization_method = data_visualization_method
+        if database_dict is not None:
+            self.chunks = []
+            self.embeddings = database_dict["embeddings"]
+            self.texts = database_dict["texts"]
+            self.infos = database_dict["infos"]
+            self.ready = True
+            self.vectorizer = database_dict["vectorizer"]
+        else:
+            self.chunks = []
+            self.embeddings = {}
+            self.texts = {}
+            self.ready = False
+            self.vectorizer = None
+            if vectorization_method=="model_embedding":
+                try:
+                    if not self.model or self.model.embed("hi")==None: # test
+                        self.vectorization_method="ftidf_vectorizer"
+                        self.infos={
+                            "vectorization_method":"ftidf_vectorizer"
+                        }
+                    else:
+                        self.infos={
+                            "vectorization_method":"model_embedding"
+                        }
+                except Exception as ex:
+                    ASCIIColors.error("Couldn't embed the text, so trying to use tfidf instead.")
+                    trace_exception(ex)
+                    self.infos={
+                        "vectorization_method":"ftidf_vectorizer"
+                    }
         # Load previous state from the JSON file
-        if self.personality_config.save_db:
+        if self.save_db:
             if Path(self.database_file).exists():
                 ASCIIColors.success(f"Database file found : {self.database_file}")
                 self.load_from_json()
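`TFIDFLoader` exists so a fitted tfidf model can cross a JSON boundary. A hedged round-trip sketch, assuming an older scikit-learn where `get_feature_names()` still exists (recent releases renamed it to `get_feature_names_out()`, which the class as written does not handle), and noting that `get_params()` may include values that need massaging before `json.dump`:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "the dog barked at the cat"]
vec = TfidfVectorizer()
vec.fit(docs)

state = TFIDFLoader.create_dict_from_vectorizer(vec)       # vocabulary, idf values, params
restored = TFIDFLoader.create_vectorizer_from_dict(state)  # rebuilt without refitting
print(restored.transform(["the cat"]).toarray())
```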
@@ -56,7 +90,7 @@ class TextVectorizer:
             ASCIIColors.info(f"No database file found : {self.database_file}")
-    def show_document(self, query_text=None):
+    def show_document(self, query_text=None, save_fig_path=None, show_interactive_form=False):
         import textwrap
         import seaborn as sns
         import matplotlib.pyplot as plt
@@ -66,9 +100,8 @@ class TextVectorizer:
         from sklearn.manifold import TSNE
         from sklearn.decomposition import PCA
-        import torch
-        if self.personality_config.data_visualization_method=="PCA":
+        if self.data_visualization_method=="PCA":
             use_pca = True
         else:
             use_pca = False
@@ -80,6 +113,7 @@ class TextVectorizer:
         texts = list(self.texts.values())
         embeddings = self.embeddings
         emb = list(embeddings.values())
+        ref = list(embeddings.keys())
         if len(emb)>=2:
             # Normalize embeddings
             emb = np.vstack(emb)
@@ -94,6 +128,7 @@ class TextVectorizer:
             # Combine the query embedding with the document embeddings
             combined_embeddings = np.vstack((normalized_embeddings, query_normalized_embedding))
+            ref.append("Query_chunk_0")
         else:
             # Combine the query embedding with the document embeddings
             combined_embeddings = normalized_embeddings
@@ -113,13 +148,19 @@ class TextVectorizer:
             tsne = TSNE(n_components=2, perplexity=perplexity)
             embeddings_2d = tsne.fit_transform(combined_embeddings)
+        # Create a dictionary to map document paths to colors
+        document_path_colors = {}
+        for i, path in enumerate(ref):
+            document_path = "_".join(path.split("_")[:-1]) # Extract the document path (excluding chunk and chunk number)
+            if document_path not in document_path_colors:
+                # Assign a new color to the document path if it's not in the dictionary
+                document_path_colors[document_path] = sns.color_palette("hls", len(document_path_colors) + 1)[-1]
+        # Generate a list of colors for each data point based on the document path
+        point_colors = [document_path_colors["_".join(path.split("_")[:-1])] for path in ref]
         # Create a scatter plot using Seaborn
-        if query_text is not None:
-            sns.scatterplot(x=embeddings_2d[:-1, 0], y=embeddings_2d[:-1, 1]) # Plot document embeddings
-            plt.scatter(embeddings_2d[-1, 0], embeddings_2d[-1, 1], color='red') # Plot query embedding
-        else:
-            sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1]) # Plot document embeddings
+        sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=point_colors) # Plot document embeddings
         # Add labels to the scatter plot
         for i, (x, y) in enumerate(embeddings_2d[:-1]):
             plt.text(x, y, str(i), fontsize=8)
@@ -176,11 +217,12 @@ class TextVectorizer:
         # Connect the click event handler to the figure
         plt.gcf().canvas.mpl_connect("button_press_event", on_click)
-        plt.savefig(self.lollms_paths.personal_uploads_path / self.personality.personality_folder_name/ "db.png")
-        plt.show()
+        if save_fig_path:
+            plt.savefig(save_fig_path)
+        if show_interactive_form:
+            plt.show()
-    def index_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
+    def add_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
         if document_id in self.embeddings and not force_vectorize:
             print(f"Document {document_id} already exists. Skipping vectorization.")
             return
@@ -188,15 +230,13 @@ class TextVectorizer:
         # Split tokens into sentences
         sentences = text.split('. ')
         def remove_empty_sentences(sentences):
-            return [sentence for sentence in sentences if sentence.strip() != '']
+            return [self.model.tokenize(sentence) for sentence in sentences if sentence.strip() != '']
         sentences = remove_empty_sentences(sentences)
         # Generate chunks with overlap and sentence boundaries
         chunks = []
         current_chunk = []
         for i in range(len(sentences)):
-            sentence = sentences[i]
-            sentence_tokens = self.model.tokenize(sentence)
+            sentence_tokens = sentences[i]
             # ASCIIColors.yellow(len(sentence_tokens))
             if len(current_chunk) + len(sentence_tokens) <= chunk_size:
@@ -204,45 +244,51 @@ class TextVectorizer:
             else:
                 if current_chunk:
                     chunks.append(current_chunk)
-                while len(sentence_tokens)>chunk_size:
-                    current_chunk = sentence_tokens[0:chunk_size]
-                    sentence_tokens = sentence_tokens[chunk_size:]
-                    chunks.append(current_chunk)
-                current_chunk = sentence_tokens
+                current_chunk=[]
+                for j in reversed(range(overlap_size)):
+                    current_chunk.extend(sentences[i-j-1])
+                current_chunk.extend(sentence_tokens)
         if current_chunk:
             chunks.append(current_chunk)
-        if self.personality_config.vectorization_method=="ftidf_vectorizer":
-            from sklearn.feature_extraction.text import TfidfVectorizer
+        for i, chunk_text in enumerate(chunks):
+            chunk_id = f"{document_id}_chunk_{i + 1}"
+            chunk_dict = {
+                "chunk_id": chunk_id,
+                "chunk_text": chunk_text
+            }
+            self.chunks.append(chunk_dict)
+    def index(self):
+        if self.vectorization_method=="ftidf_vectorizer":
             self.vectorizer = TfidfVectorizer()
-            #if self.personality.config.debug:
+            #if self.debug:
             #    ASCIIColors.yellow(','.join([len(chunk) for chunk in chunks]))
             data=[]
-            for chunk in chunks:
+            for chunk in self.chunks:
                 try:
-                    data.append(self.model.detokenize(chunk).replace("<s>","").replace("</s>","") )
+                    data.append(self.model.detokenize(chunk["chunk_text"]).replace("<s>","").replace("</s>","") )
                 except Exception as ex:
                     print("oups")
             self.vectorizer.fit(data)
         self.embeddings = {}
         # Generate embeddings for each chunk
-        for i, chunk in enumerate(chunks):
+        for i, chunk in enumerate(self.chunks):
             # Store chunk ID, embedding, and original text
-            chunk_id = f"{document_id}_chunk_{i + 1}"
+            chunk_id = chunk["chunk_id"]
+            chunk_text = chunk["chunk_text"]
             try:
-                self.texts[chunk_id] = self.model.detokenize(chunk[:chunk_size])
-                if self.personality_config.vectorization_method=="ftidf_vectorizer":
+                self.texts[chunk_id] = self.model.detokenize(chunk_text)
+                if self.vectorization_method=="ftidf_vectorizer":
                     self.embeddings[chunk_id] = self.vectorizer.transform([self.texts[chunk_id]]).toarray()
                 else:
                     self.embeddings[chunk_id] = self.model.embed(self.texts[chunk_id])
             except Exception as ex:
                 print("oups")
-        if self.personality_config.save_db:
+        if self.save_db:
             self.save_to_json()
         self.ready = True
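The rewritten chunker works on token lists: sentences are tokenized up front, packed into chunks of at most `chunk_size` tokens, and each new chunk is seeded with the previous `overlap_size` sentences. A standalone sketch of the same packing logic, with a whitespace tokenizer standing in for the model's tokenizer and an added bounds check the original omits:

```python
def chunk_sentences(sentences, chunk_size=512, overlap_size=1, tokenize=str.split):
    """Pack tokenized sentences into chunks of at most chunk_size tokens,
    seeding each new chunk with the previous overlap_size sentences."""
    tokenized = [tokenize(s) for s in sentences if s.strip()]
    chunks, current = [], []
    for i, sentence_tokens in enumerate(tokenized):
        if len(current) + len(sentence_tokens) <= chunk_size:
            current.extend(sentence_tokens)
        else:
            if current:
                chunks.append(current)
            current = []
            for j in reversed(range(overlap_size)):   # carry sentence-level overlap
                if i - j - 1 >= 0:
                    current.extend(tokenized[i - j - 1])
            current.extend(sentence_tokens)
    if current:
        chunks.append(current)
    return chunks

print(len(chunk_sentences(["word word word word word"] * 40, chunk_size=20, overlap_size=1)))
```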
@@ -252,10 +298,14 @@ class TextVectorizer:
     def embed_query(self, query_text):
         # Generate query embedding
-        if self.personality_config.vectorization_method=="ftidf_vectorizer":
+        if self.vectorization_method=="ftidf_vectorizer":
             query_embedding = self.vectorizer.transform([query_text]).toarray()
         else:
             query_embedding = self.model.embed(query_text)
+            if query_embedding is None:
+                ASCIIColors.warning("The model doesn't implement embedding extraction")
+                self.vectorization_method="ftidf_vectorizer"
+                query_embedding = self.vectorizer.transform([query_text]).toarray()
         return query_embedding
@@ -277,6 +327,18 @@ class TextVectorizer:
         return texts, sorted_similarities
+    def toJson(self):
+        state = {
+            "embeddings": {str(k): v.tolist() if type(v)!=list else v for k, v in self.embeddings.items() },
+            "texts": self.texts,
+            "infos": self.infos,
+            "vectorizer": TFIDFLoader.create_dict_from_vectorizer(self.vectorizer) if self.vectorization_method=="ftidf_vectorizer" else None
+        }
+        return state
+    def setVectorizer(self, vectorizer_dict:dict):
+        self.vectorizer = TFIDFLoader.create_vectorizer_from_dict(vectorizer_dict)
     def save_to_json(self):
         state = {
             "embeddings": {str(k): v.tolist() if type(v)!=list else v for k, v in self.embeddings.items() },
@@ -295,7 +357,7 @@ class TextVectorizer:
         self.texts = state["texts"]
         self.infos= state["infos"]
         self.ready = True
-        if self.personality_config.vectorization_method=="ftidf_vectorizer":
+        if self.vectorization_method=="ftidf_vectorizer":
             from sklearn.feature_extraction.text import TfidfVectorizer
             data = list(self.texts.values())
             if len(data)>0:
@@ -304,11 +366,13 @@ class TextVectorizer:
                 self.embeddings={}
                 for k,v in self.texts.items():
                     self.embeddings[k]= self.vectorizer.transform([v]).toarray()
     def clear_database(self):
         self.vectorizer=None
         self.embeddings = {}
         self.texts={}
-        if self.personality_config.save_db:
+        if self.save_db:
             self.save_to_json()

View File

@@ -8,4 +8,9 @@ simple-websocket
 eventlet
 wget
 setuptools
 requests
+matplotlib
+seaborn
+mplcursors
+scikit-learn

View File

@@ -26,7 +26,7 @@ def get_all_files(path):
 setuptools.setup(
     name="lollms",
-    version="2.1.56",
+    version="2.1.59",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",