commit d1a93718de
parent fdbb1387a9
Author: Saifeddine ALOUI
Date: 2023-09-15 01:21:52 +02:00
3 changed files with 131 additions and 114 deletions

View File

@@ -282,9 +282,34 @@ Date: {{date}}
         self._assets_list = contents
         return config

+    def remove_file(self, path, callback=None):
+        try:
+            self.files.remove(path)
+            Path(path).unlink()
+            if len(self.files)>0:
+                try:
+                    self.vectorizer.remove_document(path)
+                    if callback is not None:
+                        callback("File removed successfully",MSG_TYPE.MSG_TYPE_INFO)
+                    return True
+                except ValueError as ve:
+                    ASCIIColors.error(f"Unsupported file format. Supported formats are {GenericDataLoader.get_supported_file_types()}")
+                    return False
+            else:
+                self.vectorizer = None
+        except Exception as ex:
+            ASCIIColors.warning(f"Couldn't remove the file {path}")
+
+    def remove_all_files(self, callback=None):
+        for file in self.files:
+            try:
+                Path(file).unlink()
+            except Exception as ex:
+                ASCIIColors.warning(f"Couldn't remove the file {file}")
+        self.files=[]
+        self.vectorizer = None
+        return True
+
     def add_file(self, path, callback=None):
         self.files.append(path)
         db_path = self.lollms_paths.personal_databases_path / "personalities" / self.name / "db.json"
         db_path.parent.mkdir(parents=True, exist_ok=True)
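For orientation, a minimal usage sketch of the new file-management methods. It assumes `personality` is an already-loaded AIPersonality instance and that `doc_path` points to a real file; both names are placeholders, not part of this commit.

    from pathlib import Path

    doc_path = Path("notes.txt")       # placeholder document
    personality.add_file(doc_path)     # tracks and vectorizes the file
    personality.remove_file(doc_path)  # unlinks the file and drops its chunks from the vectorizer
    personality.remove_all_files()     # unlinks every tracked file and resets the vectorizer

Note that remove_file also deletes the file from disk, so callers should only pass paths they own.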

View File

@@ -7,6 +7,19 @@ import re
 import subprocess
 import gc

+class NumpyEncoderDecoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return {'__numpy_array__': True, 'data': obj.tolist()}
+        return super(NumpyEncoderDecoder, self).default(obj)
+
+    @staticmethod
+    def as_numpy_array(dct):
+        if '__numpy_array__' in dct:
+            return np.array(dct['data'])
+        return dct
+
 def git_pull(folder_path):
     try:
         # Change the current working directory to the desired folder
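A quick round-trip sketch of how the encoder/decoder pair added above is meant to be used with the standard json module; the array values are purely illustrative.

    import json
    import numpy as np

    payload = {"embeddings": np.array([0.1, 0.2, 0.3])}
    # Arrays are serialized as {"__numpy_array__": true, "data": [...]}
    serialized = json.dumps(payload, cls=NumpyEncoderDecoder)
    # object_hook turns those markers back into numpy arrays on load
    restored = json.loads(serialized, object_hook=NumpyEncoderDecoder.as_numpy_array)
    assert isinstance(restored["embeddings"], np.ndarray)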
@@ -192,14 +205,14 @@ class DocumentDecomposer:
         return sentences

     @staticmethod
-    def decompose_document(text, max_chunk_size, tokenize):
+    def decompose_document(text, max_chunk_size, overlap_size, tokenize, detokenize):
         cleaned_text = DocumentDecomposer.clean_text(text)
         paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)

         # List to store the final clean chunks
         clean_chunks = []

-        current_chunk = "" # To store the current chunk being built
+        current_chunk = [] # To store the current chunk being built
         l=0
         for paragraph in paragraphs:
             # Tokenize the paragraph into sentences
@@ -208,19 +221,33 @@ class DocumentDecomposer:
             for sentence in sentences:
                 # If adding the current sentence to the chunk exceeds the max_chunk_size,
                 # we add the current chunk to the list of clean chunks and start a new chunk
-                nb_tokens = len(tokenize(sentence))
-                if l + nb_tokens + 1 > max_chunk_size:
-                    clean_chunks.append(current_chunk.strip())
-                    current_chunk = ""
-                    l=0
-                # Add the current sentence to the chunk
-                current_chunk += sentence + " "
-                l += nb_tokens
+                tokens = tokenize(sentence)
+                nb_tokens = len(tokens)
+                if nb_tokens>max_chunk_size:
+                    while nb_tokens>max_chunk_size:
+                        current_chunk += tokens[:max_chunk_size-l-1]
+                        clean_chunks.append(current_chunk)
+                        tokens = tokens[max_chunk_size-l-1-overlap_size:]
+                        nb_tokens -= max_chunk_size-l-1-overlap_size
+                        l=0
+                        current_chunk = current_chunk[-overlap_size:]
+                else:
+                    if l + nb_tokens + 1 > max_chunk_size:
+                        clean_chunks.append(current_chunk)
+                        if overlap_size==0:
+                            current_chunk = []
+                        else:
+                            current_chunk = current_chunk[-overlap_size:]
+                        l=0
+                    # Add the current sentence to the chunk
+                    current_chunk += tokens
+                    l += nb_tokens
             # Add the remaining chunk from the paragraph to the clean_chunks
             if current_chunk:
-                clean_chunks.append(current_chunk.strip())
+                clean_chunks.append(current_chunk)
                 current_chunk = ""
         return clean_chunks
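A small sketch of the overlap-aware chunking above, using plain whitespace splitting in place of a model binding's tokenize/detokenize. The helpers clean_text and split_into_paragraphs are not shown in this hunk, so the exact sentence splitting is an assumption; the point is the call shape and the token-list chunks it returns.

    tokenize = lambda s: s.split()            # stand-in for model.tokenize
    detokenize = lambda toks: " ".join(toks)  # stand-in for model.detokenize

    text = "First sentence here. Second sentence follows. Third one closes the paragraph."
    # Arguments: text, max_chunk_size, overlap_size, tokenize, detokenize
    chunks = DocumentDecomposer.decompose_document(text, 10, 2, tokenize, detokenize)

    # Each chunk is a list of tokens; consecutive chunks are meant to share the last overlap_size tokens
    for chunk in chunks:
        print(detokenize(chunk))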
@@ -251,16 +278,12 @@ class TextVectorizer:
         self.data_visualization_method = data_visualization_method

         if database_dict is not None:
-            self.chunks = []
-            self.embeddings = database_dict["embeddings"]
-            self.texts = database_dict["text"]
+            self.chunks = database_dict["chunks"]
+            self.vectorizer = database_dict["vectorizer"]
             self.infos = database_dict["infos"]
             self.ready = True
-            self.vectorizer = database_dict["vectorizer"]
         else:
-            self.chunks = []
-            self.embeddings = {}
-            self.texts = {}
+            self.chunks = {}
             self.ready = False
             self.vectorizer = None
@@ -313,8 +336,8 @@ class TextVectorizer:
             print("Showing pca representation :")
         else:
             print("Showing t-sne representation :")
-        texts = list(self.texts.values())
-        embeddings = self.embeddings
+        embeddings = {key:chunk["embeddings"] for key, chunk in self.chunks.items()}
         emb = list(embeddings.values())
         ref = list(embeddings.keys())
         if len(emb)>=2:
@@ -329,11 +352,11 @@ class TextVectorizer:
             query_embedding = query_embedding.detach().squeeze().numpy()
             query_normalized_embedding = query_embedding / np.linalg.norm(query_embedding)

-            # Combine the query embedding with the document embeddings
+            # Combine the query embeddings with the document embeddings
             combined_embeddings = np.vstack((normalized_embeddings, query_normalized_embedding))
             ref.append("Quey_chunk_0")
         else:
-            # Combine the query embedding with the document embeddings
+            # Combine the query embeddings with the document embeddings
             combined_embeddings = normalized_embeddings

         if use_pca:
@@ -383,7 +406,7 @@ class TextVectorizer:
         def on_hover(sel):
             index = sel.target.index
             if index > 0:
-                text = texts[index]
+                text = self.chunks[index]["chunk_text"]
                 wrapped_text = textwrap.fill(text, width=50) # Wrap the text into multiple lines
                 sel.annotation.set_text(f"Index: {index}\nText:\n{wrapped_text}")
             else:
@@ -395,7 +418,7 @@ class TextVectorizer:
             x, y = event.xdata, event.ydata
             distances = ((embeddings_2d[:, 0] - x) ** 2 + (embeddings_2d[:, 1] - y) ** 2)
             index = distances.argmin()
-            text = texts[index] if index < len(texts) else query_text
+            text = self.chunks[index]["chunk_text"] if index < len(self.chunks) else query_text

             # Open a new Tkinter window with the content of the text
             root = Tk()
@@ -429,66 +452,41 @@ class TextVectorizer:
         if show_interactive_form:
             plt.show()

-    def add_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
-        if document_id in self.embeddings and not force_vectorize:
-            print(f"Document {document_id} already exists. Skipping vectorization.")
+    def file_exists(self, document_name:str)->bool:
+        # Loop through the list of dictionaries
+        for dictionary in self.chunks:
+            if 'document_name' in dictionary and dictionary['document_name'] == document_name:
+                # If the document_name is found in the current dictionary, set the flag to True and break the loop
+                document_name_found = True
+                return True
+        return False
+
+    def remove_document(self, document_name:str):
+        for dictionary in self.chunks:
+            if 'document_name' in dictionary and dictionary['document_name'] == document_name:
+                # If the document_name is found in the current dictionary, set the flag to True and break the loop
+                self.chunks.remove(dictionary)
+                return True
+        return False
+
+    def add_document(self, document_name:Path, text:str, chunk_size: int, overlap_size:int, force_vectorize=False):
+        if self.file_exists(document_name) and not force_vectorize:
+            print(f"Document {document_name} already exists. Skipping vectorization.")
             return
-        chunks_text = DocumentDecomposer.decompose_document(text, chunk_size, self.model.tokenize)
-        self.chunks = []
+        chunks_text = DocumentDecomposer.decompose_document(text, chunk_size, overlap_size, self.model.tokenize, self.model.detokenize)
         for i, chunk in enumerate(chunks_text):
-            chunk_id = f"{document_id}_chunk_{i + 1}"
+            chunk_id = f"{document_name}_chunk_{i + 1}"
             chunk_dict = {
-                "chunk_id": chunk_id,
-                "chunk_text": self.model.tokenize(chunk)
+                "document_name": document_name,
+                "chunk_index": i+1,
+                "chunk_text":self.model.detokenize(chunk),
+                "chunk_tokens": chunk,
+                "embeddings":[]
             }
-            self.chunks.append(chunk_dict)
-        """
-        # Split tokens into sentences
-        sentences = text.split('. ')
-        sentences = [sentence.strip() + '. ' if not sentence.endswith('.') else sentence for sentence in sentences]
-        import regex as re
-        def remove_special_characters(input_string):
-            # Define a regex pattern to match non-alphanumeric characters and also keep Arabic and Chinese characters
-            pattern = r'[^\p{L}\p{N}\p{Zs}\u0600-\u06FF\u4E00-\u9FFF]+'
-            # Remove special characters using the regex pattern
-            cleaned_string = re.sub(pattern, '', input_string)
-            return cleaned_string
-        def remove_empty_sentences(sentences):
-            return [self.model.tokenize(remove_special_characters(sentence)) for sentence in sentences if sentence.strip() != '']
-        sentences = remove_empty_sentences(sentences)
-        # Generate chunks with overlap and sentence boundaries
-        chunks = []
-        current_chunk = []
-        for i in range(len(sentences)):
-            sentence_tokens = sentences[i]
-            # ASCIIColors.yellow(len(sentence_tokens))
-            if len(current_chunk) + len(sentence_tokens) <= chunk_size:
-                current_chunk.extend(sentence_tokens)
-            else:
-                if current_chunk:
-                    print(f"Chunk size:{len(current_chunk)}")
-                    chunks.append(current_chunk)
-                    current_chunk=[]
-                    for j in reversed(range(overlap_size)):
-                        current_chunk.extend(sentences[i-j-1])
-                    current_chunk.extend(sentence_tokens)
-        if current_chunk:
-            for i, chunk_text in enumerate(chunks):
-                chunk_id = f"{document_id}_chunk_{i + 1}"
-                chunk_dict = {
-                    "chunk_id": chunk_id,
-                    "chunk_text": chunk_text
-                }
-                self.chunks.append(chunk_dict)
-        """
+            self.chunks[chunk_id] = chunk_dict

     def index(self):
         if self.vectorization_method=="ftidf_vectorizer":
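After this change, everything that used to live in self.texts and self.embeddings sits inside a single self.chunks dict keyed by chunk id. A sketch of what one entry looks like once add_document and index have run; the path, text, and token ids are illustrative, not taken from the commit.

    from pathlib import Path

    chunk_id = "my_doc.txt_chunk_1"
    chunk_entry = {
        "document_name": Path("my_doc.txt"),
        "chunk_index": 1,
        "chunk_text": "First part of the document",
        "chunk_tokens": [312, 7, 1045],
        "embeddings": [],   # filled by index() with a TF-IDF row or a model embedding
    }
    chunks = {chunk_id: chunk_entry}   # the shape TextVectorizer.chunks now takes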
@@ -496,25 +494,21 @@ class TextVectorizer:
             #if self.debug:
             #    ASCIIColors.yellow(','.join([len(chunk) for chunk in chunks]))
             data=[]
-            for chunk in self.chunks:
+            for k,chunk in self.chunks.items():
                 try:
-                    data.append(self.model.detokenize(chunk["chunk_text"]).replace("<s>","").replace("</s>","") )
+                    data.append(chunk["chunk_text"])
                 except Exception as ex:
                     print("oups")
             self.vectorizer.fit(data)
-        self.embeddings = {}

         # Generate embeddings for each chunk
-        for i, chunk in enumerate(self.chunks):
-            # Store chunk ID, embedding, and original text
-            chunk_id = chunk["chunk_id"]
-            chunk_text = chunk["chunk_text"]
+        for chunk_id, chunk in self.chunks.items():
+            # Store chunk ID, embeddings, and original text
             try:
-                self.texts[chunk_id] = self.model.detokenize(chunk_text)
                 if self.vectorization_method=="ftidf_vectorizer":
-                    self.embeddings[chunk_id] = self.vectorizer.transform([self.texts[chunk_id]]).toarray()
+                    chunk["embeddings"] = self.vectorizer.transform([chunk["chunk_text"]]).toarray()
                 else:
-                    self.embeddings[chunk_id] = self.model.embed(self.texts[chunk_id])
+                    chunk["embeddings"] = self.model.embed(chunk["chunk_text"])
             except Exception as ex:
                 print("oups")
@@ -527,13 +521,13 @@ class TextVectorizer:
     def embed_query(self, query_text):
-        # Generate query embedding
+        # Generate query embeddings
         if self.vectorization_method=="ftidf_vectorizer":
             query_embedding = self.vectorizer.transform([query_text]).toarray()
         else:
             query_embedding = self.model.embed(query_text)
             if query_embedding is None:
-                ASCIIColors.warning("The model doesn't implement embedding extraction")
+                ASCIIColors.warning("The model doesn't implement embeddings extraction")
                 self.vectorization_method="ftidf_vectorizer"
                 query_embedding = self.vectorizer.transform([query_text]).toarray()
@@ -542,15 +536,15 @@ class TextVectorizer:
     def recover_text(self, query_embedding, top_k=1):
         from sklearn.metrics.pairwise import cosine_similarity

         similarities = {}
-        for chunk_id, chunk_embedding in self.embeddings.items():
-            similarity = cosine_similarity(query_embedding, chunk_embedding)
+        for chunk_id, chunk in self.chunks.items():
+            similarity = cosine_similarity(query_embedding, chunk["embeddings"])
             similarities[chunk_id] = similarity

         # Sort the similarities and retrieve the top-k most similar embeddings
         sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_k]

         # Retrieve the original text associated with the most similar embeddings
-        texts = [self.texts[chunk_id] for chunk_id, _ in sorted_similarities]
+        texts = [self.chunks[chunk_id]["chunk_text"] for chunk_id, _ in sorted_similarities]

         if self.visualize_data_at_generate:
             self.show_document()
@@ -559,8 +553,7 @@ class TextVectorizer:
     def toJson(self):
         state = {
-            "embeddings": {str(k): v.tolist() if type(v)!=list else v for k, v in self.embeddings.items() },
-            "texts": self.texts,
+            "chunks": self.chunks,
             "infos": self.infos,
             "vectorizer": TFIDFLoader.create_vectorizer_from_dict(self.vectorizer) if self.vectorization_method=="ftidf_vectorizer" else None
         }
@@ -571,38 +564,37 @@ class TextVectorizer:
     def save_to_json(self):
         state = {
-            "embeddings": {str(k): v.tolist() if type(v)!=list else v for k, v in self.embeddings.items() },
-            "texts": self.texts,
-            "infos": self.infos
+            "chunks": self.chunks,
+            "infos": self.infos,
+            "vectorizer": TFIDFLoader.create_vectorizer_from_dict(self.vectorizer) if self.vectorization_method=="ftidf_vectorizer" else None
         }
         with open(self.database_file, "w") as f:
-            json.dump(state, f)
+            json.dump(state, f, cls=NumpyEncoderDecoder, indent=4)

     def load_from_json(self):
         ASCIIColors.info("Loading vectorized documents")
         with open(self.database_file, "r") as f:
-            state = json.load(f)
-            self.embeddings = {k: v for k, v in state["embeddings"].items()}
-            self.texts = state["texts"]
-            self.infos= state["infos"]
+            database = json.load(f, object_hook=NumpyEncoderDecoder.as_numpy_array)
+            self.chunks = database["chunks"]
+            self.infos= database["infos"]
             self.ready = True
         if self.vectorization_method=="ftidf_vectorizer":
             from sklearn.feature_extraction.text import TfidfVectorizer
-            data = list(self.texts.values())
+            data = [c["chunk_text"] for k,c in self.chunks.items()]
             if len(data)>0:
                 self.vectorizer = TfidfVectorizer()
                 self.vectorizer.fit(data)
                 self.embeddings={}
-                for k,v in self.texts.items():
-                    self.embeddings[k]= self.vectorizer.transform([v]).toarray()
+                for k,chunk in self.chunks.items():
+                    chunk["embeddings"][k]= self.vectorizer.transform([chunk["embeddings"]]).toarray()

     def clear_database(self):
         self.ready = False
         self.vectorizer=None
-        self.embeddings = {}
-        self.texts={}
+        self.chunks = {}
+        self.infos={}
         if self.save_db:
             self.save_to_json()
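The persistence path now serializes the whole chunks dict through NumpyEncoderDecoder, so embeddings come back as numpy arrays after a reload. A minimal sketch of the intended save/load cycle, assuming `vectorizer` is an already-configured TextVectorizer with a valid database_file; the document path and text are placeholders.

    from pathlib import Path

    vectorizer.add_document(Path("my_doc.txt"), "Some text to index.", chunk_size=512, overlap_size=20)
    vectorizer.index()           # computes chunk["embeddings"] for every chunk
    vectorizer.save_to_json()    # writes chunks, infos and the TF-IDF state through NumpyEncoderDecoder

    # later, on a fresh instance pointed at the same database_file
    vectorizer.load_from_json()  # restores chunks; stored arrays are decoded back to numpy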
@@ -620,12 +612,12 @@ class GenericDataLoader:
             return GenericDataLoader.read_html_file(file_path)
         elif file_path.suffix == ".pptx":
             return GenericDataLoader.read_pptx_file(file_path)
-        if file_path.suffix in [".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]:
+        if file_path.suffix in [".txt", ".rtf", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]:
             return GenericDataLoader.read_text_file(file_path)
         else:
             raise ValueError("Unknown file type")

     def get_supported_file_types():
-        return ["pdf", "txt", "docx", "json", "html", "pptx",".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]
+        return ["pdf", "txt", "docx", "json", "html", "pptx",".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat", ".rtf"]

     @staticmethod
     def read_pdf_file(file_path):
         try:
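With .rtf added to the plain-text route, an RTF document can be read through the same loader as any other text-like file. A short sketch using only the helpers named above; the file name is a placeholder, and the plain-text reader returns the raw RTF markup as-is.

    from pathlib import Path

    path = Path("report.rtf")   # placeholder file
    if path.suffix in GenericDataLoader.get_supported_file_types():
        text = GenericDataLoader.read_text_file(path)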

View File

@@ -26,7 +26,7 @@ def get_all_files(path):
 setuptools.setup(
     name="lollms",
-    version="5.3.0",
+    version="5.5.0",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",