Repaired file vectorization

Saifeddine ALOUI 2023-08-25 02:12:43 +02:00
parent f348e02fb3
commit 4908ed7e6d
2 changed files with 15 additions and 12 deletions

View File

@@ -181,7 +181,7 @@ class DocumentDecomposer:
         return sentences

     @staticmethod
-    def decompose_document(text, max_chunk_size):
+    def decompose_document(text, max_chunk_size, tokenize):
         cleaned_text = DocumentDecomposer.clean_text(text)
         paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)
@@ -189,7 +189,7 @@ class DocumentDecomposer:
         clean_chunks = []
         current_chunk = ""  # To store the current chunk being built
+        l=0
         for paragraph in paragraphs:
             # Tokenize the paragraph into sentences
             sentences = DocumentDecomposer.tokenize_sentences(paragraph)
@@ -197,17 +197,20 @@ class DocumentDecomposer:
             for sentence in sentences:
                 # If adding the current sentence to the chunk exceeds the max_chunk_size,
                 # we add the current chunk to the list of clean chunks and start a new chunk
-                if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
+                nb_tokens = len(tokenize(sentence))
+                if l + nb_tokens + 1 > max_chunk_size:
                     clean_chunks.append(current_chunk.strip())
                     current_chunk = ""
+                    l=0
                 # Add the current sentence to the chunk
                 current_chunk += sentence + " "
+                l += nb_tokens
             # Add the remaining chunk from the paragraph to the clean_chunks
             if current_chunk:
                 clean_chunks.append(current_chunk.strip())
                 current_chunk = ""
         return clean_chunks
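The change above switches the chunking budget from characters to model tokens: instead of comparing len(current_chunk) + len(sentence) against max_chunk_size, the decomposer now asks a caller-supplied tokenize callable for each sentence's token count and keeps a running total in l. Below is a minimal, standalone sketch of that loop; the chunk_sentences name and the whitespace-split default tokenizer are this sketch's own stand-ins, not the library's API.

def chunk_sentences(sentences, max_chunk_size, tokenize=lambda s: s.split()):
    # `l` tracks how many tokens the chunk being built already holds;
    # a sentence that would overflow the budget flushes the chunk first.
    clean_chunks = []
    current_chunk = ""
    l = 0
    for sentence in sentences:
        nb_tokens = len(tokenize(sentence))
        if l + nb_tokens + 1 > max_chunk_size:
            clean_chunks.append(current_chunk.strip())
            current_chunk = ""
            l = 0
        # the sentence is always appended, even if it alone exceeds the budget
        current_chunk += sentence + " "
        l += nb_tokens
    if current_chunk:
        clean_chunks.append(current_chunk.strip())
    return clean_chunks

print(chunk_sentences(["one two three", "four five", "six"], max_chunk_size=4))
# ['one two three', 'four five six']

As in the original character-based version, a single sentence longer than max_chunk_size still produces a chunk that exceeds the budget; the flush only happens before the oversized sentence is appended.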
@@ -419,7 +422,7 @@ class TextVectorizer:
         if document_id in self.embeddings and not force_vectorize:
             print(f"Document {document_id} already exists. Skipping vectorization.")
             return
-        chunks_text = DocumentDecomposer.decompose_document(text, chunk_size)
+        chunks_text = DocumentDecomposer.decompose_document(text, chunk_size, self.model.tokenize)
         self.chunks = []
         for i, chunk in enumerate(chunks_text):
             chunk_id = f"{document_id}_chunk_{i + 1}"
@@ -598,8 +601,6 @@ class GenericDataLoader:
     def read_file(file_path:Path):
         if file_path.suffix ==".pdf":
             return GenericDataLoader.read_pdf_file(file_path)
-        elif file_path.suffix == ".txt":
-            return GenericDataLoader.read_text_file(file_path)
         elif file_path.suffix == ".docx":
             return GenericDataLoader.read_docx_file(file_path)
         elif file_path.suffix == ".json":
@@ -608,10 +609,12 @@ class GenericDataLoader:
             return GenericDataLoader.read_html_file(file_path)
         elif file_path.suffix == ".pptx":
             return GenericDataLoader.read_pptx_file(file_path)
+        if file_path.suffix in [".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]:
+            return GenericDataLoader.read_text_file(file_path)
         else:
             raise ValueError("Unknown file type")

     def get_supported_file_types():
-        return ["pdf", "txt", "docx", "json", "html", "pptx"]
+        return ["pdf", "txt", "docx", "json", "html", "pptx",".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]

     @staticmethod
     def read_pdf_file(file_path):
         try:
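The loader side of the commit drops the dedicated .txt branch and instead routes a long list of plain-text and source-code suffixes to read_text_file. A small routing sketch of the resulting dispatch order (route and the reader labels are this sketch's own names, not the library's methods):

from pathlib import Path

# Suffixes routed to the plain-text reader after this commit (copied from the diff).
TEXT_SUFFIXES = {
    ".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql",
    ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh",
    ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym",
    ".ini", ".inf", ".map", ".bat",
}

def route(file_path: Path) -> str:
    if file_path.suffix == ".pdf":
        return "pdf reader"
    if file_path.suffix in {".docx", ".json", ".html", ".pptx"}:
        return "dedicated reader"  # these branches return before the text fallback
    if file_path.suffix in TEXT_SUFFIXES:
        return "plain-text reader"
    raise ValueError("Unknown file type")

print(route(Path("notes.md")))     # plain-text reader
print(route(Path("report.json")))  # dedicated reader, never reaches the text list

Because the existing elif chain returns first, suffixes such as .json and .html that also appear in the new list keep their dedicated readers; the list only catches formats nothing earlier claimed. Note too that get_supported_file_types now mixes bare names ("pdf") with dotted suffixes (".md").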

View File

@@ -26,7 +26,7 @@ def get_all_files(path):
 setuptools.setup(
     name="lollms",
-    version="4.1.0",
+    version="4.1.5",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",