Repaired file vectorization
commit 4908ed7e6d (parent f348e02fb3)
@@ -181,7 +181,7 @@ class DocumentDecomposer:
         return sentences
 
     @staticmethod
-    def decompose_document(text, max_chunk_size):
+    def decompose_document(text, max_chunk_size, tokenize):
         cleaned_text = DocumentDecomposer.clean_text(text)
         paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)
 
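The signature change above threads a tokenize callable into decompose_document so that max_chunk_size is measured in model tokens rather than characters. A minimal sketch of the shape that callable needs (the whitespace splitter is a stand-in for illustration, not the model's real tokenizer):

    # Any callable mapping a string to a list of tokens fits the new parameter;
    # callers in lollms pass the model's tokenizer (see the TextVectorizer hunk below).
    def tokenize(sentence):
        return sentence.split()  # stand-in: real tokenizers return model token ids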
@@ -189,7 +189,7 @@ class DocumentDecomposer:
         clean_chunks = []
 
         current_chunk = ""  # To store the current chunk being built
+        l=0
         for paragraph in paragraphs:
            # Tokenize the paragraph into sentences
             sentences = DocumentDecomposer.tokenize_sentences(paragraph)
@@ -197,12 +197,15 @@ class DocumentDecomposer:
             for sentence in sentences:
                 # If adding the current sentence to the chunk exceeds the max_chunk_size,
                 # we add the current chunk to the list of clean chunks and start a new chunk
-                if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
+                nb_tokens = len(tokenize(sentence))
+                if l + nb_tokens + 1 > max_chunk_size:
                     clean_chunks.append(current_chunk.strip())
                     current_chunk = ""
+                    l=0
 
                 # Add the current sentence to the chunk
                 current_chunk += sentence + " "
+                l += nb_tokens
 
         # Add the remaining chunk from the paragraph to the clean_chunks
         if current_chunk:
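Assembled from the three hunks above, the token-budget chunking loop now reads roughly as follows. This is a sketch for readability: the final flush and return after the loop are assumed from context, since the diff is truncated there.

    @staticmethod
    def decompose_document(text, max_chunk_size, tokenize):
        cleaned_text = DocumentDecomposer.clean_text(text)
        paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)
        clean_chunks = []
        current_chunk = ""  # the chunk being built
        l = 0               # running token count of current_chunk
        for paragraph in paragraphs:
            sentences = DocumentDecomposer.tokenize_sentences(paragraph)
            for sentence in sentences:
                nb_tokens = len(tokenize(sentence))
                # Flush when adding this sentence would exceed the token budget
                if l + nb_tokens + 1 > max_chunk_size:
                    clean_chunks.append(current_chunk.strip())
                    current_chunk = ""
                    l = 0
                current_chunk += sentence + " "
                l += nb_tokens
        # Assumed closing step: keep whatever is left after the loops
        if current_chunk:
            clean_chunks.append(current_chunk.strip())
        return clean_chunks

Note that the budget check runs before the sentence is appended, so a single sentence longer than max_chunk_size still ends up in a chunk of its own, and the counter l persists across paragraphs just as current_chunk does.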
@@ -419,7 +422,7 @@ class TextVectorizer:
         if document_id in self.embeddings and not force_vectorize:
             print(f"Document {document_id} already exists. Skipping vectorization.")
             return
-        chunks_text = DocumentDecomposer.decompose_document(text, chunk_size)
+        chunks_text = DocumentDecomposer.decompose_document(text, chunk_size, self.model.tokenize)
         self.chunks = []
         for i, chunk in enumerate(chunks_text):
             chunk_id = f"{document_id}_chunk_{i + 1}"
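At this call site the vectorizer forwards its model's tokenizer, so chunk sizes line up with the embedding model's own token accounting. A hedged usage sketch with a toy tokenizer standing in for self.model.tokenize (only the decompose_document call and the chunk-id scheme come from the diff):

    toy_tokenize = lambda s: s.split()  # stand-in for self.model.tokenize
    chunks_text = DocumentDecomposer.decompose_document(
        "Some long document text ...", 512, toy_tokenize
    )
    for i, chunk in enumerate(chunks_text):
        chunk_id = f"mydoc_chunk_{i + 1}"  # mirrors the id scheme above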
@@ -598,8 +601,6 @@ class GenericDataLoader:
     def read_file(file_path:Path):
         if file_path.suffix ==".pdf":
             return GenericDataLoader.read_pdf_file(file_path)
-        elif file_path.suffix == ".txt":
-            return GenericDataLoader.read_text_file(file_path)
         elif file_path.suffix == ".docx":
             return GenericDataLoader.read_docx_file(file_path)
         elif file_path.suffix == ".json":
@@ -608,10 +609,12 @@ class GenericDataLoader:
             return GenericDataLoader.read_html_file(file_path)
         elif file_path.suffix == ".pptx":
             return GenericDataLoader.read_pptx_file(file_path)
+        if file_path.suffix in [".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]:
+            return GenericDataLoader.read_text_file(file_path)
         else:
             raise ValueError("Unknown file type")
     def get_supported_file_types():
-        return ["pdf", "txt", "docx", "json", "html", "pptx"]
+        return ["pdf", "txt", "docx", "json", "html", "pptx",".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]
     @staticmethod
     def read_pdf_file(file_path):
         try:
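Two reading notes on this dispatch change. First, the dedicated ".txt" branch removed in the previous hunk is folded into the new suffix-list test, which also picks up Markdown, source code, and config files; because every earlier elif branch returns, the stand-alone if/else at the end behaves like one more elif. Second, the updated get_supported_file_types return value mixes the original dot-less names with the new dotted suffixes, so "txt", "json", and "html" now appear in both spellings. A sketch of an equivalent table-driven dispatch, for comparison only (not the project's code; read_json_file is an assumed reader name, since the diff truncates around the ".json" branch):

    TEXT_SUFFIXES = {".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb",
                     ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml",
                     ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet",
                     ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf",
                     ".map", ".bat"}

    READERS = {
        ".pdf":  GenericDataLoader.read_pdf_file,
        ".docx": GenericDataLoader.read_docx_file,
        ".json": GenericDataLoader.read_json_file,  # assumed reader name
        ".html": GenericDataLoader.read_html_file,
        ".pptx": GenericDataLoader.read_pptx_file,
    }

    def read_file(file_path):
        reader = READERS.get(file_path.suffix)
        if reader is not None:
            return reader(file_path)
        if file_path.suffix in TEXT_SUFFIXES:
            return GenericDataLoader.read_text_file(file_path)
        raise ValueError("Unknown file type")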
setup.py (2 lines changed)
@@ -26,7 +26,7 @@ def get_all_files(path):
 
 setuptools.setup(
     name="lollms",
-    version="4.1.0",
+    version="4.1.5",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",