Repaired file vectorization
commit 4908ed7e6d (parent f348e02fb3)
@@ -181,7 +181,7 @@ class DocumentDecomposer:
         return sentences
 
     @staticmethod
-    def decompose_document(text, max_chunk_size):
+    def decompose_document(text, max_chunk_size, tokenize):
         cleaned_text = DocumentDecomposer.clean_text(text)
         paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)
 
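The signature change above threads a tokenize callable into decompose_document so that max_chunk_size is measured in model tokens rather than characters. A minimal sketch of the shape that callable needs (the whitespace splitter is a stand-in for illustration, not the model's real tokenizer):

    # Any callable mapping a string to a list of tokens fits the new parameter;
    # callers in lollms pass the model's tokenizer (see the TextVectorizer hunk below).
    def tokenize(sentence):
        return sentence.split()  # stand-in: real tokenizers return model token ids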
@@ -189,7 +189,7 @@ class DocumentDecomposer:
         clean_chunks = []
 
         current_chunk = ""  # To store the current chunk being built
+        l=0
         for paragraph in paragraphs:
            # Tokenize the paragraph into sentences
             sentences = DocumentDecomposer.tokenize_sentences(paragraph)
@@ -197,12 +197,15 @@ class DocumentDecomposer:
             for sentence in sentences:
                 # If adding the current sentence to the chunk exceeds the max_chunk_size,
                 # we add the current chunk to the list of clean chunks and start a new chunk
-                if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
+                nb_tokens = len(tokenize(sentence))
+                if l + nb_tokens + 1 > max_chunk_size:
                     clean_chunks.append(current_chunk.strip())
                     current_chunk = ""
+                    l=0
 
                 # Add the current sentence to the chunk
                 current_chunk += sentence + " "
+                l += nb_tokens
 
         # Add the remaining chunk from the paragraph to the clean_chunks
         if current_chunk:
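Assembled from the three hunks above, the token-budget chunking loop now reads roughly as follows. This is a sketch for readability: the final flush and return after the loop are assumed from context, since the diff is truncated there.

    @staticmethod
    def decompose_document(text, max_chunk_size, tokenize):
        cleaned_text = DocumentDecomposer.clean_text(text)
        paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)
        clean_chunks = []
        current_chunk = ""  # the chunk being built
        l = 0               # running token count of current_chunk
        for paragraph in paragraphs:
            sentences = DocumentDecomposer.tokenize_sentences(paragraph)
            for sentence in sentences:
                nb_tokens = len(tokenize(sentence))
                # Flush when adding this sentence would exceed the token budget
                if l + nb_tokens + 1 > max_chunk_size:
                    clean_chunks.append(current_chunk.strip())
                    current_chunk = ""
                    l = 0
                current_chunk += sentence + " "
                l += nb_tokens
        # Assumed closing step: keep whatever is left after the loops
        if current_chunk:
            clean_chunks.append(current_chunk.strip())
        return clean_chunks

Note that the budget check runs before the sentence is appended, so a single sentence longer than max_chunk_size still ends up in a chunk of its own, and the counter l persists across paragraphs just as current_chunk does.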
@@ -419,7 +422,7 @@ class TextVectorizer:
         if document_id in self.embeddings and not force_vectorize:
             print(f"Document {document_id} already exists. Skipping vectorization.")
             return
-        chunks_text = DocumentDecomposer.decompose_document(text, chunk_size)
+        chunks_text = DocumentDecomposer.decompose_document(text, chunk_size, self.model.tokenize)
         self.chunks = []
         for i, chunk in enumerate(chunks_text):
             chunk_id = f"{document_id}_chunk_{i + 1}"
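At this call site the vectorizer forwards its model's tokenizer, so chunk sizes line up with the embedding model's own token accounting. A hedged usage sketch with a toy tokenizer standing in for self.model.tokenize (only the decompose_document call and the chunk-id scheme come from the diff):

    toy_tokenize = lambda s: s.split()  # stand-in for self.model.tokenize
    chunks_text = DocumentDecomposer.decompose_document(
        "Some long document text ...", 512, toy_tokenize
    )
    for i, chunk in enumerate(chunks_text):
        chunk_id = f"mydoc_chunk_{i + 1}"  # mirrors the id scheme above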
@@ -598,8 +601,6 @@ class GenericDataLoader:
     def read_file(file_path:Path):
         if file_path.suffix ==".pdf":
             return GenericDataLoader.read_pdf_file(file_path)
-        elif file_path.suffix == ".txt":
-            return GenericDataLoader.read_text_file(file_path)
         elif file_path.suffix == ".docx":
             return GenericDataLoader.read_docx_file(file_path)
         elif file_path.suffix == ".json":
@@ -608,10 +609,12 @@ class GenericDataLoader:
             return GenericDataLoader.read_html_file(file_path)
         elif file_path.suffix == ".pptx":
             return GenericDataLoader.read_pptx_file(file_path)
+        if file_path.suffix in [".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]:
+            return GenericDataLoader.read_text_file(file_path)
         else:
             raise ValueError("Unknown file type")
     def get_supported_file_types():
-        return ["pdf", "txt", "docx", "json", "html", "pptx"]
+        return ["pdf", "txt", "docx", "json", "html", "pptx",".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]
     @staticmethod
     def read_pdf_file(file_path):
         try:
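Two reading notes on this dispatch change. First, the dedicated ".txt" branch removed in the previous hunk is folded into the new suffix-list test, which also picks up Markdown, source code, and config files; because every earlier elif branch returns, the stand-alone if/else at the end behaves like one more elif. Second, the updated get_supported_file_types return value mixes the original dot-less names with the new dotted suffixes, so "txt", "json", and "html" now appear in both spellings. A sketch of an equivalent table-driven dispatch, for comparison only (not the project's code; read_json_file is an assumed reader name, since the diff truncates around the ".json" branch):

    TEXT_SUFFIXES = {".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb",
                     ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml",
                     ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet",
                     ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf",
                     ".map", ".bat"}

    READERS = {
        ".pdf":  GenericDataLoader.read_pdf_file,
        ".docx": GenericDataLoader.read_docx_file,
        ".json": GenericDataLoader.read_json_file,  # assumed reader name
        ".html": GenericDataLoader.read_html_file,
        ".pptx": GenericDataLoader.read_pptx_file,
    }

    def read_file(file_path):
        reader = READERS.get(file_path.suffix)
        if reader is not None:
            return reader(file_path)
        if file_path.suffix in TEXT_SUFFIXES:
            return GenericDataLoader.read_text_file(file_path)
        raise ValueError("Unknown file type")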
setup.py (2 lines changed)
@@ -26,7 +26,7 @@ def get_all_files(path):
 
 setuptools.setup(
     name="lollms",
-    version="4.1.0",
+    version="4.1.5",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",