Mirror of https://github.com/ParisNeo/lollms.git, synced 2024-12-19 04:37:54 +00:00

bugfix

This commit is contained in:
parent 4882b74b60
commit f4a9bbec32
@@ -9,7 +9,6 @@
# module.
######
from ascii_colors import ASCIIColors, trace_exception
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from pathlib import Path
import json
@@ -1012,90 +1011,6 @@ class File64BitsManager:
            file.write(base64.b64decode(File64BitsManager.extract_content_from_base64(b64data)))

        return filename

class TFIDFLoader:
    @staticmethod
    def create_vectorizer_from_dict(tfidf_info):
        vectorizer = TfidfVectorizer(**tfidf_info['params'])
        vectorizer.vocabulary_ = tfidf_info['vocabulary']
        vectorizer.idf_ = [tfidf_info['idf_values'][feature] for feature in vectorizer.get_feature_names()]
        return vectorizer

    @staticmethod
    def create_dict_from_vectorizer(vectorizer):
        tfidf_info = {
            "vocabulary": vectorizer.vocabulary_,
            "idf_values": dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)),
            "params": vectorizer.get_params()
        }
        return tfidf_info
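Not part of the diff above: a minimal round-trip sketch of how TFIDFLoader is meant to be used. It assumes a scikit-learn release that still provides TfidfVectorizer.get_feature_names() (removed in 1.2 in favour of get_feature_names_out()); the corpus and variable names are illustrative only.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
]

# Fit a vectorizer on a toy corpus
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

# Flatten it into a plain dict: vocabulary, per-feature idf values, constructor params
tfidf_info = TFIDFLoader.create_dict_from_vectorizer(vectorizer)

# Later, rebuild an equivalent vectorizer from that dict
restored = TFIDFLoader.create_vectorizer_from_dict(tfidf_info)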
class DocumentDecomposer:
    @staticmethod
    def clean_text(text):
        # Remove extra returns and leading/trailing spaces
        text = text.replace('\r', '').strip()
        return text

    @staticmethod
    def split_into_paragraphs(text):
        # Split the text into paragraphs using two or more consecutive newlines
        paragraphs = [p+"\n" for p in re.split(r'\n{2,}', text)]
        return paragraphs

    @staticmethod
    def tokenize_sentences(paragraph):
        # Custom sentence tokenizer using a simple split on periods
        sentences = [s+"." for s in paragraph.split(".")]
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences

    @staticmethod
    def decompose_document(text, max_chunk_size, overlap_size, tokenize, detokenize):
        cleaned_text = DocumentDecomposer.clean_text(text)
        paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)

        # List to store the final clean chunks
        clean_chunks = []

        current_chunk = []  # To store the current chunk being built
        l = 0
        for paragraph in paragraphs:
            # Tokenize the paragraph into sentences
            sentences = DocumentDecomposer.tokenize_sentences(paragraph)

            for sentence in sentences:
                # If adding the current sentence to the chunk exceeds the max_chunk_size,
                # we add the current chunk to the list of clean chunks and start a new chunk
                tokens = tokenize(sentence)
                nb_tokens = len(tokens)
                if nb_tokens > max_chunk_size:
                    while nb_tokens > max_chunk_size:
                        current_chunk += tokens[:max_chunk_size-l-1]
                        clean_chunks.append(current_chunk)
                        tokens = tokens[max_chunk_size-l-1-overlap_size:]
                        nb_tokens -= max_chunk_size-l-1-overlap_size
                        l = 0
                        current_chunk = current_chunk[-overlap_size:]
                else:
                    if l + nb_tokens + 1 > max_chunk_size:
                        clean_chunks.append(current_chunk)
                        if overlap_size == 0:
                            current_chunk = []
                        else:
                            current_chunk = current_chunk[-overlap_size:]
                        l = 0

                # Add the current sentence to the chunk
                current_chunk += tokens
                l += nb_tokens

        # Add the remaining chunk from the paragraph to the clean_chunks
        if current_chunk:
            clean_chunks.append(current_chunk)
            current_chunk = ""

        return clean_chunks
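Also not part of the diff: a minimal sketch of driving decompose_document. It only needs a tokenize/detokenize pair, so a plain whitespace split stands in for a model binding's real tokenizer here; the chunk sizes are arbitrary and the surrounding module is assumed to import re for split_into_paragraphs.

# Hypothetical whitespace "tokenizer" for illustration only
def tokenize(text):
    return text.split()

def detokenize(tokens):
    return " ".join(tokens)

text = ("Paragraph one. It has two sentences.\n\n"
        "Paragraph two is a bit longer and will eventually be split across chunks.")

chunks = DocumentDecomposer.decompose_document(
    text,
    max_chunk_size=16,  # maximum tokens per chunk
    overlap_size=4,     # tokens carried over between consecutive chunks
    tokenize=tokenize,
    detokenize=detokenize,
)

# Each chunk is a list of tokens; detokenize them to get readable text back
for chunk in chunks:
    print(detokenize(chunk))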
class PromptReshaper:
    def __init__(self, template:str):