From f4a9bbec3242435e0798d1100751ca74848f7fd9 Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI
Date: Fri, 19 Apr 2024 22:01:26 +0200
Subject: [PATCH] bugfix

---
 lollms/utilities.py | 85 ---------------------------------------------
 1 file changed, 85 deletions(-)

diff --git a/lollms/utilities.py b/lollms/utilities.py
index c57f4d7..18d959e 100644
--- a/lollms/utilities.py
+++ b/lollms/utilities.py
@@ -9,7 +9,6 @@
 # module.
 ######
 from ascii_colors import ASCIIColors, trace_exception
-from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
 from pathlib import Path
 import json
@@ -1012,90 +1011,6 @@ class File64BitsManager:
             file.write(base64.b64decode(File64BitsManager.extract_content_from_base64(b64data)))
         return filename
 
-class TFIDFLoader:
-    @staticmethod
-    def create_vectorizer_from_dict(tfidf_info):
-        vectorizer = TfidfVectorizer(**tfidf_info['params'])
-        vectorizer.vocabulary_ = tfidf_info['vocabulary']
-        vectorizer.idf_ = [tfidf_info['idf_values'][feature] for feature in vectorizer.get_feature_names()]
-        return vectorizer
-
-    @staticmethod
-    def create_dict_from_vectorizer(vectorizer):
-        tfidf_info = {
-            "vocabulary": vectorizer.vocabulary_,
-            "idf_values": dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)),
-            "params": vectorizer.get_params()
-        }
-        return tfidf_info
-
-class DocumentDecomposer:
-    @staticmethod
-    def clean_text(text):
-        # Remove extra returns and leading/trailing spaces
-        text = text.replace('\r', '').strip()
-        return text
-
-    @staticmethod
-    def split_into_paragraphs(text):
-        # Split the text into paragraphs using two or more consecutive newlines
-        paragraphs = [p+"\n" for p in re.split(r'\n{2,}', text)]
-        return paragraphs
-
-    @staticmethod
-    def tokenize_sentences(paragraph):
-        # Custom sentence tokenizer using simple regex-based approach
-        sentences = [s+"." for s in paragraph.split(".")]
-        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
-        return sentences
-
-    @staticmethod
-    def decompose_document(text, max_chunk_size, overlap_size, tokenize, detokenize):
-        cleaned_text = DocumentDecomposer.clean_text(text)
-        paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)
-
-        # List to store the final clean chunks
-        clean_chunks = []
-
-        current_chunk = [] # To store the current chunk being built
-        l=0
-        for paragraph in paragraphs:
-            # Tokenize the paragraph into sentences
-            sentences = DocumentDecomposer.tokenize_sentences(paragraph)
-
-            for sentence in sentences:
-                # If adding the current sentence to the chunk exceeds the max_chunk_size,
-                # we add the current chunk to the list of clean chunks and start a new chunk
-                tokens = tokenize(sentence)
-                nb_tokens = len(tokens)
-                if nb_tokens>max_chunk_size:
-                    while nb_tokens>max_chunk_size:
-                        current_chunk += tokens[:max_chunk_size-l-1]
-                        clean_chunks.append(current_chunk)
-                        tokens = tokens[max_chunk_size-l-1-overlap_size:]
-                        nb_tokens -= max_chunk_size-l-1-overlap_size
-                        l=0
-                        current_chunk = current_chunk[-overlap_size:]
-                else:
-                    if l + nb_tokens + 1 > max_chunk_size:
-
-                        clean_chunks.append(current_chunk)
-                        if overlap_size==0:
-                            current_chunk = []
-                        else:
-                            current_chunk = current_chunk[-overlap_size:]
-                        l=0
-
-                    # Add the current sentence to the chunk
-                    current_chunk += tokens
-                    l += nb_tokens
-
-        # Add the remaining chunk from the paragraph to the clean_chunks
-        if current_chunk:
-            clean_chunks.append(current_chunk)
-            current_chunk = ""
-
-        return clean_chunks
 
 class PromptReshaper:
     def __init__(self, template:str):
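
Background note (not part of the patch): the removed TFIDFLoader round-tripped a
scikit-learn TfidfVectorizer through a plain dict of params, vocabulary, and
per-feature idf values. For readers unfamiliar with what those "idf_values"
are, here is a minimal sketch of how they are computed, using the smooth-idf
formula that scikit-learn applies by default; compute_idf and the toy corpus
below are illustrative names, not lollms code.

import math
from collections import Counter

def compute_idf(tokenized_docs):
    # Smooth idf (scikit-learn's default): idf(t) = ln((1 + n) / (1 + df(t))) + 1,
    # where n is the number of documents and df(t) counts documents containing t.
    n = len(tokenized_docs)
    df = Counter()
    for doc in tokenized_docs:
        df.update(set(doc))  # count each term at most once per document
    return {term: math.log((1 + n) / (1 + count)) + 1 for term, count in df.items()}

docs = [["chunk", "text", "text"], ["chunk", "overlap"]]
print(compute_idf(docs))  # e.g. "chunk" -> 1.0, "text" and "overlap" -> ~1.405

A term appearing in every document gets the minimum idf of 1.0; rarer terms get
larger weights, which is exactly the per-feature table TFIDFLoader serialized.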
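
The bulk of the deletion is DocumentDecomposer.decompose_document, which walked
paragraphs and sentences while packing tokens into chunks of at most
max_chunk_size, carrying overlap_size tokens across each chunk boundary. Below
is a minimal, self-contained sketch of that overlapped-chunking idea, assuming
a simple whitespace tokenizer (the removed code instead received model-specific
tokenize/detokenize callbacks); chunk_tokens and its default values are
hypothetical, chosen only for illustration.

def chunk_tokens(text, max_chunk_size=8, overlap_size=2):
    # Sketch only: split text into token chunks of at most max_chunk_size,
    # repeating the last overlap_size tokens at each boundary
    # (assumes overlap_size < max_chunk_size).
    tokens = text.split()  # stand-in whitespace tokenizer
    chunks = []
    start = 0
    while start < len(tokens):
        chunks.append(tokens[start:start + max_chunk_size])
        if start + max_chunk_size >= len(tokens):
            break
        # Step forward so overlap_size tokens are shared between chunks
        start += max_chunk_size - overlap_size
    return chunks

if __name__ == "__main__":
    text = "one two three four five six seven eight nine ten eleven twelve"
    for chunk in chunk_tokens(text):
        print(" ".join(chunk))

Running this prints two chunks, "one ... eight" and "seven ... twelve", with
"seven eight" repeated as the overlap, mirroring how the removed code kept
current_chunk[-overlap_size:] when starting a new chunk.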