Saifeddine ALOUI 2024-04-19 22:01:26 +02:00
parent 4882b74b60
commit f4a9bbec32

@@ -9,7 +9,6 @@
# module.
######
from ascii_colors import ASCIIColors, trace_exception
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from pathlib import Path
import json
@@ -1012,90 +1011,6 @@ class File64BitsManager:
        file.write(base64.b64decode(File64BitsManager.extract_content_from_base64(b64data)))
        return filename
class TFIDFLoader:
    @staticmethod
    def create_vectorizer_from_dict(tfidf_info):
        # Rebuild a TfidfVectorizer from its serialized form
        vectorizer = TfidfVectorizer(**tfidf_info['params'])
        vectorizer.vocabulary_ = tfidf_info['vocabulary']
        # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
        vectorizer.idf_ = np.array([tfidf_info['idf_values'][feature] for feature in vectorizer.get_feature_names_out()])
        return vectorizer

    @staticmethod
    def create_dict_from_vectorizer(vectorizer):
        # Serialize a fitted TfidfVectorizer into a plain dictionary
        tfidf_info = {
            "vocabulary": vectorizer.vocabulary_,
            "idf_values": dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)),
            "params": vectorizer.get_params()
        }
        return tfidf_info
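
As a side note, a minimal round-trip sketch (hypothetical, not part of this file) shows how these two helpers pair up, assuming a scikit-learn version where idf_ is settable; note that get_params() can contain values such as dtype that need extra handling before writing the dict to JSON:

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["hello world", "hello lollms"]
fitted = TfidfVectorizer().fit(corpus)

info = TFIDFLoader.create_dict_from_vectorizer(fitted)
restored = TFIDFLoader.create_vectorizer_from_dict(info)

# Both vectorizers should now produce identical tf-idf matrices
assert (fitted.transform(corpus) != restored.transform(corpus)).nnz == 0
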
class DocumentDecomposer:
    @staticmethod
    def clean_text(text):
        # Remove carriage returns and leading/trailing spaces
        text = text.replace('\r', '').strip()
        return text

    @staticmethod
    def split_into_paragraphs(text):
        # Split the text into paragraphs on two or more consecutive newlines
        paragraphs = [p + "\n" for p in re.split(r'\n{2,}', text)]
        return paragraphs

    @staticmethod
    def tokenize_sentences(paragraph):
        # Naive sentence tokenizer: split on periods, then re-append them
        sentences = [s + "." for s in paragraph.split(".")]
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences
    @staticmethod
    def decompose_document(text, max_chunk_size, overlap_size, tokenize, detokenize):
        cleaned_text = DocumentDecomposer.clean_text(text)
        paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)
        # List to store the final clean chunks
        clean_chunks = []
        current_chunk = []  # Tokens of the chunk currently being built
        l = 0               # Number of tokens added to current_chunk since the last flush
        for paragraph in paragraphs:
            # Tokenize the paragraph into sentences
            sentences = DocumentDecomposer.tokenize_sentences(paragraph)
            for sentence in sentences:
                tokens = tokenize(sentence)
                nb_tokens = len(tokens)
                if nb_tokens > max_chunk_size:
                    # The sentence alone exceeds max_chunk_size: slice it into
                    # chunk-sized pieces, keeping overlap_size tokens between pieces
                    while nb_tokens > max_chunk_size:
                        current_chunk += tokens[:max_chunk_size - l - 1]
                        clean_chunks.append(current_chunk)
                        tokens = tokens[max_chunk_size - l - 1 - overlap_size:]
                        nb_tokens -= max_chunk_size - l - 1 - overlap_size
                        l = 0
                        current_chunk = current_chunk[-overlap_size:]
                    # Keep the remaining tail of the long sentence in the new chunk
                    current_chunk += tokens
                    l += nb_tokens
                else:
                    # If adding the current sentence to the chunk would exceed
                    # max_chunk_size, flush the chunk and start a new one with the overlap
                    if l + nb_tokens + 1 > max_chunk_size:
                        clean_chunks.append(current_chunk)
                        if overlap_size == 0:
                            current_chunk = []
                        else:
                            current_chunk = current_chunk[-overlap_size:]
                        l = 0
                    # Add the current sentence's tokens to the chunk
                    current_chunk += tokens
                    l += nb_tokens
        # Add any remaining tokens as the final chunk
        if current_chunk:
            clean_chunks.append(current_chunk)
        return clean_chunks
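
For illustration, a minimal usage sketch (hypothetical; in lollms the tokenize/detokenize callbacks would typically come from the active binding's tokenizer), here using plain whitespace tokenization:

# Stand-in whitespace tokenizer/detokenizer
tokenize = lambda s: s.split()
detokenize = lambda tokens: " ".join(tokens)

text = ("First paragraph, sentence one. Sentence two.\n\n"
        "Second paragraph with enough words to overflow a small chunk limit.")

chunks = DocumentDecomposer.decompose_document(text, max_chunk_size=10, overlap_size=2,
                                               tokenize=tokenize, detokenize=detokenize)
for chunk in chunks:
    print(detokenize(chunk))
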
class PromptReshaper:
    def __init__(self, template:str):