Mirror of https://github.com/ParisNeo/lollms.git (synced 2024-12-21 21:47:54 +00:00)
bugfix
This commit is contained in:
parent 4882b74b60
commit f4a9bbec32
@@ -9,7 +9,6 @@
# module.
######
from ascii_colors import ASCIIColors, trace_exception
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from pathlib import Path
import json
@@ -1012,90 +1011,6 @@ class File64BitsManager:
        file.write(base64.b64decode(File64BitsManager.extract_content_from_base64(b64data)))

        return filename
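For context, the two lines above close a File64BitsManager save routine: the payload part of a base64 string is decoded and written to disk, and the filename is returned. A minimal standalone sketch of that pattern, assuming the common data-URL layout data:<mime>;base64,<payload> (the helper name below is illustrative, not the repository's API):

import base64

def save_base64_payload(b64data: str, filename: str) -> str:
    # Assumes a data-URL style input: "data:<mime>;base64,<payload>".
    payload = b64data.split(",", 1)[-1]
    with open(filename, "wb") as f:
        f.write(base64.b64decode(payload))
    return filename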
class TFIDFLoader:
    @staticmethod
    def create_vectorizer_from_dict(tfidf_info):
        vectorizer = TfidfVectorizer(**tfidf_info['params'])
        vectorizer.vocabulary_ = tfidf_info['vocabulary']
        vectorizer.idf_ = [tfidf_info['idf_values'][feature] for feature in vectorizer.get_feature_names()]
        return vectorizer

    @staticmethod
    def create_dict_from_vectorizer(vectorizer):
        tfidf_info = {
            "vocabulary": vectorizer.vocabulary_,
            "idf_values": dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)),
            "params": vectorizer.get_params()
        }
        return tfidf_info

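The TFIDFLoader removed here relies on TfidfVectorizer.get_feature_names(), which scikit-learn deprecated in 1.0 and removed in 1.2 in favor of get_feature_names_out(). A minimal sketch of the same serialize/rebuild round-trip against the current API (the helper names are illustrative, not part of this commit, and exact attribute handling may differ across scikit-learn releases):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorizer_to_dict(vectorizer):
    # Capture what the removed loader stored: params, vocabulary and idf values.
    return {
        "params": vectorizer.get_params(),
        "vocabulary": vectorizer.vocabulary_,
        "idf_values": dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)),
    }

def vectorizer_from_dict(tfidf_info):
    # Rebuild a usable vectorizer from the captured state.
    vectorizer = TfidfVectorizer(**tfidf_info["params"])
    vectorizer.vocabulary_ = tfidf_info["vocabulary"]
    # idf_ must be ordered by vocabulary index, matching get_feature_names_out().
    ordered_terms = sorted(tfidf_info["vocabulary"], key=tfidf_info["vocabulary"].get)
    vectorizer.idf_ = np.array([tfidf_info["idf_values"][t] for t in ordered_terms])
    return vectorizer

# Round-trip check.
vec = TfidfVectorizer().fit(["a tiny corpus", "another tiny document"])
restored = vectorizer_from_dict(vectorizer_to_dict(vec))
assert np.allclose(vec.idf_, restored.idf_)
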
class DocumentDecomposer:
    @staticmethod
    def clean_text(text):
        # Remove extra returns and leading/trailing spaces
        text = text.replace('\r', '').strip()
        return text

    @staticmethod
    def split_into_paragraphs(text):
        # Split the text into paragraphs using two or more consecutive newlines
        paragraphs = [p+"\n" for p in re.split(r'\n{2,}', text)]
        return paragraphs

    @staticmethod
    def tokenize_sentences(paragraph):
        # Custom sentence tokenizer using simple regex-based approach
        sentences = [s+"." for s in paragraph.split(".")]
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences

    @staticmethod
    def decompose_document(text, max_chunk_size, overlap_size, tokenize, detokenize):
        cleaned_text = DocumentDecomposer.clean_text(text)
        paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)

        # List to store the final clean chunks
        clean_chunks = []

        current_chunk = []  # To store the current chunk being built
        l=0
        for paragraph in paragraphs:
            # Tokenize the paragraph into sentences
            sentences = DocumentDecomposer.tokenize_sentences(paragraph)

            for sentence in sentences:
                # If adding the current sentence to the chunk exceeds the max_chunk_size,
                # we add the current chunk to the list of clean chunks and start a new chunk
                tokens = tokenize(sentence)
                nb_tokens = len(tokens)
                if nb_tokens>max_chunk_size:
                    while nb_tokens>max_chunk_size:
                        current_chunk += tokens[:max_chunk_size-l-1]
                        clean_chunks.append(current_chunk)
                        tokens = tokens[max_chunk_size-l-1-overlap_size:]
                        nb_tokens -= max_chunk_size-l-1-overlap_size
                        l=0
                        current_chunk = current_chunk[-overlap_size:]
                else:
                    if l + nb_tokens + 1 > max_chunk_size:
                        clean_chunks.append(current_chunk)
                        if overlap_size==0:
                            current_chunk = []
                        else:
                            current_chunk = current_chunk[-overlap_size:]
                        l=0

                    # Add the current sentence to the chunk
                    current_chunk += tokens
                    l += nb_tokens

        # Add the remaining chunk from the paragraph to the clean_chunks
        if current_chunk:
            clean_chunks.append(current_chunk)
            current_chunk = ""

        return clean_chunks

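For reference, the removed decompose_document takes a tokenize/detokenize pair from the caller (detokenize is accepted but never used inside the function; callers detokenize the returned token lists themselves). A minimal driver sketch, assuming the DocumentDecomposer above and its module-level re import are available, with a naive whitespace tokenizer standing in for the model tokenizer:

def tokenize(text):
    # Naive whitespace tokenizer used only for illustration.
    return text.split()

def detokenize(tokens):
    return " ".join(tokens)

text = (
    "This is the first paragraph. It contains two short sentences.\n\n"
    "This is the second paragraph. It adds a few more words to pack into chunks."
)

chunks = DocumentDecomposer.decompose_document(
    text,
    max_chunk_size=10,  # maximum tokens per chunk
    overlap_size=2,     # tokens shared between consecutive chunks
    tokenize=tokenize,
    detokenize=detokenize,
)
for chunk in chunks:
    print(detokenize(chunk))  # each chunk is a list of tokens
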
class PromptReshaper:
    def __init__(self, template:str):