Mirror of https://github.com/ParisNeo/lollms.git, synced 2025-03-11 06:54:02 +00:00

Commit f4a9bbec32 (parent 4882b74b60): bugfix
@@ -9,7 +9,6 @@
 # module.
 ######
 from ascii_colors import ASCIIColors, trace_exception
-from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
 from pathlib import Path
 import json
@@ -1012,90 +1011,6 @@ class File64BitsManager:
             file.write(base64.b64decode(File64BitsManager.extract_content_from_base64(b64data)))
 
         return filename
-
-class TFIDFLoader:
-    @staticmethod
-    def create_vectorizer_from_dict(tfidf_info):
-        vectorizer = TfidfVectorizer(**tfidf_info['params'])
-        vectorizer.vocabulary_ = tfidf_info['vocabulary']
-        vectorizer.idf_ = [tfidf_info['idf_values'][feature] for feature in vectorizer.get_feature_names()]
-        return vectorizer
-
-    @staticmethod
-    def create_dict_from_vectorizer(vectorizer):
-        tfidf_info = {
-            "vocabulary": vectorizer.vocabulary_,
-            "idf_values": dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)),
-            "params": vectorizer.get_params()
-        }
-        return tfidf_info
-
-class DocumentDecomposer:
-    @staticmethod
-    def clean_text(text):
-        # Remove extra returns and leading/trailing spaces
-        text = text.replace('\r', '').strip()
-        return text
-
-    @staticmethod
-    def split_into_paragraphs(text):
-        # Split the text into paragraphs using two or more consecutive newlines
-        paragraphs = [p+"\n" for p in re.split(r'\n{2,}', text)]
-        return paragraphs
-
-    @staticmethod
-    def tokenize_sentences(paragraph):
-        # Custom sentence tokenizer using simple regex-based approach
-        sentences = [s+"." for s in paragraph.split(".")]
-        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
-        return sentences
-
-    @staticmethod
-    def decompose_document(text, max_chunk_size, overlap_size, tokenize, detokenize):
-        cleaned_text = DocumentDecomposer.clean_text(text)
-        paragraphs = DocumentDecomposer.split_into_paragraphs(cleaned_text)
-
-        # List to store the final clean chunks
-        clean_chunks = []
-
-        current_chunk = []  # To store the current chunk being built
-        l=0
-        for paragraph in paragraphs:
-            # Tokenize the paragraph into sentences
-            sentences = DocumentDecomposer.tokenize_sentences(paragraph)
-
-            for sentence in sentences:
-                # If adding the current sentence to the chunk exceeds the max_chunk_size,
-                # we add the current chunk to the list of clean chunks and start a new chunk
-                tokens = tokenize(sentence)
-                nb_tokens = len(tokens)
-                if nb_tokens>max_chunk_size:
-                    while nb_tokens>max_chunk_size:
-                        current_chunk += tokens[:max_chunk_size-l-1]
-                        clean_chunks.append(current_chunk)
-                        tokens = tokens[max_chunk_size-l-1-overlap_size:]
-                        nb_tokens -= max_chunk_size-l-1-overlap_size
-                        l=0
-                        current_chunk = current_chunk[-overlap_size:]
-                else:
-                    if l + nb_tokens + 1 > max_chunk_size:
-                        clean_chunks.append(current_chunk)
-                        if overlap_size==0:
-                            current_chunk = []
-                        else:
-                            current_chunk = current_chunk[-overlap_size:]
-                        l=0
-
-                    # Add the current sentence to the chunk
-                    current_chunk += tokens
-                    l += nb_tokens
-
-        # Add the remaining chunk from the paragraph to the clean_chunks
-        if current_chunk:
-            clean_chunks.append(current_chunk)
-            current_chunk = ""
-
-        return clean_chunks
-
 class PromptReshaper:
     def __init__(self, template:str):
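The import removed in the first hunk was only used by the TFIDFLoader helper deleted above, which dumped a fitted TfidfVectorizer to a plain dict and rebuilt it later. For readers who still need that behaviour, here is a minimal sketch against a current scikit-learn; it is an illustration, not part of the commit. The helper names vectorizer_to_dict and vectorizer_from_dict are placeholders, get_feature_names_out() replaces the get_feature_names() call used in the deleted code (removed in scikit-learn 1.2), and restoring fitted state by assigning vocabulary_ and idf_ directly mirrors the deleted helper rather than a guaranteed public API, so exact behaviour may vary across scikit-learn versions.

# Illustrative sketch only; helper names are hypothetical, not part of lollms.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def vectorizer_to_dict(vectorizer: TfidfVectorizer) -> dict:
    # Same shape as the deleted create_dict_from_vectorizer: constructor params,
    # vocabulary, and one idf value per feature.
    return {
        "vocabulary": {term: int(idx) for term, idx in vectorizer.vocabulary_.items()},
        "idf_values": dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)),
        "params": vectorizer.get_params(),
    }


def vectorizer_from_dict(tfidf_info: dict) -> TfidfVectorizer:
    # Mirrors the deleted create_vectorizer_from_dict: rebuild the estimator and
    # restore fitted state by assigning vocabulary_ and idf_ directly
    # (assumption: the idf_ setter accepts this, as it does in recent releases).
    vectorizer = TfidfVectorizer(**tfidf_info["params"])
    vectorizer.vocabulary_ = dict(tfidf_info["vocabulary"])
    ordered_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    vectorizer.idf_ = np.array([tfidf_info["idf_values"][term] for term in ordered_terms])
    return vectorizer


if __name__ == "__main__":
    corpus = ["the cat sat on the mat", "the dog ate my homework"]
    fitted = TfidfVectorizer().fit(corpus)
    restored = vectorizer_from_dict(vectorizer_to_dict(fitted))
    # Both vectorizers should produce the same representation for the same text.
    print(np.allclose(fitted.transform(corpus).toarray(), restored.transform(corpus).toarray()))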
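The second deleted class, DocumentDecomposer, chunked a document into token lists of at most max_chunk_size tokens, carrying overlap_size tokens across chunk boundaries, with tokenize/detokenize callables supplied by the caller. A compressed sketch of that idea follows, again as an illustration rather than the deleted algorithm itself: the chunk_text name and the whitespace tokenizer default are placeholders, and unlike the deleted code it does not further split a single sentence whose token count already exceeds the budget.

# Illustrative sketch only; chunk_text and the whitespace tokenizer are placeholders.
import re


def chunk_text(text, max_chunk_size=64, overlap_size=8, tokenize=str.split):
    # Normalise line endings, then walk the text sentence by sentence, packing
    # tokens into chunks of at most max_chunk_size and carrying the last
    # overlap_size tokens into the next chunk so context survives the boundary.
    text = text.replace("\r", "").strip()
    sentences = [s.strip() + "." for s in re.split(r"[.\n]+", text) if s.strip()]

    chunks, current = [], []
    for sentence in sentences:
        tokens = tokenize(sentence)
        if current and len(current) + len(tokens) > max_chunk_size:
            chunks.append(current)
            current = current[-overlap_size:] if overlap_size else []
        current += tokens  # an over-long single sentence is kept whole here
    if current:
        chunks.append(current)
    return chunks


if __name__ == "__main__":
    sample = "First sentence here. Second sentence follows.\n\nA new paragraph starts. And ends."
    for i, chunk in enumerate(chunk_text(sample, max_chunk_size=6, overlap_size=2)):
        print(i, chunk)

Because the deleted signature took the tokenizer as an argument, the token budget was expressed in whatever units that callable produced, so chunk sizes can line up with a model tokenizer rather than with whitespace words.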