mirror of
https://github.com/ParisNeo/lollms.git
synced 2024-12-18 20:27:58 +00:00
Merge branch 'main' of https://github.com/ParisNeo/lollms into main
This commit is contained in:
commit
cf17bcea23
@ -268,10 +268,13 @@ class TextVectorizer:
|
||||
|
||||
# Connect the click event handler to the figure
|
||||
plt.gcf().canvas.mpl_connect("button_press_event", on_click)
|
||||
if save_fig_path:
|
||||
try:
|
||||
plt.savefig(save_fig_path)
|
||||
except Exception as ex:
|
||||
trace_exception(ex)
|
||||
if show_interactive_form:
|
||||
plt.show()
|
||||
if save_fig_path:
|
||||
plt.savefig(save_fig_path)
|
||||
|
||||
def add_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
|
||||
if document_id in self.embeddings and not force_vectorize:
|
||||
@ -280,8 +283,18 @@ class TextVectorizer:
|
||||
|
||||
# Split the text into sentences
|
||||
sentences = text.split('. ')
|
||||
sentences = [sentence.strip() + '. ' if not sentence.endswith('.') else sentence for sentence in sentences]
|
||||
import regex as re
|
||||
|
||||
def remove_special_characters(input_string):
|
||||
# Define a regex pattern matching runs of characters that are not letters, digits or spaces; the Arabic and Chinese ranges are excluded from the match so those characters are kept
|
||||
pattern = r'[^\p{L}\p{N}\p{Zs}\u0600-\u06FF\u4E00-\u9FFF]+'
|
||||
# Remove special characters using the regex pattern
|
||||
cleaned_string = re.sub(pattern, '', input_string)
|
||||
return cleaned_string
|
||||
|
||||
def remove_empty_sentences(sentences):
|
||||
return [self.model.tokenize(sentence) for sentence in sentences if sentence.strip() != '']
|
||||
return [self.model.tokenize(remove_special_characters(sentence)) for sentence in sentences if sentence.strip() != '']
|
||||
sentences = remove_empty_sentences(sentences)
|
||||
# Generate chunks with overlap and sentence boundaries
|
||||
chunks = []
|
||||
@ -294,6 +307,7 @@ class TextVectorizer:
|
||||
current_chunk.extend(sentence_tokens)
|
||||
else:
|
||||
if current_chunk:
|
||||
print(f"Chunk size:{len(current_chunk)}")
|
||||
chunks.append(current_chunk)
|
||||
|
||||
current_chunk=[]
|
||||
|
Loading…
Reference in New Issue
Block a user