Merge branch 'main' of https://github.com/ParisNeo/lollms into main

This commit is contained in:
saloui 2023-07-28 09:21:09 +02:00
commit cf17bcea23
2 changed files with 18 additions and 4 deletions

View File

@@ -268,10 +268,13 @@ class TextVectorizer:
# Connect the click event handler to the figure
plt.gcf().canvas.mpl_connect("button_press_event", on_click)
if save_fig_path:
try:
plt.savefig(save_fig_path)
except Exception as ex:
trace_exception(ex)
if show_interactive_form:
plt.show()
if save_fig_path:
plt.savefig(save_fig_path)
def add_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
if document_id in self.embeddings and not force_vectorize:
@@ -280,8 +283,18 @@ class TextVectorizer:
# Split tokens into sentences
sentences = text.split('. ')
sentences = [sentence.strip() + '. ' if not sentence.endswith('.') else sentence for sentence in sentences]
import regex as re
def remove_special_characters(input_string):
# Define a regex pattern to match non-alphanumeric characters and also keep Arabic and Chinese characters
pattern = r'[^\p{L}\p{N}\p{Zs}\u0600-\u06FF\u4E00-\u9FFF]+'
# Remove special characters using the regex pattern
cleaned_string = re.sub(pattern, '', input_string)
return cleaned_string
def remove_empty_sentences(sentences):
return [self.model.tokenize(sentence) for sentence in sentences if sentence.strip() != '']
return [self.model.tokenize(remove_special_characters(sentence)) for sentence in sentences if sentence.strip() != '']
sentences = remove_empty_sentences(sentences)
# Generate chunks with overlap and sentence boundaries
chunks = []
@@ -294,6 +307,7 @@ class TextVectorizer:
current_chunk.extend(sentence_tokens)
else:
if current_chunk:
print(f"Chunk size:{len(current_chunk)}")
chunks.append(current_chunk)
current_chunk=[]

View File

@@ -26,7 +26,7 @@ def get_all_files(path):
setuptools.setup(
name="lollms",
version="2.1.59",
version="2.1.60",
author="Saifeddine ALOUI",
author_email="aloui.saifeddine@gmail.com",
description="A python library for AI personality definition",