Mirror of https://github.com/ParisNeo/lollms.git, synced 2024-12-20 05:08:00 +00:00
Commit cf17bcea23: Merge branch 'main' of https://github.com/ParisNeo/lollms into main
@@ -268,10 +268,13 @@ class TextVectorizer:
 
         # Connect the click event handler to the figure
         plt.gcf().canvas.mpl_connect("button_press_event", on_click)
+        if save_fig_path:
+            try:
+                plt.savefig(save_fig_path)
+            except Exception as ex:
+                trace_exception(ex)
         if show_interactive_form:
             plt.show()
-        if save_fig_path:
-            plt.savefig(save_fig_path)
 
     def add_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
         if document_id in self.embeddings and not force_vectorize:
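
The reordering above saves the figure before showing it: once the window opened by plt.show() is closed, the active figure is discarded, so a later plt.savefig() call can end up writing an empty image, and the new try/except keeps a bad save path from aborting the visualization. Below is a minimal, self-contained sketch of that save-then-show pattern; render() and its dummy plot are illustrative only, and the lollms trace_exception helper is stood in for by a plain print:

import matplotlib.pyplot as plt

def render(save_fig_path=None, show_interactive_form=False):
    # Illustrative stand-in for the patched method; the real code plots
    # the vectorizer's embeddings rather than a dummy line.
    plt.plot([0, 1], [0, 1])
    if save_fig_path:
        try:
            # Save before show(): closing the interactive window releases
            # the figure, leaving a later savefig() nothing to write.
            plt.savefig(save_fig_path)
        except Exception as ex:
            print(f"Could not save figure: {ex}")  # trace_exception(ex) in lollms
    if show_interactive_form:
        plt.show()

render(save_fig_path="embedding_plot.png")
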
@@ -280,8 +283,18 @@ class TextVectorizer:
 
         # Split tokens into sentences
         sentences = text.split('. ')
+        sentences = [sentence.strip() + '. ' if not sentence.endswith('.') else sentence for sentence in sentences]
+        import regex as re
+
+        def remove_special_characters(input_string):
+            # Define a regex pattern to match non-alphanumeric characters and also keep Arabic and Chinese characters
+            pattern = r'[^\p{L}\p{N}\p{Zs}\u0600-\u06FF\u4E00-\u9FFF]+'
+            # Remove special characters using the regex pattern
+            cleaned_string = re.sub(pattern, '', input_string)
+            return cleaned_string
+
         def remove_empty_sentences(sentences):
-            return [self.model.tokenize(sentence) for sentence in sentences if sentence.strip() != '']
+            return [self.model.tokenize(remove_special_characters(sentence)) for sentence in sentences if sentence.strip() != '']
         sentences = remove_empty_sentences(sentences)
         # Generate chunks with overlap and sentence boundaries
         chunks = []
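
The second hunk normalizes sentences before tokenization: the text is split on '. ', each piece gets its trailing period restored, and the new remove_special_characters helper strips every run of characters that is not a letter, digit, or space separator (keeping the Arabic U+0600-U+06FF and CJK U+4E00-U+9FFF ranges explicitly) before self.model.tokenize is applied. Below is a self-contained sketch of that pre-processing using the same pattern; the whitespace split is a hypothetical stand-in for the model tokenizer, and the sample text is illustrative only:

import regex as re  # the third-party 'regex' package, required for \p{...} classes

PATTERN = r'[^\p{L}\p{N}\p{Zs}\u0600-\u06FF\u4E00-\u9FFF]+'

def remove_special_characters(input_string):
    # Remove every run of characters that is not a letter, digit, space
    # separator, or inside the Arabic / CJK Unified Ideographs ranges.
    return re.sub(PATTERN, '', input_string)

text = "Hello, world! This is lollms. مرحبا بالعالم. 你好，世界."
sentences = text.split('. ')
sentences = [s.strip() + '. ' if not s.endswith('.') else s for s in sentences]
# Stand-in for self.model.tokenize(...): simple whitespace tokenization.
tokenized = [remove_special_characters(s).split()
             for s in sentences if s.strip() != '']
print(tokenized)
# [['Hello', 'world', 'This', 'is', 'lollms'], ['مرحبا', 'بالعالم'], ['你好世界']]
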
@@ -294,6 +307,7 @@ class TextVectorizer:
                 current_chunk.extend(sentence_tokens)
             else:
                 if current_chunk:
+                    print(f"Chunk size:{len(current_chunk)}")
                     chunks.append(current_chunk)
 
                 current_chunk=[]
setup.py (2 lines changed)

@@ -26,7 +26,7 @@ def get_all_files(path):
 
 setuptools.setup(
     name="lollms",
-    version="2.1.59",
+    version="2.1.60",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",