Mirror of https://github.com/ParisNeo/lollms.git, synced 2024-12-20 05:08:00 +00:00
Commit cf17bcea23: Merge branch 'main' of https://github.com/ParisNeo/lollms into main
@@ -268,10 +268,13 @@ class TextVectorizer:
 
         # Connect the click event handler to the figure
         plt.gcf().canvas.mpl_connect("button_press_event", on_click)
+        if save_fig_path:
+            try:
+                plt.savefig(save_fig_path)
+            except Exception as ex:
+                trace_exception(ex)
         if show_interactive_form:
             plt.show()
-        if save_fig_path:
-            plt.savefig(save_fig_path)
 
     def add_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
         if document_id in self.embeddings and not force_vectorize:
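
The reordering above saves the figure before showing it: once the window opened by plt.show() is closed, the active figure is discarded, so a later plt.savefig() call can end up writing an empty image, and the new try/except keeps a bad save path from aborting the visualization. Below is a minimal, self-contained sketch of that save-then-show pattern; render() and its dummy plot are illustrative only, and the lollms trace_exception helper is stood in for by a plain print:

import matplotlib.pyplot as plt

def render(save_fig_path=None, show_interactive_form=False):
    # Illustrative stand-in for the patched method; the real code plots
    # the vectorizer's embeddings rather than a dummy line.
    plt.plot([0, 1], [0, 1])
    if save_fig_path:
        try:
            # Save before show(): closing the interactive window releases
            # the figure, leaving a later savefig() nothing to write.
            plt.savefig(save_fig_path)
        except Exception as ex:
            print(f"Could not save figure: {ex}")  # trace_exception(ex) in lollms
    if show_interactive_form:
        plt.show()

render(save_fig_path="embedding_plot.png")
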
@@ -280,8 +283,18 @@ class TextVectorizer:
 
         # Split tokens into sentences
         sentences = text.split('. ')
+        sentences = [sentence.strip() + '. ' if not sentence.endswith('.') else sentence for sentence in sentences]
+        import regex as re
+
+        def remove_special_characters(input_string):
+            # Define a regex pattern to match non-alphanumeric characters and also keep Arabic and Chinese characters
+            pattern = r'[^\p{L}\p{N}\p{Zs}\u0600-\u06FF\u4E00-\u9FFF]+'
+            # Remove special characters using the regex pattern
+            cleaned_string = re.sub(pattern, '', input_string)
+            return cleaned_string
+
         def remove_empty_sentences(sentences):
-            return [self.model.tokenize(sentence) for sentence in sentences if sentence.strip() != '']
+            return [self.model.tokenize(remove_special_characters(sentence)) for sentence in sentences if sentence.strip() != '']
         sentences = remove_empty_sentences(sentences)
         # Generate chunks with overlap and sentence boundaries
         chunks = []
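
The second hunk normalizes sentences before tokenization: the text is split on '. ', each piece gets its trailing period restored, and the new remove_special_characters helper strips every run of characters that is not a letter, digit, or space separator (keeping the Arabic U+0600-U+06FF and CJK U+4E00-U+9FFF ranges explicitly) before self.model.tokenize is applied. Below is a self-contained sketch of that pre-processing using the same pattern; the whitespace split is a hypothetical stand-in for the model tokenizer, and the sample text is illustrative only:

import regex as re  # the third-party 'regex' package, required for \p{...} classes

PATTERN = r'[^\p{L}\p{N}\p{Zs}\u0600-\u06FF\u4E00-\u9FFF]+'

def remove_special_characters(input_string):
    # Remove every run of characters that is not a letter, digit, space
    # separator, or inside the Arabic / CJK Unified Ideographs ranges.
    return re.sub(PATTERN, '', input_string)

text = "Hello, world! This is lollms. مرحبا بالعالم. 你好，世界."
sentences = text.split('. ')
sentences = [s.strip() + '. ' if not s.endswith('.') else s for s in sentences]
# Stand-in for self.model.tokenize(...): simple whitespace tokenization.
tokenized = [remove_special_characters(s).split()
             for s in sentences if s.strip() != '']
print(tokenized)
# [['Hello', 'world', 'This', 'is', 'lollms'], ['مرحبا', 'بالعالم'], ['你好世界']]
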
@@ -294,6 +307,7 @@ class TextVectorizer:
                 current_chunk.extend(sentence_tokens)
             else:
                 if current_chunk:
+                    print(f"Chunk size:{len(current_chunk)}")
                     chunks.append(current_chunk)
 
                 current_chunk=[]
setup.py (2 lines changed)

@@ -26,7 +26,7 @@ def get_all_files(path):
 
 setuptools.setup(
     name="lollms",
-    version="2.1.59",
+    version="2.1.60",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",