From a0d8ced80e28034dd10dceb8224323b43f937b20 Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI
Date: Mon, 18 Sep 2023 00:59:15 +0200
Subject: [PATCH] upgraded lollms utilisites

---
 lollms/utilities.py | 38 +++++++++++++++++++++++++-------------
 setup.py            |  2 +-
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/lollms/utilities.py b/lollms/utilities.py
index 13ba8fd..ceeaf30 100644
--- a/lollms/utilities.py
+++ b/lollms/utilities.py
@@ -471,22 +471,34 @@ class TextVectorizer:
 
 
-    def add_document(self, document_name:Path, text:str, chunk_size: int, overlap_size:int, force_vectorize=False):
-        
+    def add_document(self, document_name:Path, text:str, chunk_size: int, overlap_size:int, force_vectorize=False,add_as_a_bloc=False):
         if self.file_exists(document_name) and not force_vectorize:
             print(f"Document {document_name} already exists. Skipping vectorization.")
             return
-        chunks_text = DocumentDecomposer.decompose_document(text, chunk_size, overlap_size, self.model.tokenize, self.model.detokenize)
-        for i, chunk in enumerate(chunks_text):
-            chunk_id = f"{document_name}_chunk_{i + 1}"
-            chunk_dict = {
-                "document_name": document_name,
-                "chunk_index": i+1,
-                "chunk_text":self.model.detokenize(chunk),
-                "chunk_tokens": chunk,
-                "embeddings":[]
-            }
-            self.chunks[chunk_id] = chunk_dict
+        if add_as_a_bloc:
+            chunks_text = [self.model.tokenize(text)]
+            for i, chunk in enumerate(chunks_text):
+                chunk_id = f"{document_name}_chunk_{i + 1}"
+                chunk_dict = {
+                    "document_name": document_name,
+                    "chunk_index": i+1,
+                    "chunk_text":self.model.detokenize(chunk),
+                    "chunk_tokens": chunk,
+                    "embeddings":[]
+                }
+                self.chunks[chunk_id] = chunk_dict
+        else:
+            chunks_text = DocumentDecomposer.decompose_document(text, chunk_size, overlap_size, self.model.tokenize, self.model.detokenize)
+            for i, chunk in enumerate(chunks_text):
+                chunk_id = f"{document_name}_chunk_{i + 1}"
+                chunk_dict = {
+                    "document_name": document_name,
+                    "chunk_index": i+1,
+                    "chunk_text":self.model.detokenize(chunk),
+                    "chunk_tokens": chunk,
+                    "embeddings":[]
+                }
+                self.chunks[chunk_id] = chunk_dict
 
 
     def index(self):
         if self.vectorization_method=="ftidf_vectorizer":

diff --git a/setup.py b/setup.py
index d99ce29..7de86a4 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ def get_all_files(path):
 
 setuptools.setup(
     name="lollms",
-    version="5.5.0",
+    version="5.5.1",
     author="Saifeddine ALOUI",
     author_email="aloui.saifeddine@gmail.com",
     description="A python library for AI personality definition",
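
Editor's note (not part of the patch): a minimal usage sketch of the new add_as_a_bloc flag added above. It assumes `vectorizer` is an already-constructed TextVectorizer whose model exposes tokenize/detokenize, and the document name and text are illustrative placeholders, not taken from the patch. When add_as_a_bloc is True, the patched method stores the whole text as a single chunk, so chunk_size and overlap_size have no effect on splitting.

    from pathlib import Path

    # Hypothetical, already-initialized TextVectorizer instance.
    # Store the entire document as one chunk instead of decomposing it:
    vectorizer.add_document(
        Path("readme.md"),            # document_name (placeholder)
        "Some long document text...", # raw text to vectorize
        chunk_size=512,               # ignored when add_as_a_bloc=True
        overlap_size=64,              # ignored when add_as_a_bloc=True
        add_as_a_bloc=True,           # new flag introduced by this patch
    )
    vectorizer.index()                # rebuild embeddings after adding documents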