class DocumentDecomposer:
    """Decompose a text document into clean chunks of bounded size.

    The document is first normalized, then split into paragraphs (two or
    more consecutive newlines), then each paragraph is split into
    sentences with ``nltk.sent_tokenize``.  Sentences are greedily packed
    into chunks of at most ``max_chunk_size`` characters; a paragraph
    boundary always terminates the current chunk, so chunks never span
    paragraphs.
    """

    def __init__(self, max_chunk_size):
        # Maximum number of characters allowed in a single chunk.
        self.max_chunk_size = max_chunk_size

    def clean_text(self, text):
        """Return *text* with carriage returns removed and leading/trailing whitespace stripped."""
        return text.replace('\r', '').strip()

    def split_into_paragraphs(self, text):
        """Split *text* into paragraphs on runs of two or more newlines."""
        return re.split(r'\n{2,}', text)

    def decompose_document(self, text):
        """Return a list of non-empty chunks of *text*, each at most
        ``max_chunk_size`` characters where possible.

        NOTE: a single sentence longer than ``max_chunk_size`` cannot be
        split further and becomes its own oversized chunk.
        """
        cleaned_text = self.clean_text(text)
        paragraphs = self.split_into_paragraphs(cleaned_text)

        # Final list of packed chunks.
        clean_chunks = []
        # Chunk currently being built; flushed when full or at a paragraph end.
        current_chunk = ""

        for paragraph in paragraphs:
            for sentence in sent_tokenize(paragraph):
                # Flush the current chunk if appending this sentence (plus a
                # separating space) would exceed the size limit.  The
                # ``current_chunk`` guard prevents appending an empty chunk
                # when the very first sentence is already oversized.
                if current_chunk and len(current_chunk) + len(sentence) + 1 > self.max_chunk_size:
                    clean_chunks.append(current_chunk.strip())
                    current_chunk = ""

                current_chunk += sentence + " "

            # A paragraph boundary always ends the chunk in progress.
            if current_chunk:
                clean_chunks.append(current_chunk.strip())
                current_chunk = ""

        return clean_chunks