Enhanced vectorization for all

This commit is contained in:
Saifeddine ALOUI 2023-08-23 04:21:11 +02:00
parent 42fa3489b3
commit a02cc89a1d
4 changed files with 54 additions and 57 deletions

View File

@ -1,5 +1,5 @@
# =================== Lord Of Large Language Models Configuration file ===========================
version: 11
version: 19
binding_name: null
model_name: null
@ -36,4 +36,12 @@ user_avatar: default_user
# Automatic update
auto_update: false
debug: false
debug: false
# Data vectorization
data_vectorization_method: "ftidf_vectorizer" #"model_embedding" or "ftidf_vectorizer"
data_visualization_method: "PCA" #"PCA" or "TSNE"
data_vectorization_save_db: False # Whether to save the vectorized database; when False, each new session starts with new files
data_vectorization_chunk_size: 512 # chunk size
data_vectorization_overlap_size: 128 # size of the overlap between consecutive chunks
data_vectorization_nb_chunks: 2 # number of chunks to use

View File

@ -19,6 +19,7 @@ from lollms.helpers import ASCIIColors
from lollms.types import MSG_TYPE
from typing import Callable
import json
from lollms.utilities import TextVectorizer, GenericDataLoader
def is_package_installed(package_name):
@ -72,6 +73,7 @@ class AIPersonality:
self.callback = callback
self.files = []
self.vectorizer = None
self.installation_option = installation_option
@ -282,13 +284,30 @@ Date: {{date}}
return config
def add_file(self, path, callback=None):
    """
    Register a file with the personality and index its content.

    The path is recorded in ``self.files``, the personality's
    ``TextVectorizer`` is created on first use (configured from
    ``self.config``), and the file content is loaded, split into chunks and
    indexed.

    Args:
        path: Location of the file to add (``str`` or ``pathlib.Path``).
        callback: Optional callable invoked as ``callback(message, msg_type)``
            once the file has been indexed successfully.

    Returns:
        bool: True when the file was loaded and indexed, False when a
        ``ValueError`` was raised while loading or indexing it (reported as
        an unsupported file format).
    """
    # Local import keeps this method self-contained.
    from pathlib import Path

    # The path is recorded before loading, so it stays listed even if
    # vectorization fails below.
    self.files.append(path)

    db_path = self.lollms_paths.personal_databases_path / self.name / "db.json"
    db_path.parent.mkdir(parents=True, exist_ok=True)

    # The vectorizer is created lazily and then shared by every added file.
    if self.vectorizer is None:
        self.vectorizer = TextVectorizer(
            self.config.data_vectorization_method,  # supported "model_embedding" or "ftidf_vectorizer"
            model=self.model,  # needed in case of using model_embedding
            database_path=db_path,
            save_db=self.config.data_vectorization_save_db,
            visualize_data_at_startup=False,
            visualize_data_at_add_file=False,
            visualize_data_at_generate=False,
            data_visualization_method=self.config.data_visualization_method,
            database_dict=None,
        )

    try:
        # read_file dispatches on ``.suffix``, so hand it a Path even when
        # the caller supplied a plain string.
        data = GenericDataLoader.read_file(Path(path))
        self.vectorizer.add_document(
            path,
            data,
            self.config.data_vectorization_chunk_size,
            self.config.data_vectorization_overlap_size,
        )
        self.vectorizer.index()
    except ValueError:
        ASCIIColors.error(f"Unsupported file format. Supported formats are {GenericDataLoader.get_supported_file_types()}")
        return False

    if callback is not None:
        callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
    return True
def save_personality(self, package_path=None):
"""
Save the personality parameters to a YAML configuration file.

View File

@ -1,6 +1,4 @@
from lollms.personality import APScript
from lollms.helpers import ASCIIColors, trace_exception
from lollms.paths import LollmsPaths
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from pathlib import Path
@ -597,6 +595,24 @@ class TextVectorizer:
class GenericDataLoader:
@staticmethod
def read_file(file_path:Path):
if file_path.suffix ==".pdf":
return GenericDataLoader.read_pdf_file(file_path)
elif file_path.suffix == ".txt":
return GenericDataLoader.read_text_file(file_path)
elif file_path.suffix == ".docx":
return GenericDataLoader.read_docx_file(file_path)
elif file_path.suffix == ".json":
return GenericDataLoader.read_json_file(file_path)
elif file_path.suffix == ".html":
return GenericDataLoader.read_html_file(file_path)
elif file_path.suffix == ".pptx":
return GenericDataLoader.read_pptx_file(file_path)
else:
raise ValueError("Unknown file type")
def get_supported_file_types():
return ["pdf", "txt", "docx", "json", "html", "pptx"]
@staticmethod
def read_pdf_file(file_path):
try:
import PyPDF2
@ -623,52 +639,6 @@ class GenericDataLoader:
markdown_text = text.replace('\n', ' \n') # Adding double spaces at the end of each line for Markdown line breaks
return markdown_text
"""
from io import BytesIO
with open(file_path, 'rb') as pdf_file:
pdf_reader = PyPDF2.PdfReader(pdf_file)
all_text = []
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
if '/Resources' in page and '/XObject' in page['/Resources']:
xObject = page['/Resources']['/XObject']
if xObject is not None:
for obj in xObject:
# Check if the object is an image
if xObject[obj]['/Subtype'] == '/Image':
image_data = xObject[obj].get_object()
image_stream = image_data.get_object()
image_stream_data = image_stream.get_data()
try:
# Extract text from the image using pytesseract
extracted_text = pytesseract.image_to_string(Image.open(BytesIO(image_stream_data)))
all_text.append(extracted_text)
except pytesseract.TesseractNotFoundError:
ASCIIColors.error("Please install tesserract to enable ocr data extraction from your pdf file")
except UnidentifiedImageError:
# Ignore images that cannot be identified
pass
# Extract regular text from the page using PyPDF2's text extraction
regular_text = page.extract_text()
if regular_text:
all_text.append(regular_text)
return "\n\n".join(all_text)
"""
"""
text = ""
with open(file_path, 'rb') as pdf_file:
pdf_reader = PyPDF2.PdfReader(pdf_file)
for page in pdf_reader.pages:
text += page.extract_text()
return text
"""
@staticmethod
def read_docx_file(file_path):

View File

@ -26,7 +26,7 @@ def get_all_files(path):
setuptools.setup(
name="lollms",
version="4.0.1",
version="4.0.2",
author="Saifeddine ALOUI",
author_email="aloui.saifeddine@gmail.com",
description="A python library for AI personality definition",