mirror of https://github.com/ParisNeo/lollms.git
synced 2025-04-08 11:24:14 +00:00

Enhanced vectorization for all

commit a02cc89a1d
parent 42fa3489b3
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Models Configuration file ===========================
-version: 11
+version: 19
 binding_name: null
 model_name: null

@@ -36,4 +36,12 @@ user_avatar: default_user

 # Automatic update
 auto_update: false
-debug: false
+debug: false
+
+# Data vectorization
+data_vectorization_method: "ftidf_vectorizer" #"model_embedding" or "ftidf_vectorizer"
+data_visualization_method: "PCA" #"PCA" or "TSNE"
+data_vectorization_save_db: False # For each new session, new files
+data_vectorization_chunk_size: 512 # chunk size
+data_vectorization_overlap_size: 128 # overlap between chunks size
+data_vectorization_nb_chunks: 2 # number of chunks to use
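Not part of the commit: a minimal sketch of what the three chunking settings above control, under the assumption that TextVectorizer slices each document into windows of data_vectorization_chunk_size characters, that consecutive windows share data_vectorization_overlap_size characters, and that data_vectorization_nb_chunks of the best-matching chunks are later injected into the prompt. The helper name chunk_text is made up for illustration.

# Illustration only; the real splitting is implemented inside TextVectorizer.
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 128) -> list:
    # advance by (chunk_size - overlap) so neighbouring chunks share `overlap` characters
    step = chunk_size - overlap
    return [text[i:i + chunk_size] for i in range(0, max(len(text) - overlap, 1), step)]

if __name__ == "__main__":
    chunks = chunk_text("a" * 1300)
    print(len(chunks), [len(c) for c in chunks])  # 4 chunks: 512, 512, 512, 148 characters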
@@ -19,6 +19,7 @@ from lollms.helpers import ASCIIColors
 from lollms.types import MSG_TYPE
 from typing import Callable
 import json
+from lollms.utilities import TextVectorizer, GenericDataLoader


 def is_package_installed(package_name):
@@ -72,6 +73,7 @@ class AIPersonality:
         self.callback = callback

         self.files = []
+        self.vectorizer = None

         self.installation_option = installation_option

@@ -282,13 +284,30 @@ Date: {{date}}
         return config

     def add_file(self, path, callback=None):
-        if callback is not None:
-            callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
-
-        self.files.append(path)
-
-        return True
-
+
+        db_path = self.lollms_paths.personal_databases_path / self.name / "db.json"
+        db_path.parent.mkdir(parents=True, exist_ok=True)
+        if self.vectorizer is None:
+            self.vectorizer = TextVectorizer(self.config.data_vectorization_method, # supported "model_embedding" or "ftidf_vectorizer"
+                                             model=self.model, #needed in case of using model_embedding
+                                             database_path=db_path,
+                                             save_db=self.config.data_vectorization_save_db,
+                                             visualize_data_at_startup=False,
+                                             visualize_data_at_add_file=False,
+                                             visualize_data_at_generate=False,
+                                             data_visualization_method="PCA",
+                                             database_dict=None)
+        try:
+            data = GenericDataLoader.read_file(path)
+            self.vectorizer.add_document(path, data, self.config.data_vectorization_chunk_size, self.config.data_vectorization_overlap_size)
+            self.vectorizer.index()
+            if callback is not None:
+                callback("File added successfully",MSG_TYPE.MSG_TYPE_INFO)
+            return True
+        except ValueError as ve:
+            ASCIIColors.error(f"Unsupported file format. Supported formats are {GenericDataLoader.get_supported_file_types()}")
+            return False
     def save_personality(self, package_path=None):
         """
         Save the personality parameters to a YAML configuration file.

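Not part of the commit: a hypothetical usage sketch of the new add_file path. It assumes an already constructed AIPersonality instance named personality, with its model and config loaded; the callback signature mirrors the calls inside add_file, a message string plus an MSG_TYPE value.

# Hypothetical usage; `personality` is assumed to exist and is not built here.
from pathlib import Path
from lollms.types import MSG_TYPE

def on_file_indexed(message, message_type):
    # add_file reports success with MSG_TYPE.MSG_TYPE_INFO
    print(f"[{message_type}] {message}")

ok = personality.add_file(Path("data/report.pdf"), callback=on_file_indexed)
if not ok:
    print("Unsupported file format; nothing was indexed.")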
@@ -1,6 +1,4 @@
from lollms.personality import APScript
from lollms.helpers import ASCIIColors, trace_exception
from lollms.paths import LollmsPaths
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from pathlib import Path
@@ -597,6 +595,24 @@ class TextVectorizer:

 class GenericDataLoader:
+    @staticmethod
+    def read_file(file_path:Path):
+        if file_path.suffix ==".pdf":
+            return GenericDataLoader.read_pdf_file(file_path)
+        elif file_path.suffix == ".txt":
+            return GenericDataLoader.read_text_file(file_path)
+        elif file_path.suffix == ".docx":
+            return GenericDataLoader.read_docx_file(file_path)
+        elif file_path.suffix == ".json":
+            return GenericDataLoader.read_json_file(file_path)
+        elif file_path.suffix == ".html":
+            return GenericDataLoader.read_html_file(file_path)
+        elif file_path.suffix == ".pptx":
+            return GenericDataLoader.read_pptx_file(file_path)
+        else:
+            raise ValueError("Unknown file type")
+    def get_supported_file_types():
+        return ["pdf", "txt", "docx", "json", "html", "pptx"]
     @staticmethod
     def read_pdf_file(file_path):
         try:
             import PyPDF2
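Not part of the commit: a short usage sketch of the read_file dispatch above, assuming GenericDataLoader is importable from lollms.utilities as in the import added to personality.py; the file name is made up.

# Hypothetical usage of GenericDataLoader; ".md" is deliberately outside the dispatch table.
from pathlib import Path
from lollms.utilities import GenericDataLoader

path = Path("notes.md")
try:
    text = GenericDataLoader.read_file(path)
    print(f"Loaded {len(text)} characters from {path.name}")
except ValueError:
    print(f"Unsupported format {path.suffix}; supported: {GenericDataLoader.get_supported_file_types()}")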
@@ -623,52 +639,6 @@ class GenericDataLoader:
         markdown_text = text.replace('\n', ' \n') # Adding double spaces at the end of each line for Markdown line breaks

         return markdown_text
-        """
-
-
-        from io import BytesIO
-        with open(file_path, 'rb') as pdf_file:
-            pdf_reader = PyPDF2.PdfReader(pdf_file)
-            all_text = []
-            for page_num in range(len(pdf_reader.pages)):
-                page = pdf_reader.pages[page_num]
-                if '/Resources' in page and '/XObject' in page['/Resources']:
-                    xObject = page['/Resources']['/XObject']
-                    if xObject is not None:
-                        for obj in xObject:
-                            # Check if the object is an image
-                            if xObject[obj]['/Subtype'] == '/Image':
-                                image_data = xObject[obj].get_object()
-                                image_stream = image_data.get_object()
-                                image_stream_data = image_stream.get_data()
-
-                                try:
-                                    # Extract text from the image using pytesseract
-                                    extracted_text = pytesseract.image_to_string(Image.open(BytesIO(image_stream_data)))
-                                    all_text.append(extracted_text)
-                                except pytesseract.TesseractNotFoundError:
-                                    ASCIIColors.error("Please install tesserract to enable ocr data extraction from your pdf file")
-                                except UnidentifiedImageError:
-                                    # Ignore images that cannot be identified
-                                    pass
-
-            # Extract regular text from the page using PyPDF2's text extraction
-            regular_text = page.extract_text()
-            if regular_text:
-                all_text.append(regular_text)
-
-        return "\n\n".join(all_text)
-        """
-
-        """
-        text = ""
-        with open(file_path, 'rb') as pdf_file:
-            pdf_reader = PyPDF2.PdfReader(pdf_file)
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-
-        return text
-        """

     @staticmethod
     def read_docx_file(file_path):
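Not part of the commit: a self-contained sketch of the plain PyPDF2 extraction that read_pdf_file keeps once the commented-out OCR experiments above are removed; the function name is illustrative, not the repo's exact code.

# Illustrative sketch of PyPDF2-only extraction; not read_pdf_file verbatim.
from pathlib import Path
import PyPDF2

def extract_pdf_markdown(file_path: Path) -> str:
    text = ""
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    # two trailing spaces before each newline give Markdown line breaks
    return text.replace('\n', '  \n')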