mirror of
https://github.com/ParisNeo/lollms-webui.git
synced 2025-02-05 02:19:16 +00:00
152 lines
5.7 KiB
Python
152 lines
5.7 KiB
Python
import sys
|
|
from PyQt5.QtWidgets import (
|
|
QApplication, QMainWindow, QVBoxLayout, QPushButton, QFileDialog, QCheckBox, QLabel, QWidget, QMessageBox
|
|
)
|
|
from pathlib import Path
|
|
import os
|
|
from PyPDF2 import PdfReader
|
|
import docx
|
|
from pptx import Presentation
|
|
|
|
|
|
class TextExtractorApp(QMainWindow):
    """Small PyQt5 window that converts documents in a folder to plain text.

    The user picks a source folder and a target folder. Every ``.txt``,
    ``.md``, ``.pdf``, ``.docx`` and ``.pptx`` file found in the source is
    extracted and written as a UTF-8 ``.txt`` file at the same relative path
    under the target folder.
    """

    def __init__(self):
        super().__init__()
        self.init_ui()

    def init_ui(self):
        """Build the window widgets and reset the selected folders."""
        self.setWindowTitle("Text Extractor")
        self.setGeometry(100, 100, 400, 200)

        # Main layout
        layout = QVBoxLayout()

        # Folder selection buttons and labels
        self.source_folder_label = QLabel("Source Folder: Not Selected")
        self.target_folder_label = QLabel("Target Folder: Not Selected")
        layout.addWidget(self.source_folder_label)
        layout.addWidget(self.target_folder_label)

        self.select_source_button = QPushButton("Select Source Folder")
        self.select_source_button.clicked.connect(self.select_source_folder)
        layout.addWidget(self.select_source_button)

        self.select_target_button = QPushButton("Select Target Folder")
        self.select_target_button.clicked.connect(self.select_target_folder)
        layout.addWidget(self.select_target_button)

        # Subfolder checkbox
        self.subfolder_checkbox = QCheckBox("Include Subfolders")
        layout.addWidget(self.subfolder_checkbox)

        # Start button
        self.start_button = QPushButton("Start Extraction")
        self.start_button.clicked.connect(self.start_extraction)
        layout.addWidget(self.start_button)

        # Set central widget
        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)

        # Folder paths chosen by the user; ``None`` until a folder is picked.
        self.source_folder = None
        self.target_folder = None

    def select_source_folder(self):
        """Ask for the source folder and remember it if one was chosen."""
        folder = QFileDialog.getExistingDirectory(self, "Select Source Folder")
        if folder:
            self.source_folder = Path(folder)
            self.source_folder_label.setText(f"Source Folder: {folder}")

    def select_target_folder(self):
        """Ask for the target folder and remember it if one was chosen."""
        folder = QFileDialog.getExistingDirectory(self, "Select Target Folder")
        if folder:
            self.target_folder = Path(folder)
            self.target_folder_label.setText(f"Target Folder: {folder}")

    def start_extraction(self):
        """Run the extraction for the selected folders and report the outcome.

        Shows a warning when either folder is missing, an error dialog when
        creating or writing an output file fails, and a success dialog
        otherwise.
        """
        if not self.source_folder or not self.target_folder:
            QMessageBox.warning(self, "Error", "Please select both source and target folders.")
            return

        include_subfolders = self.subfolder_checkbox.isChecked()
        try:
            self.extract_text_from_folder(self.source_folder, self.target_folder, include_subfolders)
        except OSError as e:
            # Directory creation or file writing failed (permissions, full
            # disk, ...). Report it instead of letting the exception escape
            # the Qt slot.
            QMessageBox.critical(self, "Error", f"Text extraction failed: {e}")
            return
        QMessageBox.information(self, "Success", "Text extraction completed!")

    def extract_text_from_folder(self, source_folder, target_folder, include_subfolders):
        """Extract every supported file under ``source_folder`` into ``target_folder``.

        Args:
            source_folder: Folder to scan (``Path`` or path string).
            target_folder: Folder receiving the ``.txt`` outputs; the source's
                relative directory structure is reproduced beneath it.
            include_subfolders: When false, only files directly inside
                ``source_folder`` are processed.

        Files whose extraction yields no text (empty or unreadable) are
        skipped. Inputs that differ only by extension (for example
        ``report.pdf`` and ``report.docx``) map to the same output file, so
        the one processed last overwrites the other.

        Raises:
            OSError: If an output directory or file cannot be created/written.
        """
        source_folder = Path(source_folder)
        target_folder = Path(target_folder)

        # Map each supported extension to the method that extracts its text.
        extractors = {
            ".txt": self.extract_text_from_txt,
            ".md": self.extract_text_from_txt,
            ".pdf": self.extract_text_from_pdf,
            ".docx": self.extract_text_from_docx,
            ".pptx": self.extract_text_from_pptx,
        }

        resolved_target = target_folder.resolve()

        # Walk through the folder
        for root, dirs, files in os.walk(source_folder):
            root_path = Path(root)

            # If the target lives inside the source, do not descend into it:
            # otherwise the files written by this run (or a previous one)
            # would be re-extracted into a nested copy of themselves.
            dirs[:] = [d for d in dirs if (root_path / d).resolve() != resolved_target]

            for file in files:
                file_path = root_path / file
                extractor = extractors.get(file_path.suffix.lower())
                if extractor is None:
                    continue

                extracted_text = extractor(file_path)
                if not extracted_text:
                    continue

                # Write extracted text to target folder
                relative_path = file_path.relative_to(source_folder)
                target_file_path = target_folder / relative_path.with_suffix(".txt")
                target_file_path.parent.mkdir(parents=True, exist_ok=True)
                with open(target_file_path, "w", encoding="utf-8") as f:
                    f.write(extracted_text)

            # os.walk yields the top directory first, so stopping after the
            # first iteration restricts processing to the source folder itself.
            if not include_subfolders:
                break

    def extract_text_from_txt(self, file_path):
        """Return the UTF-8 content of a text file, or ``None`` if it cannot be read."""
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        except (OSError, UnicodeDecodeError) as e:
            # Same best-effort policy as the other extractors: one unreadable
            # or non-UTF-8 file must not abort the whole run.
            print(f"Error reading text file {file_path}: {e}")
            return None

    def extract_text_from_pdf(self, file_path):
        """Return the text of all PDF pages joined by newlines, or ``None`` on error."""
        try:
            reader = PdfReader(file_path)
            # Extract each page once; pages without text are dropped.
            page_texts = (page.extract_text() for page in reader.pages)
            return "\n".join(text for text in page_texts if text)
        except Exception as e:
            print(f"Error reading PDF {file_path}: {e}")
            return None

    def extract_text_from_docx(self, file_path):
        """Return the paragraphs of a DOCX file joined by newlines, or ``None`` on error."""
        try:
            doc = docx.Document(file_path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            print(f"Error reading DOCX {file_path}: {e}")
            return None

    def extract_text_from_pptx(self, file_path):
        """Return the text of every text-frame shape in a PPTX file, or ``None`` on error."""
        try:
            presentation = Presentation(file_path)
            text = [
                shape.text
                for slide in presentation.slides
                for shape in slide.shapes
                if shape.has_text_frame
            ]
            return "\n".join(text)
        except Exception as e:
            print(f"Error reading PPTX {file_path}: {e}")
            return None
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the extractor
    # window, and exit with the event loop's return code.
    qt_app = QApplication(sys.argv)
    main_window = TextExtractorApp()
    main_window.show()
    exit_code = qt_app.exec_()
    sys.exit(exit_code)
|