lollms-webui/endpoints/libraries/lollms_anything_to_markdown.js

158 lines
5.2 KiB
JavaScript
Raw Normal View History

2024-08-19 21:12:58 +00:00
/**
 * Loads a browser `File` and returns its textual content as basic Markdown.
 *
 * Plain-text formats are read directly with `FileReader`. DOCX, PDF and PPTX
 * are delegated to the globals `mammoth`, `pdfjsLib` and `PptxTextExtractor`,
 * which this class expects to be loaded by the surrounding page.
 */
class LollmsFileLoader {
  constructor() {
    // Lower-case extensions (without the dot) accepted by loadFile().
    this.supportedExtensions = [
      'txt', 'md', 'markdown', 'rtf', 'log', 'csv', 'json', 'xml',
      'html', 'htm', 'css', 'js', 'py', 'java', 'c', 'cpp',
      'docx', 'pdf', 'pptx'
    ];
  }

  /**
   * Reads `file`, picking the reader from its extension, and converts the
   * extracted text to Markdown.
   *
   * @param {File} file - Object exposing a `name`; its last dot-separated
   *   segment (case-insensitive) is used as the extension.
   * @returns {Promise<string>} The Markdown produced by convertToMarkdown().
   * @throws {Error} 'Unsupported file type' when the extension is not listed
   *   in `supportedExtensions`; reader failures are propagated as rejections.
   */
  async loadFile(file) {
    const fileExtension = file.name.split('.').pop().toLowerCase();
    if (!this.supportedExtensions.includes(fileExtension)) {
      throw new Error('Unsupported file type');
    }
    let content = '';
    switch (fileExtension) {
      case 'docx':
        content = await this.readDocxFile(file);
        break;
      case 'pdf':
        content = await this.readPdfFile(file);
        break;
      case 'pptx':
        content = await this.readPptxFile(file);
        break;
      default:
        content = await this.readTextFile(file);
    }
    return this.convertToMarkdown(content, fileExtension);
  }

  /**
   * Reads the file as text via `FileReader.readAsText`.
   *
   * @param {Blob} file
   * @returns {Promise<string>} Resolves with the reader's result; rejects with
   *   `reader.error` (or the raw error event when that is unavailable).
   */
  readTextFile(file) {
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      reader.onload = () => resolve(reader.result);
      // Prefer reader.error so callers get an object with a usable `.message`.
      reader.onerror = (e) => reject(reader.error || e);
      reader.readAsText(file);
    });
  }

  /**
   * Reads the file via `FileReader.readAsArrayBuffer`. Shared by the DOCX,
   * PDF and PPTX readers.
   *
   * @param {Blob} file
   * @returns {Promise<ArrayBuffer>} Resolves with the reader's result; rejects
   *   with `reader.error` (or the raw error event when that is unavailable).
   */
  readFileAsArrayBuffer(file) {
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      reader.onload = () => resolve(reader.result);
      reader.onerror = (e) => reject(reader.error || e);
      reader.readAsArrayBuffer(file);
    });
  }

  /**
   * Extracts raw text from a DOCX file using the global `mammoth`.
   *
   * Written with async/await so that any failure after the file is read
   * (including `mammoth` not being loaded) rejects the returned promise
   * instead of leaving it pending.
   *
   * @param {Blob} file
   * @returns {Promise<string>} The `value` of mammoth's extractRawText result.
   */
  async readDocxFile(file) {
    const buffer = await this.readFileAsArrayBuffer(file);
    const result = await mammoth.extractRawText({ arrayBuffer: buffer });
    return result.value;
  }

  /**
   * Extracts text from a PDF using the global `pdfjsLib`.
   *
   * The `str` of each page's text items are joined with single spaces, and
   * pages are separated by a blank line.
   *
   * @param {Blob} file
   * @returns {Promise<string>} Trimmed text of all pages.
   */
  async readPdfFile(file) {
    const buffer = await this.readFileAsArrayBuffer(file);
    try {
      const pdf = await pdfjsLib.getDocument({ data: buffer }).promise;
      const pages = [];
      // Pages are requested 1-based, from 1 through pdf.numPages.
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const textContent = await page.getTextContent();
        pages.push(textContent.items.map((item) => item.str).join(' '));
      }
      return pages.join('\n\n').trim();
    } catch (error) {
      console.error("Error processing PDF:", error);
      throw error;
    }
  }

  /**
   * Extracts text from a PPTX using the global `PptxTextExtractor`.
   *
   * @param {Blob} file
   * @returns {Promise<string>} The entries returned by `extractText`, joined
   *   with newlines.
   * @throws {Error} 'Unable to process PPTX file. …' wrapping any extraction
   *   failure, including the extractor global being undefined.
   */
  async readPptxFile(file) {
    const buffer = await this.readFileAsArrayBuffer(file);
    try {
      if (typeof PptxTextExtractor === 'undefined') {
        throw new Error('PptxTextExtractor is not defined. The library might not be loaded correctly.');
      }
      const text = await PptxTextExtractor.extractText(buffer);
      return text.join('\n');
    } catch (error) {
      console.error('Error extracting text from PPTX:', error);
      throw new Error('Unable to process PPTX file. ' + error.message);
    }
  }

  /**
   * Performs a basic line-by-line conversion of plain text to Markdown.
   *
   * Outside fenced code blocks each line is trimmed, then:
   *   - blank lines become a single newline,
   *   - lines starting with '#' are kept as headings,
   *   - lines starting with a bullet glyph become '- ' list items,
   *   - everything else becomes its own paragraph (followed by a blank line).
   * Lines between ``` fences are emitted verbatim (only trailing whitespace is
   * removed), so code indentation and bullet characters inside code survive.
   *
   * @param {string} content - Text to convert; null/undefined is treated as ''.
   * @param {string} fileExtension - Currently unused; kept for callers.
   * @returns {string} The converted Markdown, trimmed at both ends.
   */
  convertToMarkdown(content, fileExtension) {
    const bulletPattern = /^[\u2022\u25E6\u25AA\u25AB\u25CF\u25CB\u25A0\u25A1]/;
    const text = content == null ? '' : String(content);
    let markdown = '';
    let inCodeBlock = false;

    for (const rawLine of text.split('\n')) {
      const line = rawLine.trim();

      // Fences are detected first so they toggle code mode in either state.
      if (line.startsWith('```')) {
        inCodeBlock = !inCodeBlock;
        markdown += line + '\n';
        continue;
      }

      if (inCodeBlock) {
        // Keep leading indentation; trimEnd also drops a trailing '\r'.
        markdown += rawLine.trimEnd() + '\n';
        continue;
      }

      if (line === '') {
        markdown += '\n';
      } else if (line.startsWith('#')) {
        markdown += line + '\n';
      } else if (bulletPattern.test(line)) {
        markdown += '- ' + line.substring(1).trim() + '\n';
      } else {
        markdown += line + '\n\n';
      }
    }

    return markdown.trim();
  }
}
// Usage example:
/**
 * Change-event handler for a file input: converts the first selected file to
 * Markdown with LollmsFileLoader and logs the result.
 *
 * Does nothing when no file is selected. Failures are logged to the console
 * and shown to the user through `alert`.
 *
 * @param {Event} event - Event whose `target.files` holds the selection.
 * @returns {Promise<void>}
 */
async function handleFileUpload(event) {
  const selectedFile = event.target.files[0];
  if (selectedFile) {
    const loader = new LollmsFileLoader();
    try {
      const convertedText = await loader.loadFile(selectedFile);
      console.log(convertedText);
      // The Markdown content is now available for further use.
    } catch (err) {
      console.error('Error processing file:', err);
      alert('Error processing file: ' + err.message);
    }
  }
}