2024-08-19 21:12:58 +00:00
|
|
|
class LollmsFileLoader {
|
|
|
|
constructor() {
|
|
|
|
this.supportedExtensions = [
|
|
|
|
'txt', 'md', 'markdown', 'rtf', 'log', 'csv', 'json', 'xml',
|
|
|
|
'html', 'htm', 'css', 'js', 'py', 'java', 'c', 'cpp',
|
|
|
|
'docx', 'pdf', 'pptx'
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
async loadFile(file) {
|
|
|
|
const fileExtension = file.name.split('.').pop().toLowerCase();
|
|
|
|
if (!this.supportedExtensions.includes(fileExtension)) {
|
|
|
|
throw new Error('Unsupported file type');
|
|
|
|
}
|
|
|
|
|
|
|
|
let content = '';
|
|
|
|
switch (fileExtension) {
|
|
|
|
case 'docx':
|
|
|
|
content = await this.readDocxFile(file);
|
|
|
|
break;
|
|
|
|
case 'pdf':
|
|
|
|
content = await this.readPdfFile(file);
|
|
|
|
break;
|
|
|
|
case 'pptx':
|
|
|
|
content = await this.readPptxFile(file);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
content = await this.readTextFile(file);
|
|
|
|
}
|
|
|
|
|
|
|
|
return this.convertToMarkdown(content, fileExtension);
|
|
|
|
}
|
|
|
|
|
|
|
|
readTextFile(file) {
|
|
|
|
return new Promise((resolve, reject) => {
|
|
|
|
const reader = new FileReader();
|
|
|
|
reader.onload = (e) => resolve(e.target.result);
|
|
|
|
reader.onerror = (e) => reject(e);
|
|
|
|
reader.readAsText(file);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
readDocxFile(file) {
|
|
|
|
return new Promise((resolve, reject) => {
|
|
|
|
const reader = new FileReader();
|
|
|
|
reader.onload = function(e) {
|
|
|
|
mammoth.extractRawText({arrayBuffer: e.target.result})
|
|
|
|
.then(result => resolve(result.value))
|
|
|
|
.catch(reject);
|
|
|
|
};
|
|
|
|
reader.onerror = (e) => reject(e);
|
|
|
|
reader.readAsArrayBuffer(file);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
async readPdfFile(file) {
|
|
|
|
return new Promise((resolve, reject) => {
|
|
|
|
const reader = new FileReader();
|
|
|
|
reader.onload = async function(e) {
|
|
|
|
try {
|
|
|
|
const pdf = await pdfjsLib.getDocument({data: e.target.result}).promise;
|
|
|
|
let content = '';
|
|
|
|
|
|
|
|
for (let i = 1; i <= pdf.numPages; i++) {
|
|
|
|
const page = await pdf.getPage(i);
|
|
|
|
const textContent = await page.getTextContent();
|
|
|
|
content += textContent.items.map(item => item.str).join(' ') + '\n\n';
|
|
|
|
}
|
|
|
|
|
|
|
|
resolve(content.trim());
|
|
|
|
} catch (error) {
|
|
|
|
console.error("Error processing PDF:", error);
|
|
|
|
reject(error);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
reader.onerror = (e) => reject(e);
|
|
|
|
reader.readAsArrayBuffer(file);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
readPptxFile(file) {
|
|
|
|
return new Promise((resolve, reject) => {
|
|
|
|
const reader = new FileReader();
|
|
|
|
reader.onload = async function(e) {
|
|
|
|
try {
|
2024-09-20 14:49:30 +00:00
|
|
|
const arrayBuffer = e.target.result;
|
|
|
|
const pptx2json = new PPTX2Json();
|
|
|
|
|
|
|
|
// Create a Blob from the ArrayBuffer
|
|
|
|
const blob = new Blob([arrayBuffer], { type: file.type });
|
|
|
|
|
|
|
|
// Create a temporary URL for the Blob
|
|
|
|
const url = URL.createObjectURL(blob);
|
|
|
|
|
|
|
|
// Use the URL with toJson
|
|
|
|
const result = await pptx2json.toJson(url);
|
|
|
|
|
|
|
|
let text = '';
|
|
|
|
result.slides.forEach((slide, index) => {
|
|
|
|
text += `Slide ${index + 1}:\n`;
|
|
|
|
slide.data.forEach(item => {
|
|
|
|
if (item.type === 'text') {
|
|
|
|
text += item.text + '\n';
|
|
|
|
}
|
|
|
|
});
|
|
|
|
text += '\n';
|
|
|
|
});
|
|
|
|
|
|
|
|
// Clean up the temporary URL
|
|
|
|
URL.revokeObjectURL(url);
|
|
|
|
|
|
|
|
resolve(text);
|
2024-08-19 21:12:58 +00:00
|
|
|
} catch (error) {
|
|
|
|
console.error('Error extracting text from PPTX:', error);
|
|
|
|
reject(new Error('Unable to process PPTX file. ' + error.message));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
reader.onerror = (e) => reject(e);
|
|
|
|
reader.readAsArrayBuffer(file);
|
|
|
|
});
|
|
|
|
}
|
2024-09-20 14:49:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-08-19 21:12:58 +00:00
|
|
|
convertToMarkdown(content, fileExtension) {
|
|
|
|
// Basic conversion to markdown
|
|
|
|
// This can be extended for more sophisticated conversions
|
|
|
|
let markdown = '';
|
|
|
|
|
|
|
|
const lines = content.split('\n');
|
|
|
|
let inCodeBlock = false;
|
|
|
|
|
|
|
|
for (let line of lines) {
|
|
|
|
line = line.trim();
|
|
|
|
if (line === '') {
|
|
|
|
markdown += '\n';
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check for headings
|
|
|
|
if (line.startsWith('#')) {
|
|
|
|
markdown += line + '\n';
|
|
|
|
}
|
|
|
|
// Check for lists
|
|
|
|
else if (line.match(/^[\u2022\u25E6\u25AA\u25AB\u25CF\u25CB\u25A0\u25A1]/)) {
|
|
|
|
markdown += '- ' + line.substring(1).trim() + '\n';
|
|
|
|
}
|
|
|
|
// Check for code blocks
|
|
|
|
else if (line.startsWith('```')) {
|
|
|
|
inCodeBlock = !inCodeBlock;
|
|
|
|
markdown += line + '\n';
|
|
|
|
}
|
|
|
|
// Regular text
|
|
|
|
else {
|
|
|
|
if (inCodeBlock) {
|
|
|
|
markdown += line + '\n';
|
|
|
|
} else {
|
|
|
|
markdown += line + '\n\n';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return markdown.trim();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Usage example:
|
|
|
|
async function handleFileUpload(event) {
|
|
|
|
const file = event.target.files[0];
|
|
|
|
if (!file) return;
|
|
|
|
|
|
|
|
const fileLoader = new LollmsFileLoader();
|
|
|
|
try {
|
|
|
|
const markdown = await fileLoader.loadFile(file);
|
|
|
|
console.log(markdown);
|
|
|
|
// You can now use the markdown content as needed
|
|
|
|
} catch (error) {
|
|
|
|
console.error('Error processing file:', error);
|
|
|
|
alert('Error processing file: ' + error.message);
|
|
|
|
}
|
|
|
|
}
|