lollms-webui/endpoints/libraries/lollms_anything_to_markdown.js

/**
 * Loads a browser `File` and converts its textual content to Markdown.
 *
 * Plain-text formats are read directly with `FileReader`. The binary formats
 * rely on names that must already be in scope when the corresponding reader
 * is called: `mammoth` (docx), `pdfjsLib` (pdf) and `PPTX2Json` (pptx).
 */
class LollmsFileLoader {
    constructor() {
        // Lower-case extensions (without the dot) accepted by loadFile().
        this.supportedExtensions = [
            'txt', 'md', 'markdown', 'rtf', 'log', 'csv', 'json', 'xml',
            'html', 'htm', 'css', 'js', 'py', 'java', 'c', 'cpp',
            'docx', 'pdf', 'pptx'
        ];
    }

    /**
     * Reads `file` with the reader matching its extension and returns Markdown.
     *
     * @param {File} file - Object exposing at least a `name`; the readers also
     *     pass it to `FileReader`.
     * @returns {Promise<string>} The converted Markdown text.
     * @throws {Error} 'Unsupported file type' when the extension is not in
     *     `supportedExtensions`.
     */
    async loadFile(file) {
        const fileExtension = file.name.split('.').pop().toLowerCase();
        if (!this.supportedExtensions.includes(fileExtension)) {
            throw new Error('Unsupported file type');
        }

        let content = '';
        switch (fileExtension) {
            case 'docx':
                content = await this.readDocxFile(file);
                break;
            case 'pdf':
                content = await this.readPdfFile(file);
                break;
            case 'pptx':
                content = await this.readPptxFile(file);
                break;
            default:
                content = await this.readTextFile(file);
        }

        return this.convertToMarkdown(content, fileExtension);
    }

    /**
     * Promise wrapper around a single `FileReader` read.
     *
     * @param {Blob} file - The blob/file to read.
     * @param {string} method - Name of the FileReader method to invoke,
     *     e.g. 'readAsText' or 'readAsArrayBuffer'.
     * @returns {Promise<string|ArrayBuffer>} The reader's result.
     */
    readWithFileReader(file, method) {
        return new Promise((resolve, reject) => {
            const reader = new FileReader();
            reader.onload = () => resolve(reader.result);
            // Reject with the reader's error object when available so callers
            // get a usable `.message`; fall back to the raw event otherwise.
            reader.onerror = (e) => reject(reader.error || e);
            reader[method](file);
        });
    }

    /**
     * Reads the file as text.
     *
     * @param {Blob} file
     * @returns {Promise<string>}
     */
    readTextFile(file) {
        return this.readWithFileReader(file, 'readAsText');
    }

    /**
     * Extracts the raw text of a .docx file via `mammoth.extractRawText`.
     *
     * @param {Blob} file
     * @returns {Promise<string>} The `value` field of mammoth's result.
     */
    async readDocxFile(file) {
        const arrayBuffer = await this.readWithFileReader(file, 'readAsArrayBuffer');
        const result = await mammoth.extractRawText({ arrayBuffer });
        return result.value;
    }

    /**
     * Extracts the text of every page of a PDF via `pdfjsLib`.
     *
     * Text items of a page are joined with spaces; pages are separated by a
     * blank line.
     *
     * @param {Blob} file
     * @returns {Promise<string>} Trimmed text of all pages.
     */
    async readPdfFile(file) {
        const data = await this.readWithFileReader(file, 'readAsArrayBuffer');
        try {
            const pdf = await pdfjsLib.getDocument({ data }).promise;
            let content = '';

            // pdf.js page numbers are used 1-based here, up to numPages inclusive.
            for (let i = 1; i <= pdf.numPages; i++) {
                const page = await pdf.getPage(i);
                const textContent = await page.getTextContent();
                content += textContent.items.map(item => item.str).join(' ') + '\n\n';
            }

            return content.trim();
        } catch (error) {
            console.error("Error processing PDF:", error);
            throw error;
        }
    }

    /**
     * Extracts the text items of each slide of a .pptx file via `PPTX2Json`.
     *
     * The file bytes are exposed to the converter through a temporary object
     * URL, which is always revoked afterwards, including on failure.
     *
     * @param {File} file
     * @returns {Promise<string>} One "Slide N:" section per slide.
     * @throws {Error} 'Unable to process PPTX file. …' when conversion fails.
     */
    async readPptxFile(file) {
        const arrayBuffer = await this.readWithFileReader(file, 'readAsArrayBuffer');
        let url;
        try {
            const pptx2json = new PPTX2Json();

            // Wrap the bytes in a Blob and hand the converter a temporary URL.
            const blob = new Blob([arrayBuffer], { type: file.type });
            url = URL.createObjectURL(blob);

            const result = await pptx2json.toJson(url);

            let text = '';
            result.slides.forEach((slide, index) => {
                text += `Slide ${index + 1}:\n`;
                slide.data.forEach(item => {
                    if (item.type === 'text') {
                        text += item.text + '\n';
                    }
                });
                text += '\n';
            });

            return text;
        } catch (error) {
            console.error('Error extracting text from PPTX:', error);
            throw new Error('Unable to process PPTX file. ' + error.message);
        } finally {
            // Release the blob URL on both success and failure paths.
            if (url !== undefined) {
                URL.revokeObjectURL(url);
            }
        }
    }

    /**
     * Performs a basic line-by-line conversion of text to Markdown.
     *
     * Outside fenced code blocks each line is trimmed; lines starting with '#'
     * are kept as headings, lines starting with a bullet glyph become '- '
     * list items, and any other non-empty line becomes its own paragraph.
     * Inside a fenced code block lines are emitted as-is (only trailing
     * whitespace is removed) so that code indentation survives.
     *
     * @param {string} content - Text to convert.
     * @param {string} fileExtension - Source extension; currently unused.
     * @returns {string} The Markdown text, trimmed at both ends.
     */
    convertToMarkdown(content, fileExtension) {
        const bulletPattern = /^[\u2022\u25E6\u25AA\u25AB\u25CF\u25CB\u25A0\u25A1]/;
        let markdown = '';
        let inCodeBlock = false;

        for (const rawLine of content.split('\n')) {
            const line = rawLine.trim();

            // Fence lines toggle code-block mode and are emitted trimmed.
            if (line.startsWith('```')) {
                inCodeBlock = !inCodeBlock;
                markdown += line + '\n';
                continue;
            }

            // Inside a fence: keep leading indentation, strip only trailing
            // whitespace (which also removes a '\r' from CRLF input).
            if (inCodeBlock) {
                markdown += rawLine.replace(/\s+$/, '') + '\n';
                continue;
            }

            if (line === '') {
                markdown += '\n';
                continue;
            }

            if (line.startsWith('#')) {
                // Heading
                markdown += line + '\n';
            } else if (bulletPattern.test(line)) {
                // Bullet glyph -> Markdown list item
                markdown += '- ' + line.substring(1).trim() + '\n';
            } else {
                // Regular text: separate paragraphs with a blank line
                markdown += line + '\n\n';
            }
        }

        return markdown.trim();
    }
}

// Usage example:
/**
 * Change-event handler for a file input: converts the first selected file to
 * Markdown with LollmsFileLoader and logs the result.
 *
 * Does nothing when no file is selected. On failure the error is logged and
 * shown to the user through `alert`.
 *
 * @param {Event} event - Event whose `target.files` holds the selection.
 * @returns {Promise<void>}
 */
async function handleFileUpload(event) {
    const { files } = event.target;
    const selectedFile = files[0];

    if (selectedFile) {
        const loader = new LollmsFileLoader();
        try {
            const result = await loader.loadFile(selectedFile);
            console.log(result);
            // The Markdown content is available here for further use
        } catch (error) {
            console.error('Error processing file:', error);
            alert(`Error processing file: ${error.message}`);
        }
    }
}
added new libraries 2024-08-19 21:12:58 +00:00			`class LollmsFileLoader {`
			`constructor() {`
			`this.supportedExtensions = [`
			`'txt', 'md', 'markdown', 'rtf', 'log', 'csv', 'json', 'xml',`
			`'html', 'htm', 'css', 'js', 'py', 'java', 'c', 'cpp',`
			`'docx', 'pdf', 'pptx'`
			`];`
			`}`

			`async loadFile(file) {`
			`const fileExtension = file.name.split('.').pop().toLowerCase();`
			`if (!this.supportedExtensions.includes(fileExtension)) {`
			`throw new Error('Unsupported file type');`
			`}`

			`let content = '';`
			`switch (fileExtension) {`
			`case 'docx':`
			`content = await this.readDocxFile(file);`
			`break;`
			`case 'pdf':`
			`content = await this.readPdfFile(file);`
			`break;`
			`case 'pptx':`
			`content = await this.readPptxFile(file);`
			`break;`
			`default:`
			`content = await this.readTextFile(file);`
			`}`

			`return this.convertToMarkdown(content, fileExtension);`
			`}`

			`readTextFile(file) {`
			`return new Promise((resolve, reject) => {`
			`const reader = new FileReader();`
			`reader.onload = (e) => resolve(e.target.result);`
			`reader.onerror = (e) => reject(e);`
			`reader.readAsText(file);`
			`});`
			`}`

			`readDocxFile(file) {`
			`return new Promise((resolve, reject) => {`
			`const reader = new FileReader();`
			`reader.onload = function(e) {`
			`mammoth.extractRawText({arrayBuffer: e.target.result})`
			`.then(result => resolve(result.value))`
			`.catch(reject);`
			`};`
			`reader.onerror = (e) => reject(e);`
			`reader.readAsArrayBuffer(file);`
			`});`
			`}`

			`async readPdfFile(file) {`
			`return new Promise((resolve, reject) => {`
			`const reader = new FileReader();`
			`reader.onload = async function(e) {`
			`try {`
			`const pdf = await pdfjsLib.getDocument({data: e.target.result}).promise;`
			`let content = '';`

			`for (let i = 1; i <= pdf.numPages; i++) {`
			`const page = await pdf.getPage(i);`
			`const textContent = await page.getTextContent();`
			`content += textContent.items.map(item => item.str).join(' ') + '\n\n';`
			`}`

			`resolve(content.trim());`
			`} catch (error) {`
			`console.error("Error processing PDF:", error);`
			`reject(error);`
			`}`
			`};`
			`reader.onerror = (e) => reject(e);`
			`reader.readAsArrayBuffer(file);`
			`});`
			`}`
			`readPptxFile(file) {`
			`return new Promise((resolve, reject) => {`
			`const reader = new FileReader();`
			`reader.onload = async function(e) {`
			`try {`
Update lollms_anything_to_markdown.js 2024-09-20 14:49:30 +00:00			`const arrayBuffer = e.target.result;`
			`const pptx2json = new PPTX2Json();`

			`// Create a Blob from the ArrayBuffer`
			`const blob = new Blob([arrayBuffer], { type: file.type });`

			`// Create a temporary URL for the Blob`
			`const url = URL.createObjectURL(blob);`

			`// Use the URL with toJson`
			`const result = await pptx2json.toJson(url);`

			`let text = '';`
			`result.slides.forEach((slide, index) => {`
			text += `Slide ${index + 1}:\n`;
			`slide.data.forEach(item => {`
			`if (item.type === 'text') {`
			`text += item.text + '\n';`
			`}`
			`});`
			`text += '\n';`
			`});`

			`// Clean up the temporary URL`
			`URL.revokeObjectURL(url);`

			`resolve(text);`
added new libraries 2024-08-19 21:12:58 +00:00			`} catch (error) {`
			`console.error('Error extracting text from PPTX:', error);`
			`reject(new Error('Unable to process PPTX file. ' + error.message));`
			`}`
			`};`
			`reader.onerror = (e) => reject(e);`
			`reader.readAsArrayBuffer(file);`
			`});`
			`}`
Update lollms_anything_to_markdown.js 2024-09-20 14:49:30 +00:00



added new libraries 2024-08-19 21:12:58 +00:00			`convertToMarkdown(content, fileExtension) {`
			`// Basic conversion to markdown`
			`// This can be extended for more sophisticated conversions`
			`let markdown = '';`

			`const lines = content.split('\n');`
			`let inCodeBlock = false;`

			`for (let line of lines) {`
			`line = line.trim();`
			`if (line === '') {`
			`markdown += '\n';`
			`continue;`
			`}`

			`// Check for headings`
			`if (line.startsWith('#')) {`
			`markdown += line + '\n';`
			`}`
			`// Check for lists`
			`else if (line.match(/^[\u2022\u25E6\u25AA\u25AB\u25CF\u25CB\u25A0\u25A1]/)) {`
			`markdown += '- ' + line.substring(1).trim() + '\n';`
			`}`
			`// Check for code blocks`
			else if (line.startsWith('```')) {
			`inCodeBlock = !inCodeBlock;`
			`markdown += line + '\n';`
			`}`
			`// Regular text`
			`else {`
			`if (inCodeBlock) {`
			`markdown += line + '\n';`
			`} else {`
			`markdown += line + '\n\n';`
			`}`
			`}`
			`}`

			`return markdown.trim();`
			`}`
			`}`

			`// Usage example:`
			`async function handleFileUpload(event) {`
			`const file = event.target.files[0];`
			`if (!file) return;`

			`const fileLoader = new LollmsFileLoader();`
			`try {`
			`const markdown = await fileLoader.loadFile(file);`
			`console.log(markdown);`
			`// You can now use the markdown content as needed`
			`} catch (error) {`
			`console.error('Error processing file:', error);`
			`alert('Error processing file: ' + error.message);`
			`}`
			`}`