diff --git a/app.py b/app.py index 230a8600..d230c3e9 100644 --- a/app.py +++ b/app.py @@ -89,7 +89,7 @@ from lollms.app import LollmsApplication from lollms.paths import LollmsPaths from lollms.main_config import LOLLMSConfig from lollms.utilities import trace_exception -from lollms.security import sanitize_path, MultipartBoundaryCheck +from lollms.security import sanitize_path from lollms_webui import LOLLMSWebUI from pathlib import Path from ascii_colors import ASCIIColors @@ -124,9 +124,12 @@ def get_ip_addresses(): app = FastAPI(title="LoLLMS", description="This is the LoLLMS-Webui API documentation") -# Add the MultipartBoundaryCheck middleware -app.add_middleware(MultipartBoundaryCheck) - +try: + from lollms.security import MultipartBoundaryCheck + # Add the MultipartBoundaryCheck middleware + app.add_middleware(MultipartBoundaryCheck) +except Exception as ex: + print(f"Couldn't activate MultipartBoundaryCheck: {ex}") #app.mount("/socket.io", StaticFiles(directory="path/to/socketio.js")) diff --git a/endpoints/libraries/lollms_client_js.js b/endpoints/libraries/lollms_client_js.js index abeaca7a..89d6ad91 100644 --- a/endpoints/libraries/lollms_client_js.js +++ b/endpoints/libraries/lollms_client_js.js @@ -818,6 +818,108 @@ async listModels(host_address = this.host_address) { } } +class TextChunker { + constructor(chunkSize = 512, overlap = 0, tokenizer = null, model = null) { + this.chunkSize = chunkSize; + this.overlap = overlap; + this.tokenizer = tokenizer || new TikTokenTokenizer(); + this.model = model; + } + + getTextChunks(text, doc, cleanChunk = true, minNbTokensInChunk = 10) { + const paragraphs = text.split('\n\n'); + const chunks = []; + let currentChunk = []; + let currentTokens = 0; + let chunkId = 0; + + for (const paragraph of paragraphs) { + const cleanedParagraph = cleanChunk ? 
paragraph.trim() : paragraph; + const paragraphTokens = this.tokenizer.tokenize(cleanedParagraph).length; + + if (currentTokens + paragraphTokens > this.chunkSize) { + if (currentTokens > minNbTokensInChunk) { + let chunkText = currentChunk.join('\n\n'); + if (cleanChunk) { + chunkText = TextChunker.removeUnnecessaryReturns(chunkText); + } + const chunk = new Chunk(doc, '', chunkText, currentTokens, chunkId); + chunkId++; + chunks.push(chunk); + } + + if (this.overlap > 0) { + currentChunk = [...currentChunk.slice(-this.overlap), cleanedParagraph]; + } else { + currentChunk = [cleanedParagraph]; + } + currentTokens = currentChunk.reduce((sum, p) => sum + this.tokenizer.tokenize(p).length, 0); + } else { + currentChunk.push(cleanedParagraph); + currentTokens += paragraphTokens; + } + } + + if (currentChunk.length > 0 && currentTokens > minNbTokensInChunk) { + let chunkText = currentChunk.join('\n\n'); + if (cleanChunk) { + chunkText = TextChunker.removeUnnecessaryReturns(chunkText); + } + const chunk = new Chunk(doc, '', chunkText, currentTokens, chunkId); + chunks.push(chunk); + } + + return chunks; + } + + static removeUnnecessaryReturns(paragraph) { + const lines = paragraph.split('\n'); + return lines.filter(line => line.trim()).join('\n'); + } + + static chunkText(text, tokenizer, chunkSize = 512, overlap = 0, cleanChunk = true, minNbTokensInChunk = 10) { + const paragraphs = text.split('\n\n'); + const chunks = []; + let currentChunk = []; + let currentTokens = 0; + + for (const paragraph of paragraphs) { + const cleanedParagraph = cleanChunk ? 
paragraph.trim() : paragraph; + const paragraphTokens = tokenizer.tokenize(cleanedParagraph).length; + + if (currentTokens + paragraphTokens > chunkSize) { + if (currentTokens > minNbTokensInChunk) { + let chunkText = currentChunk.join('\n\n'); + if (cleanChunk) { + chunkText = TextChunker.removeUnnecessaryReturns(chunkText); + } + chunks.push(chunkText); + } + + if (overlap > 0) { + currentChunk = [...currentChunk.slice(-overlap), cleanedParagraph]; + } else { + currentChunk = [cleanedParagraph]; + } + currentTokens = currentChunk.reduce((sum, p) => sum + tokenizer.tokenize(p).length, 0); + } else { + currentChunk.push(cleanedParagraph); + currentTokens += paragraphTokens; + } + } + + if (currentChunk.length > 0 && currentTokens > minNbTokensInChunk) { + let chunkText = currentChunk.join('\n\n'); + if (cleanChunk) { + chunkText = TextChunker.removeUnnecessaryReturns(chunkText); + } + chunks.push(chunkText); + } + + return chunks; + } +} + class TasksLibrary { constructor(lollms) { this.lollms = lollms; @@ -1055,7 +1157,11 @@ async sequentialChunksSummary({ return summaries.join("\n"); } -// Placeholder methods for stepStart, stepEnd, fastGen +// Placeholder methods for step, stepStart, stepEnd, fastGen +step(message) { + console.log(message); +} + stepStart(message) { console.log(message); }