Saifeddine ALOUI 2024-09-12 19:28:51 +02:00
commit e01907721b
2 changed files with 114 additions and 5 deletions

app.py

@@ -89,7 +89,7 @@ from lollms.app import LollmsApplication
 from lollms.paths import LollmsPaths
 from lollms.main_config import LOLLMSConfig
 from lollms.utilities import trace_exception
-from lollms.security import sanitize_path, MultipartBoundaryCheck
+from lollms.security import sanitize_path
 from lollms_webui import LOLLMSWebUI
 from pathlib import Path
 from ascii_colors import ASCIIColors
@@ -124,9 +124,12 @@ def get_ip_addresses():
 app = FastAPI(title="LoLLMS", description="This is the LoLLMS-Webui API documentation")
-# Add the MultipartBoundaryCheck middleware
-app.add_middleware(MultipartBoundaryCheck)
+try:
+    from lollms.security import MultipartBoundaryCheck
+    # Add the MultipartBoundaryCheck middleware
+    app.add_middleware(MultipartBoundaryCheck)
+except:
+    print("Couldn't activate MultipartBoundaryCheck")
 #app.mount("/socket.io", StaticFiles(directory="path/to/socketio.js"))


@@ -818,6 +818,108 @@ async listModels(host_address = this.host_address) {
     }
 }
+class TextChunker {
+    constructor(chunkSize = 512, overlap = 0, tokenizer = null, model = null) {
+        this.chunkSize = chunkSize;
+        this.overlap = overlap;
+        this.tokenizer = tokenizer || new TikTokenTokenizer();
+        this.model = model;
+    }
+
+    getTextChunks(text, doc, cleanChunk = true, minNbTokensInChunk = 10) {
+        const paragraphs = text.split('\n\n');
+        const chunks = [];
+        let currentChunk = [];
+        let currentTokens = 0;
+        let chunkId = 0;
+
+        for (const paragraph of paragraphs) {
+            const cleanedParagraph = cleanChunk ? paragraph.trim() : paragraph;
+            const paragraphTokens = this.tokenizer.tokenize(cleanedParagraph).length;
+            if (currentTokens + paragraphTokens > this.chunkSize) {
+                if (currentTokens > minNbTokensInChunk) {
+                    let chunkText = currentChunk.join('\n\n');
+                    if (cleanChunk) {
+                        chunkText = TextChunker.removeUnnecessaryReturns(chunkText);
+                    }
+                    const chunk = new Chunk(doc, '', chunkText, currentTokens, chunkId);
+                    chunkId++;
+                    chunks.push(chunk);
+                }
+                if (this.overlap > 0) {
+                    currentChunk = [...currentChunk.slice(-this.overlap), cleanedParagraph];
+                } else {
+                    currentChunk = [cleanedParagraph];
+                }
+                currentTokens = currentChunk.reduce((sum, p) => sum + this.tokenizer.tokenize(p).length, 0);
+            } else {
+                currentChunk.push(cleanedParagraph);
+                currentTokens += paragraphTokens;
+            }
+        }
+
+        if (currentChunk.length > 0 && currentTokens > minNbTokensInChunk) {
+            let chunkText = currentChunk.join('\n\n');
+            if (cleanChunk) {
+                chunkText = TextChunker.removeUnnecessaryReturns(chunkText);
+            }
+            const chunk = new Chunk(doc, '', chunkText, currentTokens, chunkId);
+            chunks.push(chunk);
+        }
+
+        return chunks;
+    }
+
+    static removeUnnecessaryReturns(paragraph) {
+        const lines = paragraph.split('\n');
+        return lines.filter(line => line.trim()).join('\n');
+    }
+
+    static chunkText(text, tokenizer, chunkSize = 512, overlap = 0, cleanChunk = true, minNbTokensInChunk = 10) {
+        const paragraphs = text.split('\n\n');
+        const chunks = [];
+        let currentChunk = [];
+        let currentTokens = 0;
+
+        for (const paragraph of paragraphs) {
+            const cleanedParagraph = cleanChunk ? paragraph.trim() : paragraph;
+            const paragraphTokens = tokenizer.tokenize(cleanedParagraph).length;
+            if (currentTokens + paragraphTokens > chunkSize) {
+                if (currentTokens > minNbTokensInChunk) {
+                    let chunkText = currentChunk.join('\n\n');
+                    if (cleanChunk) {
+                        chunkText = TextChunker.removeUnnecessaryReturns(chunkText);
+                    }
+                    chunks.push(chunkText);
+                }
+                if (overlap > 0) {
+                    currentChunk = [...currentChunk.slice(-overlap), cleanedParagraph];
+                } else {
+                    currentChunk = [cleanedParagraph];
+                }
+                currentTokens = currentChunk.reduce((sum, p) => sum + tokenizer.tokenize(p).length, 0);
+            } else {
+                currentChunk.push(cleanedParagraph);
+                currentTokens += paragraphTokens;
+            }
+        }
+
+        if (currentChunk.length > 0 && currentTokens > minNbTokensInChunk) {
+            let chunkText = currentChunk.join('\n\n');
+            if (cleanChunk) {
+                chunkText = TextChunker.removeUnnecessaryReturns(chunkText);
+            }
+            chunks.push(chunkText);
+        }
+
+        return chunks;
+    }
+}
 class TasksLibrary {
     constructor(lollms) {
         this.lollms = lollms;
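
Not part of the commit: a minimal usage sketch of the static helper added above. The only assumption it makes is that a tokenizer is any object exposing tokenize(text) and returning an array of tokens, which is exactly how the class uses it; the whitespace tokenizer and the sample text below are toy stand-ins written for this example (the library itself defaults to a TikTokenTokenizer).

// Toy tokenizer for the sketch only; the library defaults to TikTokenTokenizer.
const toyTokenizer = {
    tokenize: (text) => text.split(/\s+/).filter(t => t.length > 0)
};

// Paragraphs are separated by blank lines, since chunkText splits on '\n\n'.
const sampleText = "First paragraph of a long document.\n\n" +
                   "Second paragraph with more details.\n\n" +
                   "Third paragraph wrapping things up.";

// Static form: returns plain strings built from whole paragraphs.
// With a 10-token budget (and chunks kept only above 3 tokens), the three
// paragraphs above come out as two chunks under the toy tokenizer.
const chunks = TextChunker.chunkText(sampleText, toyTokenizer, 10, 0, true, 3);
chunks.forEach((c, i) => console.log(`chunk ${i}:`, c));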
@@ -1055,7 +1157,11 @@ async sequentialChunksSummary({
         return summaries.join("\n");
     }
-    // Placeholder methods for stepStart, stepEnd, fastGen
+    // Placeholder methods for step, stepStart, stepEnd, fastGen
+    step(message) {
+        console.log(message);
+    }
     stepStart(message) {
         console.log(message);
     }
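
Likewise, a hedged sketch of the instance form of the chunker, which returns the library's Chunk objects rather than plain strings. It assumes the Chunk class and TextChunker are in scope (both live in this client library); the doc value is just whatever record the caller wants chunks attached to, and the field shown on it is a placeholder.

// Same toy whitespace tokenizer as above, redefined so this sketch stands alone.
const wsTokenizer = {
    tokenize: (text) => text.split(/\s+/).filter(t => t.length > 0)
};

const text = "Intro paragraph.\n\n" +
             "Body paragraph with several more words in it.\n\n" +
             "Closing paragraph.";

// 8-token budget with a 1-paragraph overlap: when a chunk is emitted, the
// last paragraph is carried into the next chunk so context is kept across
// the boundary.
const chunker = new TextChunker(8, 1, wsTokenizer, null);
const chunkObjects = chunker.getTextChunks(text, { title: "example.txt" }, true, 2);

// Each entry is a Chunk built internally from (doc, '', chunkText, tokenCount, chunkId).
for (const c of chunkObjects) {
    console.log(c);
}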