Mirror of https://github.com/ParisNeo/lollms-webui.git, synced 2024-12-18 20:17:50 +00:00

Commit aefa9475c2 (parent ac8be46a12): Update lollms_client_js.js

The commit raises the default chunk and summary sizes from 512 to 1024 tokens, adds chunk-size validation and verbose logging to TextChunker.chunkText, introduces a splitLargeParagraph helper for paragraphs that exceed the chunk size on their own, and guards the chunk-size computation in summarizeText and smartDataExtraction against a missing this.lollms.ctxSize.
@@ -961,7 +961,7 @@ async listModels(host_address = this.host_address) {
     }
 }
 
 class TextChunker {
-    constructor(chunkSize = 512, overlap = 0, tokenizer = null, model = null) {
+    constructor(chunkSize = 1024, overlap = 0, tokenizer = null, model = null) {
         this.chunkSize = chunkSize;
         this.overlap = overlap;
         this.tokenizer = tokenizer || new TikTokenTokenizer();
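The constructor's default chunk size doubles from 512 to 1024 tokens; the same bump recurs below in chunkText and in the maxSummarySize defaults. A minimal construction sketch under the new defaults (nothing beyond the visible signature is assumed):

    // Defaults now: chunkSize = 1024, overlap = 0, TikTokenTokenizer fallback
    const chunker = new TextChunker();
    // Explicit arguments still override the defaults
    const narrow = new TextChunker(512, 2);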
@@ -1019,45 +1019,108 @@ class TextChunker {
         return lines.filter(line => line.trim()).join('\n');
     }
 
-    static async chunkText(text, tokenizer, chunkSize = 512, overlap = 0, cleanChunk = true, minNbTokensInChunk = 10) {
+    static async chunkText(text, tokenizer, chunkSize = 1024, overlap = 0, cleanChunk = true, minNbTokensInChunk = 10) {
+        // Validate chunkSize
+        if (isNaN(chunkSize) || chunkSize <= 0) {
+            console.warn(`Invalid chunkSize: ${chunkSize}. Resetting to default value of 1024.`);
+            chunkSize = 1024;
+        }
+
         const paragraphs = text.split('\n\n');
         const chunks = [];
         let currentChunk = [];
         let currentTokens = 0;
 
+        console.log("Starting text chunking...");
+        console.log(`Using chunkSize: ${chunkSize}, overlap: ${overlap}, minNbTokensInChunk: ${minNbTokensInChunk}`);
+
         for (const paragraph of paragraphs) {
             const cleanedParagraph = cleanChunk ? paragraph.trim() : paragraph;
             const paragraphTokens = (await tokenizer.tokenize(cleanedParagraph)).length;
 
-            if (currentTokens + paragraphTokens > chunkSize) {
-                if (currentTokens > minNbTokensInChunk) {
-                    let chunkText = currentChunk.join('\n\n');
-                    if (cleanChunk) {
-                        chunkText = TextChunker.removeUnnecessaryReturns(chunkText);
-                    }
-                    chunks.push(chunkText);
-                }
-
-                if (overlap > 0) {
-                    currentChunk = [...currentChunk.slice(-overlap), cleanedParagraph];
-                } else {
-                    currentChunk = [cleanedParagraph];
-                }
-                currentTokens = currentChunk.reduce(async (sum, p) => sum + await tokenizer.tokenize(p).length, 0);
-            } else {
-                currentChunk.push(cleanedParagraph);
-                currentTokens += paragraphTokens;
-            }
-        }
-
-        if (currentChunk.length > 0 && currentTokens > minNbTokensInChunk) {
-            let chunkText = currentChunk.join('\n\n');
-            if (cleanChunk) {
-                chunkText = TextChunker.removeUnnecessaryReturns(chunkText);
-            }
-            chunks.push(chunkText);
-        }
-
-        return chunks;
-    }
+            console.log(`Processing paragraph: "${cleanedParagraph}"`);
+            console.log(`Paragraph tokens: ${paragraphTokens}`);
+            console.log(`Current tokens before adding: ${currentTokens}`);
+
+            // Handle case where a single paragraph exceeds chunkSize
+            if (paragraphTokens > chunkSize) {
+                console.log(`Paragraph exceeds chunk size. Splitting...`);
+                const splitParagraphs = await this.splitLargeParagraph(cleanedParagraph, tokenizer, chunkSize, overlap);
+                for (const subChunk of splitParagraphs) {
+                    chunks.push(subChunk);
+                    console.log(`Chunk created from large paragraph: "${subChunk}"`);
+                }
+                continue;
+            }
+
+            // If adding this paragraph exceeds the chunk size
+            if (currentTokens + paragraphTokens > chunkSize) {
+                if (currentTokens >= minNbTokensInChunk) {
+                    let chunkText = currentChunk.join('\n\n');
+                    if (cleanChunk) {
+                        chunkText = TextChunker.removeUnnecessaryReturns(chunkText);
+                    }
+                    chunks.push(chunkText);
+                    console.log(`Intermediate chunk created and added: "${chunkText}"`);
+                } else {
+                    console.log("Skipping chunk creation due to insufficient tokens.");
+                }
+
+                // Handle overlap
+                if (overlap > 0) {
+                    currentChunk = currentChunk.slice(-overlap); // Keep only the overlapping part
+                } else {
+                    currentChunk = [];
+                }
+
+                currentChunk.push(cleanedParagraph);
+                currentTokens = paragraphTokens; // Reset token count to the new paragraph's tokens
+            } else {
+                // Add paragraph to the current chunk
+                currentChunk.push(cleanedParagraph);
+                currentTokens += paragraphTokens;
+            }
+
+            console.log(`Current chunk after processing: ${currentChunk.join(' | ')}`);
+            console.log(`Current tokens after processing: ${currentTokens}`);
+        }
+
+        // Add the last chunk if it meets the minimum token requirement
+        if (currentChunk.length > 0 && currentTokens >= minNbTokensInChunk) {
+            let chunkText = currentChunk.join('\n\n');
+            if (cleanChunk) {
+                chunkText = TextChunker.removeUnnecessaryReturns(chunkText);
+            }
+            chunks.push(chunkText);
+            console.log(`Final chunk created and added: "${chunkText}"`);
+        } else {
+            console.log("No final chunk created due to insufficient tokens.");
+        }
+
+        console.log("Final Chunks:");
+        console.log(chunks);
+        return chunks;
+    }
+
+    // Helper function to split a large paragraph into smaller chunks
+    static async splitLargeParagraph(paragraph, tokenizer, chunkSize, overlap) {
+        const tokens = await tokenizer.tokenize(paragraph);
+        const chunks = [];
+        let start = 0;
+
+        while (start < tokens.length) {
+            const end = Math.min(start + chunkSize, tokens.length);
+            const chunkTokens = tokens.slice(start, end);
+            const chunkText = tokenizer.detokenize(chunkTokens);
+            chunks.push(chunkText);
+
+            start += chunkSize - overlap; // Move start forward with overlap
+        }
+
+        return chunks;
+    }
 
 }
 
 class TasksLibrary {
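Beyond the logging, this rework fixes two real bugs. The old overlap path recomputed the running total with currentChunk.reduce(async (sum, p) => ...), which returns a Promise rather than a number, so the token count stopped being meaningful after the first flush; the new code simply resets currentTokens to paragraphTokens. And a single paragraph longer than chunkSize could previously never be emitted; splitLargeParagraph now slices it at the token level, advancing start by chunkSize - overlap tokens per iteration (with chunkSize 1024 and overlap 128 that is a stride of 896; overlap must stay below chunkSize or the loop cannot advance). A minimal calling sketch; the whitespace tokenizer is a hypothetical stand-in for the library's TikTokenTokenizer, kept async because chunkText awaits tokenize:

    const stubTokenizer = {
        async tokenize(text) { return text.split(/\s+/).filter(Boolean); },
        detokenize(tokens) { return tokens.join(' '); },
    };

    (async () => {
        const text = ["First paragraph.", "Second paragraph.", "Third one."].join('\n\n');
        // chunkSize 4 tokens, no overlap, clean chunks, minNbTokensInChunk 1
        const chunks = await TextChunker.chunkText(text, stubTokenizer, 4, 0, true, 1);
        console.log(chunks.length); // 2: the first two paragraphs merge, "Third one." stands alone
    })();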
@@ -1108,15 +1171,13 @@ async summarizeText(
     docName = "chunk",
     answerStart = "",
     maxGenerationSize = 3000,
-    maxSummarySize = 512,
+    maxSummarySize = 1024,
     callback = null,
     chunkSummaryPostProcessing = null,
     summaryMode = "SEQUENTIAL",
     reformat=false
 ) {
-    console.log("Tokenizing:")
-    console.log(text)
-
+    console.log("Tokenizing incoming text:")
     let tk = await this.tokenize(text);
     let prevLen = tk.length;
     let documentChunks = null;
@@ -1124,9 +1185,14 @@ async summarizeText(
 
     while (tk.length > maxSummarySize && (documentChunks === null || documentChunks.length > 1)) {
         this.stepStart(`Compressing ${docName}...`);
-        let chunkSize = Math.floor(this.lollms.ctxSize * 0.6);
+        let chunkSize = 1024;
+        if (this.lollms.ctxSize) {
+            chunkSize = Math.floor(this.lollms.ctxSize * 0.6);
+        }
+        console.log("Chunk size:", chunkSize)
         documentChunks = await TextChunker.chunkText(text, this.lollms, chunkSize, 0, true);
-        console.log(`documentChunks: ${documentChunks}`)
+        console.log(`documentChunks:`)
+        console.log(documentChunks)
         text = await this.summarizeChunks(
             documentChunks,
             summaryInstruction,
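The guard above is what lets summarizeText survive a client without context-size information: previously Math.floor(this.lollms.ctxSize * 0.6) evaluated to NaN whenever ctxSize was undefined, and that NaN flowed straight into the chunker. A condensed before/after of the arithmetic (4096 is only an example context size):

    Math.floor(4096 * 0.6);      // 2457, used when ctxSize is known
    Math.floor(undefined * 0.6); // NaN, what the old code handed to chunkText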
@@ -1166,7 +1232,7 @@ async smartDataExtraction(
     docName = "chunk",
     answerStart = "",
     maxGenerationSize = 3000,
-    maxSummarySize = 512,
+    maxSummarySize = 1024,
     callback = null,
     chunkSummaryPostProcessing = null,
     summaryMode = "SEQUENTIAL"
@@ -1175,8 +1241,11 @@ async smartDataExtraction(
     let prevLen = tk.length;
 
     while (tk.length > maxSummarySize) {
-        let chunkSize = Math.floor(this.lollms.ctxSize * 0.6);
-        let documentChunks = await TextChunker.chunkText(text, this.lollms, chunkSize, 0, true);
+        let chunkSize = 1024;
+        if (this.lollms.ctxSize) {
+            chunkSize = Math.floor(this.lollms.ctxSize * 0.6);
+        }
+        let documentChunks = await TextChunker.chunkText(text, this.lollms, chunkSize, 0, true);
         text = await this.summarizeChunks(
             documentChunks,
             dataExtractionInstruction,
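smartDataExtraction receives the same fallback, and it composes with the isNaN guard added to chunkText above: even if a bad chunk size slipped through, chunkText would now reset it to 1024 rather than chunk incorrectly. A standalone sketch of that second line of defense (normalizeChunkSize is a hypothetical name, not part of the library):

    function normalizeChunkSize(chunkSize) {
        if (isNaN(chunkSize) || chunkSize <= 0) {
            console.warn(`Invalid chunkSize: ${chunkSize}. Resetting to default value of 1024.`);
            return 1024;
        }
        return chunkSize;
    }

    normalizeChunkSize(NaN);  // 1024 (isNaN(undefined) is also true, so undefined * 0.6 is caught)
    normalizeChunkSize(2457); // 2457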