added new summary tools

Saifeddine ALOUI 2024-03-24 15:39:35 +01:00
parent 2209934b94
commit 6d32a1bc42


@@ -31,7 +31,7 @@ from lollms.types import MSG_TYPE
 import json
 from typing import Any, List, Optional, Type, Callable, Dict, Any, Union
 import json
-from safe_store import TextVectorizer, GenericDataLoader, VisualizationMethod, VectorizationMethod
+from safe_store import TextVectorizer, GenericDataLoader, VisualizationMethod, VectorizationMethod, DocumentDecomposer
 from functools import partial
 import sys
 from lollms.com import LoLLMsCom
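
The newly imported DocumentDecomposer is the chunking helper that drives the methods added below. A minimal sketch of the call pattern as it appears in this diff (positional arguments: text, chunk size, overlap, tokenizer, detokenizer, optional cleanup flag); the toy tokenizer pair here is a purely hypothetical stand-in for self.personality.model:

    from safe_store import DocumentDecomposer

    # Toy tokenizer pair standing in for the model's tokenize/detokenize
    def tokenize(text): return text.split()
    def detokenize(tokens): return " ".join(tokens)

    long_text = "lorem ipsum " * 2000
    # At most 512 tokens per chunk, no overlap; the trailing True mirrors
    # the cleanup flag passed in summerize_text below
    chunks = DocumentDecomposer.decompose_document(long_text, 512, 0, tokenize, detokenize, True)
    # Chunks come back as token lists (vectorize_and_query detokenizes
    # them before indexing), so detokenize to get text back
    print(len(chunks), "chunks;", detokenize(chunks[0])[:40], "...")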
@@ -2184,13 +2184,26 @@ class APScript(StateMachine):
                 ]),
                 max_generation_size=max_generation_size))
         return translated
-    def summerize(self, chunks, summary_instruction="summerize", chunk_name="chunk", answer_start="", max_generation_size=3000, callback=None):
+    def summerize_text(self, text, summary_instruction="summerize", doc_name="chunk", answer_start="", max_generation_size=3000, max_summary_size=512, callback=None):
+        depth = 0
+        tk = self.personality.model.tokenize(text)
+        while len(tk) > max_summary_size:
+            self.step_start(f"Compressing.. [depth {depth}]")
+            chunk_size = int(self.personality.config.ctx_size * 0.6)
+            document_chunks = DocumentDecomposer.decompose_document(text, chunk_size, 0, self.personality.model.tokenize, self.personality.model.detokenize, True)
+            text = self.summerize_chunks(document_chunks, summary_instruction, doc_name, answer_start, max_generation_size, callback)
+            tk = self.personality.model.tokenize(text)
+            self.step_end(f"Compressing.. [depth {depth}]")
+            depth += 1
+        return text
+    def summerize_chunks(self, chunks, summary_instruction="summerize", doc_name="chunk", answer_start="", max_generation_size=3000, callback=None):
         summeries = []
         for i, chunk in enumerate(chunks):
             self.step_start(f"Processing chunk : {i+1}/{len(chunks)}")
             summary = f"```markdown\n{answer_start}" + self.fast_gen(
                 "\n".join([
-                    f"!@>Document_chunk: {chunk_name}:",
+                    f"!@>Document_chunk: {doc_name}:",
                     f"{chunk}",
                     f"!@>instruction: {summary_instruction}",
                     f"!@>summary:",
@@ -2273,6 +2286,17 @@ class APScript(StateMachine):
         return self.personality.internet_search_with_vectorization(query, quick_search=quick_search)
+    def vectorize_and_query(self, text, query, max_chunk_size=512, overlap_size=20, internet_vectorization_nb_chunks=3):
+        vectorizer = TextVectorizer(VectorizationMethod.TFIDF_VECTORIZER, model=self.personality.model)
+        decomposer = DocumentDecomposer()
+        chunks = decomposer.decompose_document(text, max_chunk_size, overlap_size, self.personality.model.tokenize, self.personality.model.detokenize)
+        for i, chunk in enumerate(chunks):
+            vectorizer.add_document(f"chunk_{i}", self.personality.model.detokenize(chunk))
+        vectorizer.index()
+        docs, sorted_similarities, document_ids = vectorizer.recover_text(query, internet_vectorization_nb_chunks)
+        return docs, sorted_similarities
     def step_start(self, step_text, callback: Callable[[str, MSG_TYPE, dict, list], bool]=None):
         """This triggers a step start