added sequencial summary

This commit is contained in:
Saifeddine ALOUI 2024-04-09 23:17:34 +02:00
parent 099ab512e3
commit 27ce8ed0a9

View File

@ -2190,7 +2190,9 @@ class APScript(StateMachine):
except Exception as e:
print(f"Error retrieving webpage content for {result['href']}: {str(e)}")
return answer_list
return answer_list
def translate(self, text_chunk, output_language="french", max_generation_size=3000):
translated = self.fast_gen(
"\n".join([
@ -2206,37 +2208,69 @@ class APScript(StateMachine):
max_generation_size=max_generation_size, callback=self.sink)
return translated
def summerize_text(self, text, summary_instruction="summerize", doc_name="chunk", answer_start="", max_generation_size=3000, max_summary_size=512, callback=None):
def summerize_text(
self,
text,
summary_instruction="summerize",
doc_name="chunk",
answer_start="",
max_generation_size=3000,
max_summary_size=512,
callback=None,
chunk_summary_post_processing=None
):
depth=0
tk = self.personality.model.tokenize(text)
while len(tk)>max_summary_size:
self.step_start(f"Comprerssing.. [depth {depth}]")
self.step_start(f"Comprerssing {doc_name}... [depth {depth+1}]")
chunk_size = int(self.personality.config.ctx_size*0.6)
document_chunks = DocumentDecomposer.decompose_document(text, chunk_size, 0, self.personality.model.tokenize, self.personality.model.detokenize, True)
text = self.summerize_chunks(document_chunks,summary_instruction, doc_name, answer_start, max_generation_size, callback)
text = self.summerize_chunks(document_chunks,summary_instruction, doc_name, answer_start, max_generation_size, callback, chunk_summary_post_processing=chunk_summary_post_processing)
tk = self.personality.model.tokenize(text)
self.step_end(f"Comprerssing.. [depth {depth}]")
self.step_end(f"Comprerssing {doc_name}... [depth {depth+1}]")
depth += 1
return text
def smart_data_extraction(self, text, data_extraction_instruction="summerize", final_task_instruction="reformulate with better wording", doc_name="chunk", answer_start="", max_generation_size=3000, max_summary_size=512, callback=None):
def smart_data_extraction(
self,
text,
data_extraction_instruction="summerize",
final_task_instruction="reformulate with better wording",
doc_name="chunk",
answer_start="",
max_generation_size=3000,
max_summary_size=512,
callback=None,
chunk_summary_post_processing=None
):
depth=0
tk = self.personality.model.tokenize(text)
while len(tk)>max_summary_size:
self.step_start(f"Comprerssing... [depth {depth+1}]")
chunk_size = int(self.personality.config.ctx_size*0.6)
document_chunks = DocumentDecomposer.decompose_document(text, chunk_size, 0, self.personality.model.tokenize, self.personality.model.detokenize, True)
text = self.summerize_chunks(document_chunks, data_extraction_instruction, doc_name, answer_start, max_generation_size, callback)
text = self.summerize_chunks(document_chunks, data_extraction_instruction, doc_name, answer_start, max_generation_size, callback, chunk_summary_post_processing=chunk_summary_post_processing)
tk = self.personality.model.tokenize(text)
self.step(f"Current text size : {len(tk)}, max summary size : {max_summary_size}")
self.step_end(f"Comprerssing... [depth {depth+1}]")
self.step_start(f"Rewriting..")
depth += 1
self.step_start(f"Rewriting ...")
text = self.summerize_chunks([text],
final_task_instruction, doc_name, answer_start, max_generation_size, callback)
self.step_end(f"Rewriting..")
final_task_instruction, doc_name, answer_start, max_generation_size, callback, chunk_summary_post_processing=chunk_summary_post_processing)
self.step_end(f"Rewriting ...")
return text
def summerize_chunks(self, chunks, summary_instruction="summerize", doc_name="chunk", answer_start="", max_generation_size=3000, callback=None):
def summerize_chunks(
self,
chunks,
summary_instruction="summerize",
doc_name="chunk",
answer_start="",
max_generation_size=3000,
callback=None,
chunk_summary_post_processing=None
):
summeries = []
for i, chunk in enumerate(chunks):
self.step_start(f" Summary of {doc_name} - Processing chunk : {i+1}/{len(chunks)}")
@ -2251,6 +2285,47 @@ class APScript(StateMachine):
]),
max_generation_size=max_generation_size,
callback=callback)
if chunk_summary_post_processing:
summary = chunk_summary_post_processing(summary)
summeries.append(summary)
self.step_end(f" Summary of {doc_name} - Processing chunk : {i+1}/{len(chunks)}")
return "\n".join(summeries)
def sequencial_summary(
self,
chunks,
summary_instruction="summerize",
doc_name="chunk",
answer_start="",
max_generation_size=3000,
callback=None,
chunk_summary_post_processing=None
):
summeries = []
for i, chunk in enumerate(chunks):
if i<len(chunks)-1:
chunk1 = chunks[i+1]
else:
chunk1=""
if i>0:
chunk=summary
self.step_start(f" Summary of {doc_name} - Processing chunk : {i+1}/{len(chunks)}")
summary = f"{answer_start}"+ self.fast_gen(
"\n".join([
f"!@>Document_chunk: {doc_name}:",
f"Block1:",
f"{chunk}",
f"Block2:",
f"{chunk1}",
f"!@>instruction: {summary_instruction}",
f"Answer directly with the summary with no extra comments.",
f"!@>summary:",
f"{answer_start}"
]),
max_generation_size=max_generation_size,
callback=callback)
if chunk_summary_post_processing:
summary = chunk_summary_post_processing(summary)
summeries.append(summary)
self.step_end(f" Summary of {doc_name} - Processing chunk : {i+1}/{len(chunks)}")
return "\n".join(summeries)