Enhanced internet skills

Saifeddine ALOUI 2024-05-11 11:29:47 +02:00
parent 98f509163a
commit 03072a36ce
2 changed files with 62 additions and 29 deletions

View File

@@ -697,7 +697,7 @@ class LollmsApplication(LoLLMsCom):
need=True
if need:
self.personality.step_start("Crafting internet search query")
query = self.personality.fast_gen(f"!@>discussion:\n{discussion[-2048:]}\n!@>system: Read the discussion and craft a web search query suited to recover needed information to reply to last {self.config.user_name} message.\nDo not answer the prompt. Do not add explanations.\n!@>current date: {datetime.now()}!@>websearch query: ", max_generation_size=256, show_progress=True, callback=self.personality.sink)
query = self.personality.fast_gen(f"!@>discussion:\n{discussion[-2048:]}\n!@>system: Read the discussion and craft a web search query suited to recover needed information to reply to last {self.config.user_name} message.\nDo not answer the prompt. Do not add explanations.\n!@>current date: {datetime.now()}\n!@>websearch query: ", max_generation_size=256, show_progress=True, callback=self.personality.sink)
self.personality.step_end("Crafting internet search query")
self.personality.step(f"web search query: {query}")
@@ -706,7 +706,7 @@ class LollmsApplication(LoLLMsCom):
else:
self.personality.step_start("Performing Internet search (advanced mode: slower but more advanced)")
internet_search_results=f"!@>important information: Use the internet search results data to answer {self.config.user_name}'s last message. It is strictly forbidden to give the user an answer without having actual proof from the documentation.\n!@>Web search results:\n"
internet_search_results=f"!@>instructions: Use the internet search results data to answer {self.config.user_name}. Try to extract information from the websearch and use it to perform the requested task or answer the question. Try to stick to the websearch results and clarify if your answer was based on the resuts or on your own culture. If you don't know how to perform the task, then tell the user politely that you need more data inputs.\n!@>Web search results:\n"
docs, sorted_similarities, document_ids = self.personality.internet_search_with_vectorization(query, self.config.internet_quick_search)
for doc, infos,document_id in zip(docs, sorted_similarities, document_ids):
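
The loop that consumes the vectorized results is cut off at the hunk boundary, so its body is not shown here. A hypothetical sketch of how each retrieved chunk might be appended to the internet_search_results header built above (the field names and formatting are assumptions, not this project's code):

for doc, infos, document_id in zip(docs, sorted_similarities, document_ids):
    # Assumed formatting: attach each chunk together with its source URL so
    # the model can ground its answer in the scraped pages.
    internet_search_results += f"!@>source: {document_id['url']}\n!@>content:\n{doc}\n"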

View File

@@ -148,40 +148,73 @@ def scrape_and_save(url, file_path=None, lollms_com=None, chromedriver_path=None
return text_content
def get_relevant_text_block(
url,
driver,
internet_vectorization_chunk_size, internet_vectorization_overlap_size,
vectorizer,
title=None,
brief=None,
wait_step_delay=0.5
):
url,
driver,
internet_vectorization_chunk_size,
internet_vectorization_overlap_size,
vectorizer,
title=None,
brief=None,
wait_step_delay=0.5
):
from bs4 import BeautifulSoup
import time
try:
from bs4 import BeautifulSoup
# Load the webpage
# Load the web page using the driver passed in as a parameter
driver.get(url)
wait_for_page(driver, wait_step_delay)
# Wait for JavaScript to execute and get the final page source
# Wait for the JavaScript to execute, with a progressive wait delay if necessary
time.sleep(wait_step_delay)
html_content = driver.page_source
# Parse the HTML content
soup = BeautifulSoup(html_content, "html.parser")
# Example: Remove all <script> and <style> tags
for script in soup(["script", "style"]):
script.extract()
all_text = soup.get_text()
# Example: Remove leading/trailing whitespace and multiple consecutive line breaks
document_id = {
'url':url
}
document_id["title"] = title
document_id["brief"] = brief
vectorizer.add_document(document_id,all_text, internet_vectorization_chunk_size, internet_vectorization_overlap_size)
except:
# Remove unwanted elements
for script_or_style in soup(["script", "style", "header", "footer", "nav", "aside"]):
script_or_style.decompose()
# Target the element containing the main text: prefer <article>, fall back to <body>
article = soup.find('article')
if article:
text_block = ''
sections = article.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li'])
for element in sections:
if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
if text_block:
text_block += '\n\n'
text_block += element.get_text(strip=True)
text_block += '\n'
else:
text_block += element.get_text(strip=True) + '\n'
document_id = {
'url':url
}
document_id["title"] = title
document_id["brief"] = brief
text_block=text_block.strip()
vectorizer.add_document(document_id,text_block, internet_vectorization_chunk_size, internet_vectorization_overlap_size)
return True
else:
body = soup.body
if body:
text_block = body.get_text(strip=True)
document_id = {
'url':url
}
document_id["title"] = title
document_id["brief"] = brief
text_block=text_block.strip()
vectorizer.add_document(document_id,text_block, internet_vectorization_chunk_size, internet_vectorization_overlap_size)
return True
else:
ASCIIColors.warning("No data found in his page.")
return False
except Exception as ex:
ASCIIColors.warning(f"Couldn't scrape: {url}")
return False
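
For context, a usage sketch of the reworked function. The Selenium setup and the vectorizer object below are illustrative assumptions; only get_relevant_text_block's signature comes from this diff:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")          # assumed: run without a visible browser window
driver = webdriver.Chrome(options=options)  # assumed: chromedriver available on PATH
try:
    ok = get_relevant_text_block(
        "https://example.com/article",           # hypothetical URL
        driver,
        internet_vectorization_chunk_size=512,   # hypothetical chunking parameters
        internet_vectorization_overlap_size=64,
        vectorizer=vectorizer,                   # assumed: any object exposing add_document(...)
        title="Example article",
        brief="Illustrative brief",
    )
finally:
    driver.quit()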
def extract_results(url, max_num, driver=None, wait_step_delay=0.5):
from bs4 import BeautifulSoup