Enhanced internet skills

Saifeddine ALOUI 2024-05-11 11:29:47 +02:00
parent 98f509163a
commit 03072a36ce
2 changed files with 62 additions and 29 deletions

View File

@@ -697,7 +697,7 @@ class LollmsApplication(LoLLMsCom):
need=True
if need:
self.personality.step_start("Crafting internet search query")
query = self.personality.fast_gen(f"!@>discussion:\n{discussion[-2048:]}\n!@>system: Read the discussion and craft a web search query suited to recover needed information to reply to last {self.config.user_name} message.\nDo not answer the prompt. Do not add explanations.\n!@>current date: {datetime.now()}!@>websearch query: ", max_generation_size=256, show_progress=True, callback=self.personality.sink)
query = self.personality.fast_gen(f"!@>discussion:\n{discussion[-2048:]}\n!@>system: Read the discussion and craft a web search query suited to recover needed information to reply to last {self.config.user_name} message.\nDo not answer the prompt. Do not add explanations.\n!@>current date: {datetime.now()}\n!@>websearch query: ", max_generation_size=256, show_progress=True, callback=self.personality.sink)
self.personality.step_end("Crafting internet search query")
self.personality.step(f"web search query: {query}")
@@ -706,7 +706,7 @@ class LollmsApplication(LoLLMsCom):
else:
self.personality.step_start("Performing Internet search (advanced mode: slower but more advanced)")
internet_search_results=f"!@>important information: Use the internet search results data to answer {self.config.user_name}'s last message. It is strictly forbidden to give the user an answer without having actual proof from the documentation.\n!@>Web search results:\n"
internet_search_results=f"!@>instructions: Use the internet search results data to answer {self.config.user_name}. Try to extract information from the websearch and use it to perform the requested task or answer the question. Try to stick to the websearch results and clarify if your answer was based on the resuts or on your own culture. If you don't know how to perform the task, then tell the user politely that you need more data inputs.\n!@>Web search results:\n"
docs, sorted_similarities, document_ids = self.personality.internet_search_with_vectorization(query, self.config.internet_quick_search)
for doc, infos,document_id in zip(docs, sorted_similarities, document_ids):
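
The loop that consumes the vectorized results is cut off at the hunk boundary, so its body is not shown here. A hypothetical sketch of how each retrieved chunk might be appended to the internet_search_results header built above (the field names and formatting are assumptions, not this project's code):

for doc, infos, document_id in zip(docs, sorted_similarities, document_ids):
    # Assumed formatting: attach each chunk together with its source URL so
    # the model can ground its answer in the scraped pages.
    internet_search_results += f"!@>source: {document_id['url']}\n!@>content:\n{doc}\n"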

View File

@@ -148,40 +148,73 @@ def scrape_and_save(url, file_path=None, lollms_com=None, chromedriver_path=None
return text_content
def get_relevant_text_block(
url,
driver,
internet_vectorization_chunk_size, internet_vectorization_overlap_size,
vectorizer,
title=None,
brief=None,
wait_step_delay=0.5
):
url,
driver,
internet_vectorization_chunk_size,
internet_vectorization_overlap_size,
vectorizer,
title=None,
brief=None,
wait_step_delay=0.5
):
from bs4 import BeautifulSoup
import time
try:
from bs4 import BeautifulSoup
# Load the webpage
# Load the web page using the driver passed in as a parameter
driver.get(url)
wait_for_page(driver, wait_step_delay)
# Wait for JavaScript to execute and get the final page source
# Wait for the JavaScript to execute, with a progressive wait delay if necessary
time.sleep(wait_step_delay)
html_content = driver.page_source
# Parse the HTML content
soup = BeautifulSoup(html_content, "html.parser")
# Example: Remove all <script> and <style> tags
for script in soup(["script", "style"]):
script.extract()
all_text = soup.get_text()
# Example: Remove leading/trailing whitespace and multiple consecutive line breaks
document_id = {
'url':url
}
document_id["title"] = title
document_id["brief"] = brief
vectorizer.add_document(document_id,all_text, internet_vectorization_chunk_size, internet_vectorization_overlap_size)
except:
# Remove unwanted elements
for script_or_style in soup(["script", "style", "header", "footer", "nav", "aside"]):
script_or_style.decompose()
# Target the element containing the main text: prefer <article>, fall back to <body>
article = soup.find('article')
if article:
text_block = ''
sections = article.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li'])
for element in sections:
if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
if text_block:
text_block += '\n\n'
text_block += element.get_text(strip=True)
text_block += '\n'
else:
text_block += element.get_text(strip=True) + '\n'
document_id = {
'url':url
}
document_id["title"] = title
document_id["brief"] = brief
text_block=text_block.strip()
vectorizer.add_document(document_id,text_block, internet_vectorization_chunk_size, internet_vectorization_overlap_size)
return True
else:
body = soup.body
if body:
text_block = body.get_text(strip=True)
document_id = {
'url':url
}
document_id["title"] = title
document_id["brief"] = brief
text_block=text_block.strip()
vectorizer.add_document(document_id,text_block, internet_vectorization_chunk_size, internet_vectorization_overlap_size)
return True
else:
ASCIIColors.warning("No data found in his page.")
return False
except Exception as ex:
ASCIIColors.warning(f"Couldn't scrape: {url}")
return False
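
For context, a usage sketch of the reworked function. The Selenium setup and the vectorizer object below are illustrative assumptions; only get_relevant_text_block's signature comes from this diff:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")          # assumed: run without a visible browser window
driver = webdriver.Chrome(options=options)  # assumed: chromedriver available on PATH
try:
    ok = get_relevant_text_block(
        "https://example.com/article",           # hypothetical URL
        driver,
        internet_vectorization_chunk_size=512,   # hypothetical chunking parameters
        internet_vectorization_overlap_size=64,
        vectorizer=vectorizer,                   # assumed: any object exposing add_document(...)
        title="Example article",
        brief="Illustrative brief",
    )
finally:
    driver.quit()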
def extract_results(url, max_num, driver=None, wait_step_delay=0.5):
from bs4 import BeautifulSoup