Mirror of https://github.com/ParisNeo/lollms.git (synced 2024-12-19 04:37:54 +00:00)

commit 7551e7fa02 (parent eda4d02626)

    synced with new safestore
```diff
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 55
+version: 56
 binding_name: null
 model_name: null
 
@@ -125,7 +125,8 @@ internet_vectorization_chunk_size: 512 # chunk size
 internet_vectorization_overlap_size: 128 # overlap between chunks size
 internet_vectorization_nb_chunks: 2 # number of chunks to use
 internet_nb_search_pages: 3 # number of pages to select
+internet_quick_search: False # If active the search engine will not load and read the webpages
+internet_activate_search_decision: False # If active the ai decides by itself if it needs to do search
 # Helpers
 pdf_latex_path: null
 
```
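The two new configuration keys gate the web-search pipeline: `internet_quick_search` skips loading and reading the result pages, and `internet_activate_search_decision` lets the model decide whether a search is needed at all. Below is a minimal sketch of how a caller might branch on them; `SimpleNamespace` stands in for lollms' loaded config object, and `model_wants_search` is a hypothetical helper, not part of this commit.

```python
from types import SimpleNamespace

# Stand-in for the config loaded from the YAML file shown above.
config = SimpleNamespace(
    internet_quick_search=False,              # don't load/read full webpages
    internet_activate_search_decision=False,  # let the AI decide to search
)

def model_wants_search(prompt: str) -> bool:
    # Hypothetical decision step; the real flow would ask the model,
    # e.g. via a yes/no question like the helper added later in this commit.
    return True

if not config.internet_activate_search_decision or model_wants_search("query"):
    print("searching, quick mode:", config.internet_quick_search)
```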
```diff
@@ -33,7 +33,9 @@ def get_relevant_text_block(
         url,
         driver,
         config,
-        vectorizer
+        vectorizer,
+        title=None,
+        brief=None
         ):
     from bs4 import BeautifulSoup
     # Load the webpage
```
```diff
@@ -50,7 +52,12 @@ def get_relevant_text_block(
 
     all_text = soup.get_text()
     # Example: Remove leading/trailing whitespace and multiple consecutive line breaks
-    vectorizer.add_document(url,all_text, config.internet_vectorization_chunk_size, config.internet_vectorization_overlap_size)
+    document_id = {
+        'url':url
+    }
+    document_id["title"] = title
+    document_id["brief"] = brief
+    vectorizer.add_document(document_id,all_text, config.internet_vectorization_chunk_size, config.internet_vectorization_overlap_size)
 
 
 def extract_results(url, max_num, driver=None):
```
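`get_relevant_text_block` now registers each page under a metadata dict rather than a bare URL, matching the new safestore-style vectorizer in which a document id can carry the title and brief alongside the url. A small sketch of the shape follows; the stub class is illustrative, and only the `add_document(id, text, chunk_size, overlap)` call pattern is taken from the diff.

```python
class StubVectorizer:
    """Stub mirroring the add_document call used in the diff above."""
    def __init__(self):
        self.docs = []

    def add_document(self, document_id, text, chunk_size, overlap_size):
        # The id is now a dict, so metadata travels with the indexed text.
        self.docs.append((document_id, text))

vectorizer = StubVectorizer()
document_id = {
    "url": "https://example.com/page",
    "title": "Example page",                 # new: carried with the id
    "brief": "Short search-result snippet",  # new: carried with the id
}
vectorizer.add_document(document_id, "full page text ...", 512, 128)
```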
```diff
@@ -112,15 +119,8 @@ def extract_results(url, max_num, driver=None):
             pass
     return results_list
 
-def internet_search(query, chromedriver_path, config, model = None):
+def internet_search(query, chromedriver_path, config, model = None, quick_search:bool=False):
     """
-    Perform an internet search using the provided query.
-
-    Args:
-        query (str): The search query.
-
-    Returns:
-        dict: The search result as a dictionary.
     """
 
     from selenium import webdriver
```
```diff
@@ -153,17 +153,21 @@
         config.internet_nb_search_pages,
         driver
     )
+    # Close the browser
+    driver.quit()
 
     for i, result in enumerate(results):
         title = result["title"]
         brief = result["brief"]
         href = result["href"]
-        get_relevant_text_block(href, driver, config, vectorizer)
+        if quick_search:
+            vectorizer.add_document({'url':href, 'title':title, 'brief': brief}, brief)
+        else:
+            get_relevant_text_block(href, driver, config, vectorizer, title, brief)
        nb_non_empty += 1
        if nb_non_empty>=config.internet_nb_search_pages:
            break
    vectorizer.index()
 
-   # Close the browser
-   driver.quit()
-   docs, sorted_similarities = vectorizer.recover_text(query, config.internet_vectorization_nb_chunks)
-   return docs, sorted_similarities
+   docs, sorted_similarities, document_ids = vectorizer.recover_text(query, config.internet_vectorization_nb_chunks)
+   return docs, sorted_similarities, document_ids
```
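`internet_search` now returns the document ids as a third value, and `quick_search=True` indexes only the result titles and briefs instead of loading every page. A hedged usage sketch, assuming `chromedriver_path`, `config`, and `model` are set up as elsewhere in lollms:

```python
from lollms.internet import internet_search

def show_top_hits(query, chromedriver_path, config, model):
    # quick_search=True: only titles/briefs are vectorized; pages are never loaded.
    docs, sorted_similarities, document_ids = internet_search(
        query, chromedriver_path, config, model, quick_search=True
    )
    for doc, similarity, doc_id in zip(docs, sorted_similarities, document_ids):
        # Each id is the {'url', 'title', 'brief'} dict built during indexing.
        print(doc_id["url"], similarity)
```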
```diff
@@ -327,6 +327,45 @@ Date: {{date}}
 
         if callback:
             callback(full_text, MSG_TYPE.MSG_TYPE_FULL_INVISIBLE_TO_USER)
 
 
+    def build_prompt(self, prompt_parts:List[str], sacrifice_id:int=-1, context_size:int=None, minimum_spare_context_size:int=None):
+        """
+        Builds the prompt for code generation.
+
+        Args:
+            prompt_parts (List[str]): A list of strings representing the parts of the prompt.
+            sacrifice_id (int, optional): The ID of the part to sacrifice.
+            context_size (int, optional): The size of the context.
+            minimum_spare_context_size (int, optional): The minimum spare context size.
+
+        Returns:
+            str: The built prompt.
+        """
+        if context_size is None:
+            context_size = self.config.ctx_size
+        if minimum_spare_context_size is None:
+            minimum_spare_context_size = self.config.min_n_predict
+
+        if sacrifice_id == -1 or len(prompt_parts[sacrifice_id])<50:
+            return "\n".join([s for s in prompt_parts if s!=""])
+        else:
+            part_tokens=[]
+            nb_tokens=0
+            for i,part in enumerate(prompt_parts):
+                tk = self.model.tokenize(part)
+                part_tokens.append(tk)
+                if i != sacrifice_id:
+                    nb_tokens += len(tk)
+            if len(part_tokens[sacrifice_id])>0:
+                sacrifice_tk = part_tokens[sacrifice_id]
+                sacrifice_tk= sacrifice_tk[-(context_size-nb_tokens-minimum_spare_context_size):]
+                sacrifice_text = self.model.detokenize(sacrifice_tk)
+            else:
+                sacrifice_text = ""
+            prompt_parts[sacrifice_id] = sacrifice_text
+            return "\n".join([s for s in prompt_parts if s!=""])
+
     def add_collapsible_entry(self, title, content):
         return "\n".join(
             [
```
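`build_prompt` implements a "sacrifice" strategy: every part keeps its full token count except one designated part (typically the discussion history), which is cut from the left so the whole prompt plus a spare generation budget still fits in the context window. Below is a toy re-trace of that logic, with whitespace splitting standing in for the model tokenizer; the names and numbers are illustrative only.

```python
def tokenize(s):            # stand-in for self.model.tokenize
    return s.split()

def detokenize(tokens):     # stand-in for self.model.detokenize
    return " ".join(tokens)

def build_prompt_demo(parts, sacrifice_id, context_size, spare):
    # Short parts (or no sacrifice target) are joined untouched.
    if sacrifice_id == -1 or len(parts[sacrifice_id]) < 50:
        return "\n".join(s for s in parts if s != "")
    part_tokens = [tokenize(p) for p in parts]
    nb_tokens = sum(len(t) for i, t in enumerate(part_tokens) if i != sacrifice_id)
    # Keep only the rightmost tokens that still leave `spare` room to generate.
    kept = part_tokens[sacrifice_id][-(context_size - nb_tokens - spare):]
    parts[sacrifice_id] = detokenize(kept)
    return "\n".join(s for s in parts if s != "")

history = "turn " * 100  # long part (>= 50 chars) that gets trimmed from the left
print(build_prompt_demo(["system prompt", history, "user question"], 1, 60, 10))
```

Note the under-50-characters guard: short system or instruction parts are never mangled, only a genuinely long part is truncated.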
```diff
@@ -343,13 +382,117 @@ Date: {{date}}
             f'    </details>\n'
         ])
 
-    def internet_search(self, query ):
+    def internet_search(self, query, quick_search:bool=False):
         """
         Do internet search and return the result
         """
         from lollms.internet import internet_search
-        return internet_search(query, "", self.config, self.model)
+        return internet_search(query, "", self.config, self.model, quick_search=quick_search)
+
+    def sink(self, s=None,i=None,d=None):
+        pass
+
+    def yes_no(self, question: str, context:str="", max_answer_length: int = 50, conditionning="") -> bool:
+        """
+        Answers a yes/no question about the user prompt.
+
+        Args:
+            question (str): The user's message.
+            max_answer_length (int, optional): The maximum length of the generated answer. Defaults to 50.
+            conditionning: An optional system message to put at the beginning of the prompt
+        Returns:
+            bool: True if the model answered yes, False otherwise.
+        """
+        return self.multichoice_question(question, ["no","yes"], context, max_answer_length, conditionning=conditionning)>0
+
+    def multichoice_question(self, question: str, possible_answers:list, context:str = "", max_answer_length: int = 50, conditionning="") -> int:
+        """
+        Interprets a multi-choice question from a user's response. This function expects only one choice as true. All other choices are considered false. If none are correct, returns -1.
+
+        Args:
+            question (str): The multi-choice question posed by the user.
+            possible_answers (List[Any]): A list containing all valid options for the chosen value. For each item in the list, either 'True', 'False', None or another callable should be passed which will serve as the truth test function when checking against the actual user input.
+            max_answer_length (int, optional): Maximum string length allowed while interpreting the user's responses. Defaults to 50.
+            conditionning: An optional system message to put at the beginning of the prompt
+
+        Returns:
+            int: Index of the selected option within the possible_answers list, or -1 if no match was found among any of them.
+        """
+        choices = "\n".join([f"{i}. {possible_answer}" for i, possible_answer in enumerate(possible_answers)])
+        elements = [conditionning] if conditionning!="" else []
+        elements += [
+            "!@>instructions:",
+            "Answer this multi choices question.",
+            "Answer with an id from the possible answers.",
+            "Do not answer with an id outside this possible answers.",
+        ]
+        if context!="":
+            elements+=[
+                "!@>Context:",
+                f"{context}",
+            ]
+        elements += [
+            f"!@>question: {question}",
+            "!@>possible answers:",
+            f"{choices}",
+        ]
+        elements += ["!@>answer:"]
+        prompt = self.build_prompt(elements)
+
+        gen = self.generate(prompt, max_answer_length, temperature=0.1, top_k=50, top_p=0.9, repeat_penalty=1.0, repeat_last_n=50, callback=self.sink).strip().replace("</s>","").replace("<s>","")
+        selection = gen.strip().split()[0].replace(",","").replace(".","")
+        self.print_prompt("Multi choice selection",prompt+gen)
+        try:
+            return int(selection)
+        except:
+            ASCIIColors.cyan("Model failed to answer the question")
+            return -1
+
+    def multichoice_ranking(self, question: str, possible_answers:list, context:str = "", max_answer_length: int = 50, conditionning="") -> int:
+        """
+        Ranks answers for a question from best to worst. Returns a list of integers.
+
+        Args:
+            question (str): The multi-choice question posed by the user.
+            possible_answers (List[Any]): A list containing all valid options for the chosen value. For each item in the list, either 'True', 'False', None or another callable should be passed which will serve as the truth test function when checking against the actual user input.
+            max_answer_length (int, optional): Maximum string length allowed while interpreting the user's responses. Defaults to 50.
+            conditionning: An optional system message to put at the beginning of the prompt
+
+        Returns:
+            int: Index of the selected option within the possible_answers list, or -1 if no match was found among any of them.
+        """
+        choices = "\n".join([f"{i}. {possible_answer}" for i, possible_answer in enumerate(possible_answers)])
+        elements = [conditionning] if conditionning!="" else []
+        elements += [
+            "!@>instructions:",
+            "Answer this multi choices question.",
+            "Answer with an id from the possible answers.",
+            "Do not answer with an id outside this possible answers.",
+            f"!@>question: {question}",
+            "!@>possible answers:",
+            f"{choices}",
+        ]
+        if context!="":
+            elements+=[
+                "!@>Context:",
+                f"{context}",
+            ]
+
+        elements += ["!@>answer:"]
+        prompt = self.build_prompt(elements)
+
+        gen = self.generate(prompt, max_answer_length, temperature=0.1, top_k=50, top_p=0.9, repeat_penalty=1.0, repeat_last_n=50).strip().replace("</s>","").replace("<s>","")
+        self.print_prompt("Multi choice ranking",prompt+gen)
+        if gen.index("]")>=0:
+            try:
+                ranks = eval(gen.split("]")[0]+"]")
+                return ranks
+            except:
+                ASCIIColors.red("Model failed to rank inputs")
+                return None
+        else:
+            ASCIIColors.red("Model failed to rank inputs")
+            return None
+
     def step_start(self, step_text, callback: Callable[[str, MSG_TYPE, dict, list], bool]=None):
         """This triggers a step start
```
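`multichoice_question` constrains the model with a prompt ending in `!@>answer:` and then parses the first token of the completion as an answer index; `yes_no` is just the two-choice case (`["no","yes"]`), so any positive id means yes. The parsing step in isolation, with canned strings standing in for `self.generate` output:

```python
def parse_selection(gen: str) -> int:
    # The first whitespace-separated token, with stray ',' or '.' stripped,
    # is interpreted as the chosen answer id; anything else means failure.
    selection = gen.strip().split()[0].replace(",", "").replace(".", "")
    try:
        return int(selection)
    except ValueError:
        return -1

print(parse_selection("1. the user is asking for an image"))  # -> 1
print(parse_selection("maybe"))                               # -> -1
```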
```diff
@@ -2093,12 +2236,13 @@ class APScript(StateMachine):
             f'    </details>\n'
         ])
 
-    def internet_search(self, query ):
+    def internet_search(self, query, quick_search:bool=False ):
         """
         Do internet search and return the result
         """
-        return self.personality.internet_search(query)
+        return self.personality.internet_search(query, quick_search=quick_search)
 
 
     def step_start(self, step_text, callback: Callable[[str, MSG_TYPE, dict, list], bool]=None):
         """This triggers a step start
```
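On the `APScript` side the flag is simply forwarded to the personality, so scripts opt in per call. A sketch of the intended call site inside a personality script; the class name and method are illustrative, and the surrounding APScript setup is elided:

```python
from lollms.personality import APScript

class MyScript(APScript):
    def search_step(self, query):
        # quick_search=True trades page contents for speed: only the
        # search-result briefs are vectorized and queried.
        docs, sorted_similarities, document_ids = self.internet_search(
            query, quick_search=True
        )
        return docs
```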