# lollms/internet.py
#
# The change set that introduces this module also bumps `version` from 54 to 55
# in configs/config.yaml and lollms/configs/config.yaml, and adds the following
# keys right after `activate_internet_search: false`:
#   internet_vectorization_chunk_size: 512    # chunk size
#   internet_vectorization_overlap_size: 128  # overlap between chunks size
#   internet_vectorization_nb_chunks: 2       # number of chunks to use
#   internet_nb_search_pages: 3               # number of pages to select


def get_favicon_url(url, timeout=10):
    """Return the absolute favicon URL declared by the page at *url*.

    The page is downloaded and its HTML is searched for a
    ``<link rel="icon">`` or ``<link rel="shortcut icon">`` element.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled server would block the caller forever.

    Returns:
        The icon's ``href`` resolved against *url*, or ``None`` when the page
        declares no icon link or the link carries no ``href``.

    Raises:
        requests.RequestException: Propagated unchanged when the download
            fails or times out.
    """
    # Imports are kept function-local, like everywhere else in this module.
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    favicon_link = soup.find('link', rel='icon') or soup.find('link', rel='shortcut icon')

    # A <link> without href would raise KeyError with item access.
    href = favicon_link.get('href') if favicon_link else None
    if not href:
        return None

    # urljoin leaves absolute URLs untouched and correctly resolves relative,
    # root-relative ("/favicon.ico") and protocol-relative ("//cdn/...") hrefs;
    # plain string concatenation produced invalid URLs for all of these
    # whenever *url* had a path component.
    return urljoin(url, href)


def get_root_url(url):
    """Return the ``scheme://netloc`` part of *url*.

    Path, query string and fragment are dropped. No validation is performed:
    a URL without a scheme yields an empty scheme and netloc.
    """
    from urllib.parse import urlparse

    parsed_url = urlparse(url)
    return f"{parsed_url.scheme}://{parsed_url.netloc}"


def format_url_parameter(value: str):
    """Return *value* stripped of surrounding whitespace and of double quotes.

    Whitespace is stripped first, then every ``"`` character is removed.
    No percent-encoding is applied.
    """
    return value.strip().replace("\"", "")


def get_relevant_text_block(
                                url,
                                driver,
                                config,
                                vectorizer
                            ):
    from bs4 import BeautifulSoup
    # Load the webpage
    driver.get(url)

    # Wait for JavaScript to execute and get the final page source
    html_content = driver.page_source

    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")
    # Example: Remove all