mirror of
https://github.com/ParisNeo/lollms.git
synced 2025-01-18 02:39:46 +00:00
synced
This commit is contained in:
parent
a88743be39
commit
cebd64b610
@ -1,5 +1,5 @@
|
||||
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
|
||||
version: 54
|
||||
version: 55
|
||||
binding_name: null
|
||||
model_name: null
|
||||
|
||||
@ -121,6 +121,10 @@ data_vectorization_make_persistance: false # If true, the data will be persistan
|
||||
|
||||
# Activate internet search
|
||||
activate_internet_search: false
|
||||
internet_vectorization_chunk_size: 512 # chunk size
|
||||
internet_vectorization_overlap_size: 128 # overlap size between chunks
|
||||
internet_vectorization_nb_chunks: 2 # number of chunks to use
|
||||
internet_nb_search_pages: 3 # number of pages to select
|
||||
|
||||
# Helpers
|
||||
pdf_latex_path: null
|
||||
|
@ -1,5 +1,5 @@
|
||||
# =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
|
||||
version: 54
|
||||
version: 55
|
||||
binding_name: null
|
||||
model_name: null
|
||||
|
||||
@ -121,6 +121,10 @@ data_vectorization_make_persistance: false # If true, the data will be persistan
|
||||
|
||||
# Activate internet search
|
||||
activate_internet_search: false
|
||||
internet_vectorization_chunk_size: 512 # chunk size
|
||||
internet_vectorization_overlap_size: 128 # overlap size between chunks
|
||||
internet_vectorization_nb_chunks: 2 # number of chunks to use
|
||||
internet_nb_search_pages: 3 # number of pages to select
|
||||
|
||||
# Helpers
|
||||
pdf_latex_path: null
|
||||
|
167
lollms/internet.py
Normal file
167
lollms/internet.py
Normal file
@ -0,0 +1,167 @@
|
||||
|
||||
def get_favicon_url(url):
    """Return the absolute URL of the favicon declared by a web page.

    The page at ``url`` is downloaded and parsed; the first
    ``<link rel="icon">`` element is used, falling back to
    ``<link rel="shortcut icon">``.

    Args:
        url (str): Address of the page to inspect.

    Returns:
        str | None: The favicon address resolved against ``url``, or ``None``
        when the page declares no icon link or the link carries no ``href``.
    """
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    favicon_link = soup.find('link', rel='icon') or soup.find('link', rel='shortcut icon')

    if favicon_link is None:
        return None

    # A <link> without href would raise KeyError with item access.
    href = favicon_link.get('href')
    if not href:
        return None

    # urljoin leaves absolute hrefs untouched and correctly resolves
    # root-relative ("/favicon.ico"), page-relative ("img/icon.png") and
    # protocol-relative ("//cdn.example/icon.png") ones. Plain string
    # concatenation of page URL and href produced invalid addresses.
    return urljoin(url, href)
|
||||
|
||||
|
||||
def get_root_url(url):
    """Return the ``scheme://netloc`` prefix of a URL.

    Args:
        url (str): The address to reduce.

    Returns:
        str: The scheme and network location joined by ``://``; path, query
        and fragment are dropped.
    """
    from urllib.parse import urlparse

    # The parse result is a tuple whose first two fields are scheme and netloc.
    scheme, netloc, *_ = urlparse(url)
    return "://".join((scheme, netloc))
|
||||
|
||||
|
||||
def format_url_parameter(value:str):
    """Prepare a string for use as a URL query-parameter value.

    Surrounding whitespace is stripped, double quotes are removed, and the
    remainder is percent-encoded so that characters with a meaning inside a
    URL (``&``, ``#``, ``=``, ``+``, spaces, ...) cannot truncate or alter the
    query string the value is embedded in.

    Args:
        value (str): Raw parameter value.

    Returns:
        str: The cleaned value, encoded with ``quote_plus`` (spaces become
        ``+``).
    """
    from urllib.parse import quote_plus

    cleaned_value = value.strip().replace("\"", "")
    return quote_plus(cleaned_value)
|
||||
|
||||
|
||||
def get_relevant_text_block(
    url,
    driver,
    config,
    vectorizer
):
    """Load a page in the browser and hand its text to the vectorizer.

    Args:
        url: Address of the page to load; also used as the document key.
        driver: Browser driver; its ``get`` method and ``page_source``
            attribute are used.
        config: Settings object providing
            ``internet_vectorization_chunk_size`` and
            ``internet_vectorization_overlap_size``.
        vectorizer: Object whose ``add_document`` receives the page text.

    Returns:
        None. The extracted text is passed to ``vectorizer`` only.
    """
    from bs4 import BeautifulSoup

    # Render the page and parse whatever markup the browser ended up with
    driver.get(url)
    page = BeautifulSoup(driver.page_source, "html.parser")

    # Detach <script> and <style> elements so their contents are not part of the text
    for element in page.find_all(["script", "style"]):
        element.extract()

    vectorizer.add_document(
        url,
        page.get_text(),
        config.internet_vectorization_chunk_size,
        config.internet_vectorization_overlap_size,
    )
|
||||
|
||||
|
||||
def extract_results(url, max_num, driver=None):
    """Scrape search hits from a results page rendered in a browser.

    The page is loaded through ``driver`` and searched for an
    ``<ol class="react-results--main">`` element. Each ``<li>`` inside it
    that matches the expected layout becomes one result.

    Args:
        url (str): Address of the search results page.
        max_num (int): Desired number of results. Up to ``max_num * 3 + 1``
            list items are inspected because items that do not match the
            expected layout are skipped, so the returned list may hold more
            than ``max_num`` entries.
        driver: Browser driver exposing ``get(url)`` and ``page_source``.
            Despite the ``None`` default it is required: it is used
            unconditionally.

    Returns:
        list[dict]: One dict per parsed item with the keys ``"link"``
        (displayed address), ``"href"`` (target of the anchor), ``"title"``
        and ``"brief"``. Empty when the results list is absent from the page.
    """
    from bs4 import BeautifulSoup

    # Load the webpage and parse the markup the browser produced
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # A page without the results list (for instance a "no results" page)
    # yields nothing. The previous tag-name lookup for "No results found"
    # could never match, and the missing list was only survived through a
    # bare ``except``.
    ol_tag = soup.find("ol", class_="react-results--main")
    if ol_tag is None:
        return []

    results_list = []

    for index, li_tag in enumerate(ol_tag.find_all("li")):
        # Inspect extra items beyond max_num to make up for skipped ones
        if index > max_num * 3:
            break

        try:
            div_tags = li_tag.find_all("div")

            # The second anchor of the first <div> carries the target and
            # the displayed address
            anchor = div_tags[0].find_all("a")[1]
            href_value = anchor.get('href')
            link = anchor.find_all("span")[0].text.strip()

            title = div_tags[2].text.strip()
            content = div_tags[3].text.strip()
        except (AttributeError, IndexError):
            # Item does not follow the expected layout: skip it
            continue

        results_list.append({
            "link": link,
            "href": href_value,
            "title": title,
            "brief": content
        })

    return results_list
|
||||
|
||||
def internet_search(query, chromedriver_path, config, model = None):
    """
    Perform an internet search using the provided query.

    A headless Chrome session queries DuckDuckGo, the first result pages are
    loaded and vectorized, and the chunks that best match the query are
    recovered from the vectorizer.

    Args:
        query (str): The search query.
        chromedriver_path (str | None): Path to the chromedriver executable.
            ``None`` is treated as an empty string; if the driver cannot be
            created with this path, Chrome is started without it.
        config: Settings object providing ``internet_nb_search_pages``,
            ``internet_vectorization_nb_chunks`` and the vectorization sizes
            read by ``get_relevant_text_block``.
        model: Forwarded to ``TextVectorizer``. Defaults to ``None``.

    Returns:
        tuple: ``(docs, sorted_similarities)`` exactly as returned by
        ``vectorizer.recover_text``.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from safe_store.text_vectorizer import TextVectorizer, VectorizationMethod

    vectorizer = TextVectorizer(VectorizationMethod.TFIDF_VECTORIZER, model = model)

    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--remote-debugging-port=9222")
    chrome_options.add_argument("--headless")  # Run Chrome in headless mode

    if chromedriver_path is None:
        chromedriver_path = ""

    # Create a new Chrome webdriver instance
    try:
        driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options)
    except Exception:
        # The explicit path was rejected (bad path, or a Selenium release
        # that does not accept ``executable_path``): let Selenium locate
        # the driver itself.
        driver = webdriver.Chrome(options=chrome_options)

    try:
        results = extract_results(
            f"https://duckduckgo.com/?q={format_url_parameter(query)}&t=h_&ia=web",
            config.internet_nb_search_pages,
            driver
        )

        nb_pages = 0
        for result in results:
            get_relevant_text_block(result["href"], driver, config, vectorizer)
            nb_pages += 1
            if nb_pages >= config.internet_nb_search_pages:
                break

        vectorizer.index()
    finally:
        # Always close the browser, even when scraping or indexing fails,
        # so no Chrome process is left behind.
        driver.quit()

    docs, sorted_similarities = vectorizer.recover_text(query, config.internet_vectorization_nb_chunks)
    return docs, sorted_similarities
|
@ -342,6 +342,14 @@ Date: {{date}}
|
||||
f' </div>',
|
||||
f' </details>\n'
|
||||
])
|
||||
|
||||
def internet_search(self, query ):
    """
    Run an internet search for ``query`` and return its result.

    Delegates to ``lollms.internet.internet_search`` with an empty
    chromedriver path and this object's ``config`` and ``model``.
    """
    from lollms.internet import internet_search as run_search

    return run_search(
        query,
        "",
        self.config,
        self.model,
    )
|
||||
|
||||
|
||||
def step_start(self, step_text, callback: Callable[[str, MSG_TYPE, dict, list], bool]=None):
|
||||
"""This triggers a step start
|
||||
@ -2084,6 +2092,13 @@ class APScript(StateMachine):
|
||||
f' </div>',
|
||||
f' </details>\n'
|
||||
])
|
||||
|
||||
def internet_search(self, query ):
    """
    Run an internet search for ``query`` through the attached personality.

    Returns whatever ``self.personality.internet_search`` returns.
    """
    personality = self.personality
    return personality.internet_search(query)
|
||||
|
||||
def step_start(self, step_text, callback: Callable[[str, MSG_TYPE, dict, list], bool]=None):
|
||||
"""This triggers a step start
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user