mirror of
https://github.com/ParisNeo/lollms.git
synced 2024-12-23 22:42:32 +00:00
added search function call
This commit is contained in:
parent
8bc8443a9e
commit
721b751a6f
64
lollms/functions/search.py
Normal file
64
lollms/functions/search.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
# Lollms function call definition file
|
||||||
|
# Import necessary libraries
|
||||||
|
import requests
|
||||||
|
from functools import partial
|
||||||
|
from typing import List
|
||||||
|
from lollms.utilities import PackageManager
|
||||||
|
from ascii_colors import trace_exception
|
||||||
|
|
||||||
|
# Ensure necessary packages are installed
|
||||||
|
if not PackageManager.check_package_installed("beautifulsoup4"):
|
||||||
|
PackageManager.install_package("beautifulsoup4")
|
||||||
|
if not PackageManager.check_package_installed("html2text"):
|
||||||
|
PackageManager.install_package("html2text")
|
||||||
|
|
||||||
|
# Import the libraries
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import html2text
|
||||||
|
|
||||||
|
def clean_html_content(html_content: str) -> str:
|
||||||
|
h = html2text.HTML2Text()
|
||||||
|
h.ignore_links = True
|
||||||
|
h.ignore_images = True
|
||||||
|
clean_text = h.handle(html_content)
|
||||||
|
return clean_text
|
||||||
|
|
||||||
|
def search_and_clean_content(keywords: List[str], num_pages: int) -> str:
|
||||||
|
try:
|
||||||
|
search_query = "+".join(keywords)
|
||||||
|
base_url = "https://www.google.com/search?q={query}&start={page}"
|
||||||
|
headers = {'User-Agent': 'Mozilla/5.0'}
|
||||||
|
|
||||||
|
all_cleaned_content = []
|
||||||
|
|
||||||
|
for page in range(0, num_pages * 10, 10):
|
||||||
|
url = base_url.format(query=search_query, page=page)
|
||||||
|
response = requests.get(url, headers=headers)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
return "Oops! Google is not cooperating. Try again later."
|
||||||
|
|
||||||
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
|
results = soup.find_all('div', class_='BNeawe s3v9rd AP7Wnd')
|
||||||
|
|
||||||
|
for result in results:
|
||||||
|
raw_html = result.get_text()
|
||||||
|
cleaned_content = clean_html_content(raw_html)
|
||||||
|
all_cleaned_content.append(cleaned_content)
|
||||||
|
|
||||||
|
formatted_content = "\n\n".join(all_cleaned_content)
|
||||||
|
return formatted_content
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return trace_exception(e)
|
||||||
|
|
||||||
|
def search_and_clean_content_function():
|
||||||
|
return {
|
||||||
|
"function_name": "search_and_clean_content",
|
||||||
|
"function": search_and_clean_content,
|
||||||
|
"function_description": "Searches multiple websites for the given keywords, cleans the content, and returns useful information formatted in sections.",
|
||||||
|
"function_parameters": [
|
||||||
|
{"name": "keywords", "type": "List[str]"},
|
||||||
|
{"name": "num_pages", "type": "int"}
|
||||||
|
]
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user