diff --git a/lollms/functions/bibliography.py b/lollms/functions/bibliography.py
new file mode 100644
index 0000000..110b73a
--- /dev/null
+++ b/lollms/functions/bibliography.py
@@ -0,0 +1,163 @@
+# Lollms function call definition file
+
+# Import necessary libraries
+import requests
+from pathlib import Path
+
+# Partial is useful if we need to preset some parameters
+from functools import partial
+
+# It is advised to import typing elements
+from typing import List, Optional
+
+# Import PackageManager if there are potential libraries that need to be installed
+from lollms.utilities import PackageManager, find_first_available_file_index, discussion_path_to_url
+
+# ascii_colors offers advanced console coloring and bug tracing
+from ascii_colors import trace_exception
+
+# Import Client from lollms.client_session
+from lollms.client_session import Client
+
+# Here is an example of how we install a non-installed library using PackageManager
+if not PackageManager.check_package_installed("bs4"):
+    PackageManager.install_package("beautifulsoup4")
+
+# Now we can import the library
+from bs4 import BeautifulSoup
+
+# Core function to search for PDFs on arXiv and download them to a specified directory
+def arxiv_pdf_search(query: str, max_results: Optional[int] = 5, sort_by: Optional[str] = 'relevance', start_date: Optional[str] = None, end_date: Optional[str] = None, author: Optional[str] = None, client: Optional[Client] = None) -> str:
+    try:
+        if client is None:
+            download_to = Path("./pdf_search")
+        else:
+            download_to = client.discussion.discussion_folder / "pdf_search"
+
+        # Construct the search URL with additional parameters
+        url = f'http://export.arxiv.org/api/query?search_query={query}&start=0&max_results={max_results}&sortBy={sort_by}'
+        if start_date:
+            url += f'&startDate={start_date}'
+        if end_date:
+            url += f'&endDate={end_date}'
+        if author:
+            url += f'&author={author}'
+
+        response = requests.get(url)
+        response.raise_for_status()
+
+        # Parse the response
+        soup = BeautifulSoup(response.content, 'xml')
+        entries = soup.find_all('entry')
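+        # The 'xml' feature is provided by the lxml package; if lxml is not
+        # installed, BeautifulSoup raises bs4.FeatureNotFound at this point.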
" + report_content = "" + for entry in entries: + pdf_url = entry.id.text.replace('abs', 'pdf') + '.pdf' + pdf_name = pdf_url.split('/')[-1] + pdf_path = download_to / pdf_name + + if client is None: + local_url = f'/discussions/pdf_search/{pdf_name}' + else: + local_url = discussion_path_to_url(pdf_path) + + # Extract additional information + title = entry.title.text + authors = ', '.join(author.find('name').text for author in entry.find_all('author')) + affiliations = ', '.join(affiliation.text for affiliation in entry.find_all('affiliation')) + abstract = entry.summary.text + published_date = entry.published.text + journal_ref = entry.find('journal_ref').text if entry.find('journal_ref') else 'N/A' + + # Write abstract and additional information to text file + abstract_path = download_to / f"{pdf_name}_abstract.txt" + with abstract_path.open('w', encoding='utf-8') as abstract_file: + abstract_file.write(f"Title: {title}\n") + abstract_file.write(f"Authors: {authors}\n") + abstract_file.write(f"Affiliations: {affiliations}\n") + abstract_file.write(f"Abstract: {abstract}\n") + abstract_file.write(f"Published Date: {published_date}\n") + abstract_file.write(f"Journal/Conference: {journal_ref}\n") + + # Download PDF + pdf_response = requests.get(pdf_url) + with pdf_path.open('wb') as pdf_file: + pdf_file.write(pdf_response.content) + + # Append to HTML output + html_output += f""" +Authors: {authors}
+
+            # Write abstract and additional information to text file
+            abstract_path = download_to / f"{pdf_name}_abstract.txt"
+            with abstract_path.open('w', encoding='utf-8') as abstract_file:
+                abstract_file.write(f"Title: {title}\n")
+                abstract_file.write(f"Authors: {authors}\n")
+                abstract_file.write(f"Affiliations: {affiliations}\n")
+                abstract_file.write(f"Abstract: {abstract}\n")
+                abstract_file.write(f"Published Date: {published_date}\n")
+                abstract_file.write(f"Journal/Conference: {journal_ref}\n")
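+            # pdf_name still carries its ".pdf" suffix, so this yields names like
+            # "2401.01234v1.pdf_abstract.txt"; 'w' mode overwrites any earlier copy.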
+
+            # Download PDF, checking the request succeeded before writing to disk
+            pdf_response = requests.get(pdf_url)
+            pdf_response.raise_for_status()
+            with pdf_path.open('wb') as pdf_file:
+                pdf_file.write(pdf_response.content)
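+            # Downloads happen sequentially inside the loop, so the call blocks
+            # until every requested PDF has been fetched and written to disk.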
+
+            # Append to HTML output
+            html_output += f"""
+            <div>
+                <a href="{local_url}" target="_blank">{title}</a><br>
+                Authors: {authors}<br>
+                Affiliations: {affiliations}<br>
+                Abstract: {abstract}<br>
+                Published Date: {published_date}<br>
+                Journal/Conference: {journal_ref}<br>
+            </div>
+            """
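For reference, a minimal usage sketch (not part of the diff). It assumes the lollms package is importable, that no Client is passed so files land in ./pdf_search as shown above, and that the function's return value (defined in the truncated remainder of the file) is the accumulated HTML summary:

    from lollms.functions.bibliography import arxiv_pdf_search

    # Search arXiv for up to three papers on quantum computing, newest first,
    # downloading each PDF and writing an abstract text file next to it.
    html_summary = arxiv_pdf_search(
        query="all:quantum+computing",
        max_results=3,
        sort_by="submittedDate",
    )
    print(html_summary)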