mirror of
https://github.com/ParisNeo/lollms.git
synced 2025-01-13 16:29:49 +00:00
Create bibliography.py
This commit is contained in:
parent
7468f4414c
commit
5896a56bcc
163
lollms/functions/bibliography.py
Normal file
163
lollms/functions/bibliography.py
Normal file
@ -0,0 +1,163 @@
|
||||
# Lollms function call definition file
|
||||
|
||||
# Import necessary libraries
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
# Partial is useful if we need to preset some parameters
|
||||
from functools import partial
|
||||
|
||||
# It is advised to import typing elements
|
||||
from typing import List, Optional
|
||||
|
||||
# Import PackageManager if there are potential libraries that need to be installed
|
||||
from lollms.utilities import PackageManager, find_first_available_file_index, discussion_path_to_url
|
||||
|
||||
# ascii_colors offers advanced console coloring and bug tracing
|
||||
from ascii_colors import trace_exception
|
||||
|
||||
# Import Client from lollms.client_session
|
||||
from lollms.client_session import Client
|
||||
|
||||
# Here is an example of how we install a non-installed library using PackageManager
|
||||
if not PackageManager.check_package_installed("bs4"):
|
||||
PackageManager.install_package("beautifulsoup4")
|
||||
|
||||
# Now we can import the library
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Core function to search for PDFs on arXiv and download them to a specified directory
|
||||
def arxiv_pdf_search(query: str, max_results: Optional[int] = 5, sort_by: Optional[str] = 'relevance', start_date: Optional[str] = None, end_date: Optional[str] = None, author: Optional[str] = None, client: Optional[Client] = None) -> str:
|
||||
try:
|
||||
if client is None:
|
||||
download_to = Path("./pdf_search")
|
||||
else:
|
||||
download_to = client.discussion.discussion_folder / "pdf_search"
|
||||
|
||||
# Construct the search URL with additional parameters
|
||||
url = f'http://export.arxiv.org/api/query?search_query={query}&start=0&max_results={max_results}&sortBy={sort_by}'
|
||||
if start_date:
|
||||
url += f'&startDate={start_date}'
|
||||
if end_date:
|
||||
url += f'&endDate={end_date}'
|
||||
if author:
|
||||
url += f'&author={author}'
|
||||
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse the response
|
||||
soup = BeautifulSoup(response.content, 'xml')
|
||||
entries = soup.find_all('entry')
|
||||
|
||||
# Create the directory if it doesn't exist
|
||||
download_to.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Extract PDF URLs and additional information
|
||||
html_output = "<html><body>"
|
||||
report_content = ""
|
||||
for entry in entries:
|
||||
pdf_url = entry.id.text.replace('abs', 'pdf') + '.pdf'
|
||||
pdf_name = pdf_url.split('/')[-1]
|
||||
pdf_path = download_to / pdf_name
|
||||
|
||||
if client is None:
|
||||
local_url = f'/discussions/pdf_search/{pdf_name}'
|
||||
else:
|
||||
local_url = discussion_path_to_url(pdf_path)
|
||||
|
||||
# Extract additional information
|
||||
title = entry.title.text
|
||||
authors = ', '.join(author.find('name').text for author in entry.find_all('author'))
|
||||
affiliations = ', '.join(affiliation.text for affiliation in entry.find_all('affiliation'))
|
||||
abstract = entry.summary.text
|
||||
published_date = entry.published.text
|
||||
journal_ref = entry.find('journal_ref').text if entry.find('journal_ref') else 'N/A'
|
||||
|
||||
# Write abstract and additional information to text file
|
||||
abstract_path = download_to / f"{pdf_name}_abstract.txt"
|
||||
with abstract_path.open('w', encoding='utf-8') as abstract_file:
|
||||
abstract_file.write(f"Title: {title}\n")
|
||||
abstract_file.write(f"Authors: {authors}\n")
|
||||
abstract_file.write(f"Affiliations: {affiliations}\n")
|
||||
abstract_file.write(f"Abstract: {abstract}\n")
|
||||
abstract_file.write(f"Published Date: {published_date}\n")
|
||||
abstract_file.write(f"Journal/Conference: {journal_ref}\n")
|
||||
|
||||
# Download PDF
|
||||
pdf_response = requests.get(pdf_url)
|
||||
with pdf_path.open('wb') as pdf_file:
|
||||
pdf_file.write(pdf_response.content)
|
||||
|
||||
# Append to HTML output
|
||||
html_output += f"""
|
||||
<div>
|
||||
<h2>{title}</h2>
|
||||
<p><strong>Authors:</strong> {authors}</p>
|
||||
<p><strong>Affiliations:</strong> {affiliations}</p>
|
||||
<p><strong>Abstract:</strong> {abstract}</p>
|
||||
<p><strong>Published Date:</strong> {published_date}</p>
|
||||
<p><strong>Journal/Conference:</strong> {journal_ref}</p>
|
||||
<p><a href="{pdf_url}" target="_blank">PDF Link</a></p>
|
||||
<p><a href="{local_url}" target="_blank">Local PDF</a></p>
|
||||
</div>
|
||||
<hr />
|
||||
"""
|
||||
# Append to report content
|
||||
report_content += f"""
|
||||
Title: {title}
|
||||
Authors: {authors}
|
||||
Affiliations: {affiliations}
|
||||
Abstract: {abstract}
|
||||
Published Date: {published_date}
|
||||
Journal/Conference: {journal_ref}
|
||||
PDF Link: {pdf_url}
|
||||
Local PDF: {local_url}
|
||||
------------------------
|
||||
"""
|
||||
|
||||
# Save the report to a text file
|
||||
report_path = download_to / "pdf_search_report.txt"
|
||||
with report_path.open('w', encoding='utf-8') as report_file:
|
||||
report_file.write(report_content)
|
||||
|
||||
html_output += "</body></html>"
|
||||
return html_output
|
||||
|
||||
except Exception as e:
|
||||
return trace_exception(e)
|
||||
|
||||
# Metadata function
|
||||
def arxiv_pdf_search_function(client: Optional[Client] = None):
|
||||
return {
|
||||
"function_name": "arxiv_pdf_search", # The function name in string
|
||||
"function": partial(arxiv_pdf_search, client=client), # The function to be called with partial to preset client
|
||||
"function_description": "Searches for PDFs on arXiv based on a query, downloads them to a specified directory, and returns a HTML string containing article details and links.", # Description of the function
|
||||
"function_parameters": [ # The set of parameters
|
||||
{"name": "query", "type": "str", "description": "The search query for arXiv."},
|
||||
{"name": "max_results", "type": "int", "description": "The maximum number of results to return. (Optional)", "optional": True, "default": 5},
|
||||
{"name": "sort_by", "type": "str", "description": "The sorting criteria for the search results (e.g., relevance, lastUpdatedDate). (Optional)", "optional": True, "default": "relevance"},
|
||||
{"name": "start_date", "type": "str", "description": "The start date for the search results in the format YYYY-MM-DD. (Optional)", "optional": True},
|
||||
{"name": "end_date", "type": "str", "description": "The end date for the search results in the format YYYY-MM-DD. (Optional)", "optional": True},
|
||||
{"name": "author", "type": "str", "description": "The author name for the search results. (Optional)", "optional": True},
|
||||
]
|
||||
}
|
||||
|
||||
# Example usage
|
||||
if __name__ == "__main__":
|
||||
# Example client initialization (if needed)
|
||||
client = None # Replace with actual client initialization if available
|
||||
|
||||
# Example function call
|
||||
html_output = arxiv_pdf_search(
|
||||
query="machine learning",
|
||||
max_results=3,
|
||||
sort_by="relevance",
|
||||
start_date="2020-01-01",
|
||||
end_date="2021-01-01",
|
||||
author="John Doe",
|
||||
client=client
|
||||
)
|
||||
|
||||
# Print the HTML output
|
||||
print(html_output)
|
Loading…
Reference in New Issue
Block a user