Create bibliography.py

2025-04-20 16:40:48 +00:00 · 2024-05-28 09:52:41 +02:00 · 2024-05-28 09:52:41 +02:00 · 5896a56bcc
commit 5896a56bcc
parent 7468f4414c
1 changed files with 163 additions and 0 deletions
--- a/lollms/functions/bibliography.py
+++ b/lollms/functions/bibliography.py
@ -0,0 +1,163 @@
+# Lollms function call definition file
+
+# Import necessary libraries
+import requests
+from pathlib import Path
+
+# Partial is useful if we need to preset some parameters
+from functools import partial
+
+# It is advised to import typing elements
+from typing import List, Optional
+
+# Import PackageManager if there are potential libraries that need to be installed 
+from lollms.utilities import PackageManager, find_first_available_file_index, discussion_path_to_url
+
+# ascii_colors offers advanced console coloring and bug tracing
+from ascii_colors import trace_exception
+
+# Import Client from lollms.client_session
+from lollms.client_session import Client
+
+# Here is an example of how we install a non-installed library using PackageManager
+if not PackageManager.check_package_installed("bs4"):
+    PackageManager.install_package("beautifulsoup4")
+
+# Now we can import the library
+from bs4 import BeautifulSoup
+
+# Core function to search for PDFs on arXiv and download them to a specified directory
+def arxiv_pdf_search(query: str, max_results: Optional[int] = 5, sort_by: Optional[str] = 'relevance', start_date: Optional[str] = None, end_date: Optional[str] = None, author: Optional[str] = None, client: Optional[Client] = None) -> str:
+    try:
+        if client is None:
+            download_to = Path("./pdf_search")
+        else:
+            download_to = client.discussion.discussion_folder / "pdf_search"
+        
+        # Construct the search URL with additional parameters
+        url = f'http://export.arxiv.org/api/query?search_query={query}&start=0&max_results={max_results}&sortBy={sort_by}'
+        if start_date:
+            url += f'&startDate={start_date}'
+        if end_date:
+            url += f'&endDate={end_date}'
+        if author:
+            url += f'&author={author}'
+        
+        response = requests.get(url)
+        response.raise_for_status()
+
+        # Parse the response
+        soup = BeautifulSoup(response.content, 'xml')
+        entries = soup.find_all('entry')
+
+        # Create the directory if it doesn't exist
+        download_to.mkdir(parents=True, exist_ok=True)
+
+        # Extract PDF URLs and additional information
+        html_output = "<html><body>"
+        report_content = ""
+        for entry in entries:
+            pdf_url = entry.id.text.replace('abs', 'pdf') + '.pdf'
+            pdf_name = pdf_url.split('/')[-1]
+            pdf_path = download_to / pdf_name
+
+            if client is None:
+                local_url = f'/discussions/pdf_search/{pdf_name}'
+            else:
+                local_url = discussion_path_to_url(pdf_path)
+
+            # Extract additional information
+            title = entry.title.text
+            authors = ', '.join(author.find('name').text for author in entry.find_all('author'))
+            affiliations = ', '.join(affiliation.text for affiliation in entry.find_all('affiliation'))
+            abstract = entry.summary.text
+            published_date = entry.published.text
+            journal_ref = entry.find('journal_ref').text if entry.find('journal_ref') else 'N/A'
+            
+            # Write abstract and additional information to text file
+            abstract_path = download_to / f"{pdf_name}_abstract.txt"
+            with abstract_path.open('w', encoding='utf-8') as abstract_file:
+                abstract_file.write(f"Title: {title}\n")
+                abstract_file.write(f"Authors: {authors}\n")
+                abstract_file.write(f"Affiliations: {affiliations}\n")
+                abstract_file.write(f"Abstract: {abstract}\n")
+                abstract_file.write(f"Published Date: {published_date}\n")
+                abstract_file.write(f"Journal/Conference: {journal_ref}\n")
+
+            # Download PDF
+            pdf_response = requests.get(pdf_url)
+            with pdf_path.open('wb') as pdf_file:
+                pdf_file.write(pdf_response.content)
+
+            # Append to HTML output
+            html_output += f"""
+            <div>
+                <h2>{title}</h2>
+                <p><strong>Authors:</strong> {authors}</p>
+                <p><strong>Affiliations:</strong> {affiliations}</p>
+                <p><strong>Abstract:</strong> {abstract}</p>
+                <p><strong>Published Date:</strong> {published_date}</p>
+                <p><strong>Journal/Conference:</strong> {journal_ref}</p>
+                <p><a href="{pdf_url}" target="_blank">PDF Link</a></p>
+                <p><a href="{local_url}" target="_blank">Local PDF</a></p>
+            </div>
+            <hr />
+            """
+            # Append to report content
+            report_content += f"""
+            Title: {title}
+            Authors: {authors}
+            Affiliations: {affiliations}
+            Abstract: {abstract}
+            Published Date: {published_date}
+            Journal/Conference: {journal_ref}
+            PDF Link: {pdf_url}
+            Local PDF: {local_url}
+            ------------------------
+            """
+        
+        # Save the report to a text file
+        report_path = download_to / "pdf_search_report.txt"
+        with report_path.open('w', encoding='utf-8') as report_file:
+            report_file.write(report_content)
+        
+        html_output += "</body></html>"
+        return html_output
+
+    except Exception as e:
+        return trace_exception(e)
+
+# Metadata function
+def arxiv_pdf_search_function(client: Optional[Client] = None):
+    return {
+        "function_name": "arxiv_pdf_search",  # The function name in string
+        "function": partial(arxiv_pdf_search, client=client),  # The function to be called with partial to preset client
+        "function_description": "Searches for PDFs on arXiv based on a query, downloads them to a specified directory, and returns a HTML string containing article details and links.",  # Description of the function
+        "function_parameters": [  # The set of parameters
+            {"name": "query", "type": "str", "description": "The search query for arXiv."},
+            {"name": "max_results", "type": "int", "description": "The maximum number of results to return. (Optional)", "optional": True, "default": 5},
+            {"name": "sort_by", "type": "str", "description": "The sorting criteria for the search results (e.g., relevance, lastUpdatedDate). (Optional)", "optional": True, "default": "relevance"},
+            {"name": "start_date", "type": "str", "description": "The start date for the search results in the format YYYY-MM-DD. (Optional)", "optional": True},
+            {"name": "end_date", "type": "str", "description": "The end date for the search results in the format YYYY-MM-DD. (Optional)", "optional": True},
+            {"name": "author", "type": "str", "description": "The author name for the search results. (Optional)", "optional": True},
+        ]
+    }
+
+# Example usage
+if __name__ == "__main__":
+    # Example client initialization (if needed)
+    client = None  # Replace with actual client initialization if available
+
+    # Example function call
+    html_output = arxiv_pdf_search(
+        query="machine learning",
+        max_results=3,
+        sort_by="relevance",
+        start_date="2020-01-01",
+        end_date="2021-01-01",
+        author="John Doe",
+        client=client
+    )
+
+    # Print the HTML output
+    print(html_output)