# Timeout (in seconds) applied to every HTTP request issued by the PDF search
# helpers, so that a stalled server cannot hang the caller forever.
_PDF_SEARCH_TIMEOUT = 30

# HAL search endpoint and the Solr fields requested for every hit. Without an
# explicit ``fl`` the API only returns a minimal default field set.
_HAL_SEARCH_ENDPOINT = 'https://api.archives-ouvertes.fr/search/'
_HAL_RESULT_FIELDS = (
    'halId_s',
    'fileMain_s',
    'title_s',
    'authFullName_s',
    'abstract_s',
    'producedDate_tdate',
    'journalTitle_s',
)
# NOTE(review): filter field names are kept exactly as originally written.
# HAL date fields usually carry a type suffix (e.g. ``submittedDate_tdate``)
# - confirm against the HAL field list before relying on the date filter.
_HAL_DATE_FILTER_FIELD = 'submittedDate'
_HAL_AUTHOR_FILTER_FIELD = 'authIdHal_s'

# Root used to resolve the relative links found in ResearchGate result pages.
_RESEARCHGATE_BASE_URL = 'https://www.researchgate.net'


def _pdf_search_folder(client):
    """Return the folder where PDFs, abstracts and the report are stored."""
    if client is None:
        return Path("./pdf_search")
    return client.discussion.discussion_folder / "pdf_search"


def _safe_pdf_name(raw_name, fallback):
    """Turn an identifier or URL segment into a safe ``*.pdf`` file name."""
    import re

    # Drop any query string / fragment, then keep only harmless characters so
    # the name can neither escape the download folder nor be rejected by the OS.
    stem = raw_name.split('?', 1)[0].split('#', 1)[0]
    stem = re.sub(r'[^A-Za-z0-9._-]+', '_', stem).strip('._')
    if not stem:
        stem = fallback
    if not stem.lower().endswith('.pdf'):
        stem += '.pdf'
    return stem


def _tag_text(tag, default='N/A'):
    """Return the stripped text of a BeautifulSoup tag, or *default* if absent."""
    if tag is None:
        return default
    return tag.get_text(strip=True) or default


def _store_pdf_result(download_to, client, pdf_name, pdf_url, details):
    """Download one PDF, write its abstract side file and return its record.

    Raises:
        requests.RequestException: if the PDF cannot be downloaded.
        OSError: if a file cannot be written.
    """
    pdf_path = download_to / pdf_name

    # Download first: an entry whose PDF is unreachable leaves no stray files.
    pdf_response = requests.get(pdf_url, timeout=_PDF_SEARCH_TIMEOUT)
    pdf_response.raise_for_status()
    pdf_path.write_bytes(pdf_response.content)

    if client is None:
        local_url = f'/discussions/pdf_search/{pdf_name}'
    else:
        local_url = discussion_path_to_url(pdf_path)

    info = {**details, "pdf_url": pdf_url, "local_url": local_url}

    # Write abstract and additional information to text file
    abstract_path = download_to / f"{pdf_name}_abstract.txt"
    abstract_path.write_text(
        f"Title: {info['title']}\n"
        f"Authors: {info['authors']}\n"
        f"Abstract: {info['abstract']}\n"
        f"Published Date: {info['published_date']}\n"
        f"Journal/Conference: {info['journal_ref']}\n",
        encoding='utf-8',
    )
    return info


def _pdf_result_to_html(info):
    """Render one result record as an HTML fragment with escaped values."""
    from html import escape

    # Every value comes from a remote server, so it is escaped before being
    # embedded in markup that will be shown to the user.
    return (
        '<div class="pdf-search-result">\n'
        f'<h2>{escape(info["title"])}</h2>\n'
        f'<p><strong>Authors:</strong> {escape(info["authors"])}</p>\n'
        f'<p><strong>Abstract:</strong> {escape(info["abstract"])}</p>\n'
        f'<p><strong>Published Date:</strong> {escape(info["published_date"])}</p>\n'
        f'<p><strong>Journal/Conference:</strong> {escape(info["journal_ref"])}</p>\n'
        f'<p><a href="{escape(info["pdf_url"])}" target="_blank">PDF Link</a></p>\n'
        f'<p><a href="{escape(info["local_url"])}" target="_blank">Local PDF</a></p>\n'
        '</div>\n'
    )


def _pdf_result_to_report(info):
    """Render one result record as a plain-text report section."""
    return (
        f"\nTitle: {info['title']}\n"
        f"Authors: {info['authors']}\n"
        f"Abstract: {info['abstract']}\n"
        f"Published Date: {info['published_date']}\n"
        f"Journal/Conference: {info['journal_ref']}\n"
        f"PDF Link: {info['pdf_url']}\n"
        f"Local PDF: {info['local_url']}\n"
        "------------------------\n"
    )


def _finish_pdf_search(download_to, pdf_info):
    """Save the plain-text report and return the HTML for all results."""
    # Save the report to a text file
    report_path = download_to / "pdf_search_report.txt"
    report_path.write_text(
        ''.join(_pdf_result_to_report(info) for info in pdf_info.values()),
        encoding='utf-8',
    )
    return ''.join(_pdf_result_to_html(info) for info in pdf_info.values())


def _hal_field_values(entry, field_name):
    """Return all non-empty text values of *field_name* in a Solr ``<doc>``.

    Handles both single-valued fields (``<str name="x">v</str>``) and
    multi-valued ones (``<arr name="x"><str>v</str>...</arr>``).
    """
    values = []
    for node in entry.find_all(attrs={'name': field_name}):
        children = node.find_all(True, recursive=False)
        if children:
            values.extend(child.get_text(strip=True) for child in children)
        else:
            values.append(node.get_text(strip=True))
    return [value for value in values if value]


def _hal_search_url(query, max_results, sort_by, start_date, end_date, author):
    """Build a fully URL-encoded HAL search request."""
    from urllib.parse import urlencode

    params = [
        ('q', query),
        ('rows', 5 if max_results is None else max_results),
        # The parser below expects XML; the API answers in JSON by default.
        ('wt', 'xml'),
        ('fl', ','.join(_HAL_RESULT_FIELDS)),
    ]
    # Solr already orders by relevance (score) by default and has no field
    # called "relevance", so a sort clause is only sent for other criteria.
    if sort_by and sort_by != 'relevance':
        sort_clause = sort_by.strip()
        if ' ' not in sort_clause:
            # Solr sort clauses need an explicit direction.
            sort_clause += ' desc'
        params.append(('sort', sort_clause))
    # Either bound may be omitted; the open side becomes the Solr wildcard.
    if start_date or end_date:
        params.append(('fq', f'{_HAL_DATE_FILTER_FIELD}:[{start_date or "*"} TO {end_date or "*"}]'))
    if author:
        params.append(('fq', f'{_HAL_AUTHOR_FILTER_FIELD}:{author}'))
    return f'{_HAL_SEARCH_ENDPOINT}?{urlencode(params)}'


def _researchgate_search_url(query, max_results, sort_by, start_date, end_date, author):
    """Build a fully URL-encoded ResearchGate publication search request."""
    from urllib.parse import urlencode

    candidates = [
        ('q', query),
        ('limit', max_results),
        ('sort', sort_by),
        ('startDate', start_date),
        ('endDate', end_date),
        ('author', author),
    ]
    # Unset optional filters are left out instead of being sent as "None".
    params = [(key, value) for key, value in candidates if value not in (None, '')]
    return f'{_RESEARCHGATE_BASE_URL}/search/publication?{urlencode(params)}'


# Core function to search for PDFs on HAL and download them to a specified directory
def hal_pdf_search(query: str, max_results: Optional[int] = 5, sort_by: Optional[str] = 'relevance', start_date: Optional[str] = None, end_date: Optional[str] = None, author: Optional[str] = None, client: Optional[Any] = None) -> Tuple[str, Dict[str, Any]]:
    """Search HAL, download the main PDF of every hit and describe the results.

    Args:
        query: Search query sent to the HAL API.
        max_results: Maximum number of hits requested (5 when ``None``).
        sort_by: ``'relevance'`` (API default order) or a Solr sort clause
            such as ``'producedDate_tdate desc'``.
        start_date: Lower bound of the submission date filter (optional).
        end_date: Upper bound of the submission date filter (optional).
        author: Value matched against the HAL author identifier field (optional).
        client: Object exposing ``discussion.discussion_folder``; when ``None``
            files are stored under ``./pdf_search``.

    Returns:
        ``(html_output, pdf_info)`` where ``pdf_info`` maps each stored PDF
        file name to its metadata. On failure the result of
        ``trace_exception`` and an empty dict are returned instead.
    """
    try:
        download_to = _pdf_search_folder(client)

        # Construct the search URL with additional parameters
        url = _hal_search_url(query, max_results, sort_by, start_date, end_date, author)

        ASCIIColors.multicolor(["URL:",url],[ASCIIColors.red, ASCIIColors.yellow])
        response = requests.get(url, timeout=_PDF_SEARCH_TIMEOUT)
        response.raise_for_status()

        # Parse the response
        soup = BeautifulSoup(response.content, 'xml')

        # Create the directory if it doesn't exist
        download_to.mkdir(parents=True, exist_ok=True)

        pdf_info = {}
        for index, entry in enumerate(soup.find_all('doc')):
            pdf_urls = _hal_field_values(entry, 'fileMain_s')
            if not pdf_urls:
                # Record without an attached file: nothing to download.
                continue
            pdf_url = pdf_urls[0]

            # HAL file URLs commonly end with a generic segment, so the record
            # identifier is preferred to keep file names unique.
            hal_ids = _hal_field_values(entry, 'halId_s')
            raw_name = hal_ids[0] if hal_ids else pdf_url.rstrip('/').split('/')[-1]
            pdf_name = _safe_pdf_name(raw_name, f'hal_result_{index}')
            if pdf_name in pdf_info:
                pdf_name = f'{index}_{pdf_name}'

            # Extract additional information
            titles = _hal_field_values(entry, 'title_s')
            abstracts = _hal_field_values(entry, 'abstract_s')
            dates = _hal_field_values(entry, 'producedDate_tdate')
            journals = _hal_field_values(entry, 'journalTitle_s')
            details = {
                "title": titles[0] if titles else 'N/A',
                "authors": ', '.join(_hal_field_values(entry, 'authFullName_s')),
                "abstract": abstracts[0] if abstracts else 'N/A',
                "published_date": dates[0] if dates else 'N/A',
                "journal_ref": journals[0] if journals else 'N/A',
            }

            try:
                pdf_info[pdf_name] = _store_pdf_result(download_to, client, pdf_name, pdf_url, details)
            except (requests.RequestException, OSError) as e:
                # One unreachable PDF must not discard the other results.
                trace_exception(e)

        html_output = _finish_pdf_search(download_to, pdf_info)
        return html_output, pdf_info

    except Exception as e:
        return trace_exception(e), {}


# Metadata function
def hal_pdf_search_function(client: Optional[Any] = None):
    """Return the function-call descriptor exposing ``hal_pdf_search``."""
    return {
        "function_name": "hal_pdf_search", # The function name in string
        "function": partial(hal_pdf_search, client=client), # The function to be called with partial to preset client
        "function_description": "Searches for PDFs on HAL based on a query, downloads them to a specified directory, and returns a HTML string containing article details and links, along with a dictionary containing detailed information about each PDF.", # Description of the function
        "function_parameters": [ # The set of parameters
            {"name": "query", "type": "str", "description": "The search query for HAL."},
            {"name": "max_results", "type": "int", "description": "The maximum number of results to return. (Optional)", "optional": True, "default": 5},
            {"name": "sort_by", "type": "str", "description": "The sorting criteria for the search results (e.g., relevance, producedDate_tdate desc). (Optional)", "optional": True, "default": "relevance"},
            {"name": "start_date", "type": "str", "description": "The start date for the search results in the format YYYY-MM-DD. (Optional)", "optional": True},
            {"name": "end_date", "type": "str", "description": "The end date for the search results in the format YYYY-MM-DD. (Optional)", "optional": True},
            {"name": "author", "type": "str", "description": "The HAL author identifier (idHal) for the search results. (Optional)", "optional": True},
        ]
    }


# Core function to search for PDFs on ResearchGate and download them to a specified directory
def researchgate_pdf_search(query: str, max_results: Optional[int] = 5, sort_by: Optional[str] = 'relevance', start_date: Optional[str] = None, end_date: Optional[str] = None, author: Optional[str] = None, client: Optional[Any] = None) -> Tuple[str, Dict[str, Any]]:
    """Search ResearchGate, download every linked document and describe it.

    Args:
        query: Search query sent to the ResearchGate publication search.
        max_results: Value sent as the ``limit`` parameter.
        sort_by: Value sent as the ``sort`` parameter.
        start_date: Value sent as ``startDate`` (optional).
        end_date: Value sent as ``endDate`` (optional).
        author: Value sent as ``author`` (optional).
        client: Object exposing ``discussion.discussion_folder``; when ``None``
            files are stored under ``./pdf_search``.

    Returns:
        ``(html_output, pdf_info)`` where ``pdf_info`` maps each stored file
        name to its metadata. On failure the result of ``trace_exception``
        and an empty dict are returned instead.
    """
    from urllib.parse import urljoin, urlparse

    try:
        download_to = _pdf_search_folder(client)

        # Construct the search URL with additional parameters
        url = _researchgate_search_url(query, max_results, sort_by, start_date, end_date, author)

        ASCIIColors.multicolor(["URL:",url],[ASCIIColors.red, ASCIIColors.yellow])
        response = requests.get(url, timeout=_PDF_SEARCH_TIMEOUT)
        response.raise_for_status()

        # Parse the response
        soup = BeautifulSoup(response.content, 'html.parser')

        # Create the directory if it doesn't exist
        download_to.mkdir(parents=True, exist_ok=True)

        pdf_info = {}
        for index, entry in enumerate(soup.find_all('div', class_='nova-o-stack__item')):
            pdf_link_tag = entry.find('a', class_='nova-e-link')
            if not pdf_link_tag or not pdf_link_tag.get('href'):
                continue
            # Links in the result page may be relative to the site root.
            pdf_url = urljoin(_RESEARCHGATE_BASE_URL + '/', pdf_link_tag['href'])
            last_segment = urlparse(pdf_url).path.rstrip('/').split('/')[-1]
            pdf_name = _safe_pdf_name(last_segment, f'researchgate_result_{index}')
            if pdf_name in pdf_info:
                # Nested result containers can repeat the same link.
                continue

            # Extract additional information
            # NOTE(review): published date and journal are read from the same
            # selector, exactly as originally written - confirm which element
            # actually carries each value.
            size_s_text = _tag_text(entry.find('span', class_='nova-e-text--size-s'))
            details = {
                "title": _tag_text(entry.find('h3', class_='nova-e-text')),
                "authors": ', '.join(span.text.strip() for span in entry.find_all('span', class_='nova-e-text')),
                "abstract": _tag_text(entry.find('div', class_='nova-e-text')),
                "published_date": size_s_text,
                "journal_ref": size_s_text,
            }

            try:
                pdf_info[pdf_name] = _store_pdf_result(download_to, client, pdf_name, pdf_url, details)
            except (requests.RequestException, OSError) as e:
                # One unreachable document must not discard the other results.
                trace_exception(e)

        html_output = _finish_pdf_search(download_to, pdf_info)
        return html_output, pdf_info

    except Exception as e:
        return trace_exception(e), {}


# Metadata function
def researchgate_pdf_search_function(client: Optional[Any] = None):
    """Return the function-call descriptor exposing ``researchgate_pdf_search``."""
    return {
        "function_name": "researchgate_pdf_search", # The function name in string
        "function": partial(researchgate_pdf_search, client=client), # The function to be called with partial to preset client
        "function_description": "Searches for PDFs on ResearchGate based on a query, downloads them to a specified directory, and returns a HTML string containing article details and links, along with a dictionary containing detailed information about each PDF.", # Description of the function
        "function_parameters": [ # The set of parameters
            {"name": "query", "type": "str", "description": "The search query for ResearchGate."},
            {"name": "max_results", "type": "int", "description": "The maximum number of results to return. (Optional)", "optional": True, "default": 5},
            {"name": "sort_by", "type": "str", "description": "The sorting criteria for the search results (e.g., relevance, lastUpdatedDate). (Optional)", "optional": True, "default": "relevance"},
            {"name": "start_date", "type": "str", "description": "The start date for the search results in the format YYYY-MM-DD. (Optional)", "optional": True},
            {"name": "end_date", "type": "str", "description": "The end date for the search results in the format YYYY-MM-DD. (Optional)", "optional": True},
            {"name": "author", "type": "str", "description": "The author name for the search results. (Optional)", "optional": True},
        ]
    }