From 3653bd4e80ea5f5d4baa356b7bac65bedd1f4b2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D9=8A?=
Date: Sat, 18 Jan 2025 09:58:58 +0530
Subject: [PATCH] Update arxiv_schlorly_research.py

Function definitions:
- fetch_arxiv_data: Fetches arXiv data based on a query.
- create_dataframe: Creates a DataFrame from the provided data.
- get_arxiv_main_content: Returns the main content of an arXiv paper.
- download_image: Downloads an image from a URL.
- scrape_images_from_arxiv: Scrapes images from an arXiv page.
- arxiv_bibtex: Generates the BibTeX entry for an arXiv paper.
- extract_arxiv_ids_from_line: Extracts arXiv IDs from a given line of text.
- read_written_ids: Reads already written arXiv IDs from a file.
- append_id_to_file: Appends a single arXiv ID to a file.

Suggested code improvements:

Code duplication:
- Combine similar functions: Functions such as fetch_arxiv_data and
  create_dataframe can be combined or refactored to reduce redundancy.
- Reuse code: Ensure common functionality is abstracted into reusable
  functions.

Performance and optimization:
- Optimize API calls: Ensure the arXiv API calls are optimized and handle
  rate limits.
- Efficient data handling: Use more efficient data handling techniques,
  such as batch processing for large datasets.

Coding standards and best practices:
- Add docstrings: Ensure all functions have detailed docstrings explaining
  their purpose, arguments, and return values.
- Error handling: Improve error handling to provide more informative error
  messages and handle different types of errors separately.
- Logging: Use a consistent logging strategy to log important events and
  errors.
- Code structure: Group related functions into classes or modules for
  better organization and maintainability.
- PEP 8 compliance: Ensure the code follows PEP 8 standards for Python
  code style.
--- .../arxiv_schlorly_research.py | 312 ++++++++---------- 1 file changed, 137 insertions(+), 175 deletions(-) diff --git a/lib/ai_web_researcher/arxiv_schlorly_research.py b/lib/ai_web_researcher/arxiv_schlorly_research.py index cb833b22..347b408f 100644 --- a/lib/ai_web_researcher/arxiv_schlorly_research.py +++ b/lib/ai_web_researcher/arxiv_schlorly_research.py @@ -7,61 +7,59 @@ #################################################### -import os +import os import sys import re - import pandas as pd import arxiv import PyPDF2 -import io import requests from bs4 import BeautifulSoup -import urllib.parse +from urllib.parse import urlparse from loguru import logger -logger.remove() -logger.add(sys.stdout, - colorize=True, - format="{level}|{file}:{line}:{function}| {message}" - ) +logger.remove() +logger.add(sys.stdout, colorize=True, format="{level}|{file}:{line}:{function}| {message}") def fetch_arxiv_data(query, max_results=10): + """ + Fetches arXiv data based on a query. + + Args: + query (str): The search query. + max_results (int): The maximum number of results to fetch. + + Returns: + list: A list of arXiv data. 
+ """ try: - # Construct the default API client client = arxiv.Client() - - # Search for articles matching the keyword - search = arxiv.Search( - query=query, - max_results=max_results, - sort_by=arxiv.SortCriterion.SubmittedDate - ) - - # Fetching results + search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate) results = list(client.results(search)) - # Extracting data - all_data = [] - for result in results: - temp = [result.title, result.published, result.entry_id, result.summary, result.pdf_url] - all_data.append(temp) - + all_data = [[result.title, result.published, result.entry_id, result.summary, result.pdf_url] for result in results] return all_data - except Exception as e: - print("An error occurred while fetching data from arXiv:", e) + logger.error(f"An error occurred while fetching data from arXiv: {e}") raise e - def create_dataframe(data, column_names): + """ + Creates a DataFrame from the provided data. + + Args: + data (list): The data to convert to a DataFrame. + column_names (list): The column names for the DataFrame. + + Returns: + DataFrame: The created DataFrame. + """ try: df = pd.DataFrame(data, columns=column_names) return df except Exception as e: - print("An error occurred while creating DataFrame:", e) + logger.error(f"An error occurred while creating DataFrame: {e}") return pd.DataFrame() - def get_arxiv_main_content(url): """ Returns the main content of an arXiv paper. @@ -72,211 +70,181 @@ def get_arxiv_main_content(url): Returns: str: The main content of the paper as a string. 
""" - try: - # Send a GET request to the URL response = requests.get(url) - response.raise_for_status() # Raise an exception for HTTP errors - - # Parse the HTML content + response.raise_for_status() soup = BeautifulSoup(response.content, "html.parser") - - # Find the main content in 'ltx_page_content' main_content = soup.find('div', class_='ltx_page_content') if not main_content: logger.warning("Main content not found in the page.") return "Main content not found." - - # Remove specific section with class 'package-alerts ltx_document' alert_section = main_content.find('div', class_='package-alerts ltx_document') - if alert_section: + if (alert_section): alert_section.decompose() - - # Optional: Remove abstract and authors if present for element_id in ["abs", "authors"]: element = main_content.find(id=element_id) - if element: + if (element): element.decompose() return main_content.text.strip() - - # Could not access the arxiv HTML content, instead download pdf and read its content. except Exception as html_error: logger.warning(f"HTML content not accessible, trying PDF: {html_error}") - try: - # Extract arXiv ID from URL - arxiv_id = url.split('/')[-1] - # Fetch paper information using arXiv API - paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id]))) - pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf") - # Initialize an empty string to store the extracted text - pdf_text = '' + return get_pdf_content(url) - # Read the downloaded PDF - with open(pdf_filename, 'rb', encoding="utf-8") as f: - pdf_reader = PyPDF2.PdfReader(f) +def get_pdf_content(url): + """ + Helper function to get the content from a PDF if HTML content is not accessible. 
- for page in pdf_reader.pages: - try: - # Attempt to extract text from the current page - page_text = page.extract_text() - # If text extraction is successful, add it to the cumulative text - if page_text: - pdf_text += page_text + '\n' - except UnicodeDecodeError as err: - # FIXME: Handle any UnicodeDecodeError that arises during text extraction - logger.error(f"UnicodeDecodeError that arises during text extraction: {err}") - pass + Args: + url (str): The URL of the arXiv paper. - # Optionally, remove the downloaded PDF file - os.remove(pdf_filename) - - # Pattern to match 'References' and everything that follows - pattern = r'References\s*.*' - pdf_text = re.sub(pattern, '', pdf_text, flags=re.IGNORECASE | re.DOTALL) - sections_to_remove = ['Acknowledgements', 'References', 'Bibliography'] - for section in sections_to_remove: - # Pattern to match the section title and any text following it until the next big title or end of document - pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)' - pdf_text = re.sub(pattern, '', pdf_text, flags=re.DOTALL | re.IGNORECASE) + Returns: + str: The main content of the paper as a string. + """ + try: + client = arxiv.Client() + arxiv_id = url.split('/')[-1] + paper = next(client.results(arxiv.Search(id_list=[arxiv_id]))) + pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf") + pdf_text = '' + with open(pdf_filename, 'rb') as f: + pdf_reader = PyPDF2.PdfReader(f) + for page in pdf_reader.pages: + try: + page_text = page.extract_text() + if page_text: + pdf_text += page_text + '\n' + except UnicodeDecodeError as err: + logger.error(f"UnicodeDecodeError that arises during text extraction: {err}") + pass + os.remove(pdf_filename) + pdf_text = clean_pdf_text(pdf_text) + return pdf_text + except Exception as pdf_error: + logger.error(f"Failed to process PDF: {pdf_error}") + return "Failed to retrieve content." 
- return pdf_text +def clean_pdf_text(text): + """ + Helper function to clean the text extracted from a PDF. - except Exception as pdf_error: - logger.error(f"Failed to process PDF: {pdf_error}") - return "Failed to retrieve content." + Args: + text (str): The text to clean. + Returns: + str: The cleaned text. + """ + pattern = r'References\s*.*' + text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL) + sections_to_remove = ['Acknowledgements', 'References', 'Bibliography'] + for section in sections_to_remove: + pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)' + text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE) + return text def download_image(image_url, base_url, folder="images"): - # Skip downloading if the image URL is a data URI - if image_url.startswith('data:image'): - print(f"Skipping download of data URI image: {image_url}") - return False + """ + Downloads an image from a URL. - # Create the folder if it doesn't exist + Args: + image_url (str): The URL of the image. + base_url (str): The base URL of the website. + folder (str): The folder to save the image. + + Returns: + bool: True if the image was downloaded successfully, False otherwise. 
+ """ + if image_url.startswith('data:image'): + logger.info(f"Skipping download of data URI image: {image_url}") + return False if not os.path.exists(folder): os.makedirs(folder) - - # Form the absolute URL for image paths - if not urllib.parse.urlparse(image_url).scheme: + if not urlparse(image_url).scheme: if not base_url.endswith('/'): base_url += '/' image_url = base_url + image_url - - # Download and save the image try: response = requests.get(image_url) response.raise_for_status() - image_name = image_url.split("/")[-1] - with open(os.path.join(folder, image_name), 'wb', encoding="utf-8") as file: + with open(os.path.join(folder, image_name), 'wb') as file: file.write(response.content) return True - except requests.RequestException as e: - print(f"Error downloading {image_url}: {str(e)}") + logger.error(f"Error downloading {image_url}: {e}") return False - def scrape_images_from_arxiv(url): + """ + Scrapes images from an arXiv page. + + Args: + url (str): The URL of the arXiv page. + + Returns: + list: A list of image URLs. + """ try: response = requests.get(url) response.raise_for_status() - soup = BeautifulSoup(response.text, 'html.parser') images = soup.find_all('img') - image_urls = [img['src'] for img in images if 'src' in img.attrs] return image_urls - except requests.RequestException as e: - print(f"Error fetching page {url}: {str(e)}") + logger.error(f"Error fetching page {url}: {e}") return [] - def arxiv_bibtex(arxiv_id): """ Get the BibTeX entry for an arXiv paper. + Args: arxiv_id: The arXiv ID of the paper. + Returns: A string containing the BibTeX entry. 
""" - - import urllib.request, xml.dom.minidom - - # Download the XML try: usock = urllib.request.urlopen(f'http://export.arxiv.org/api/query?id_list={arxiv_id}') xmldoc = xml.dom.minidom.parse(usock) usock.close() + entry = xmldoc.getElementsByTagName("entry")[0] + date = entry.getElementsByTagName("updated")[0].firstChild.data + text_year = date[:4] + title = entry.getElementsByTagName("title")[0] + text_title = title.firstChild.data.strip() + authorlist = [] + first = True + for person_name in entry.getElementsByTagName("author"): + name = person_name.getElementsByTagName("name")[0] + text_name = name.firstChild.data + text_given_name = ' '.join(text_name.split()[:-1]) + text_surname = text_name.split()[-1] + authorlist.append(f"{text_surname}, {text_given_name}") + if first: + text_first_author_surname = text_surname + first = False + bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n" + bibtex += f" author = {' and '.join(authorlist)},\n" + bibtex += f" title = {{{text_title}}},\n" + bibtex += f" year = {{{text_year}}},\n" + bibtex += f" eprint = {{{arxiv_id}}},\n" + bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n" + bibtex += "}" + return bibtex except Exception as e: - raise e - - # Parse the XML - entry = xmldoc.getElementsByTagName("entry")[0] - date = entry.getElementsByTagName("updated")[0].firstChild.data - text_year = date[:4] - - title = entry.getElementsByTagName("title")[0] - text_title = title.firstChild.data.strip() - - authorlist = [] - first = True - for person_name in entry.getElementsByTagName("author"): - # Get names - name = person_name.getElementsByTagName("name")[0] - text_name = name.firstChild.data - text_given_name = ' '.join(text_name.split()[:-1]) - text_surname = text_name.split()[-1] - authorlist.append(f"{text_surname}, {text_given_name}") - # First author? 
- if first: - text_first_author_surname = text_surname - first = False - - # Construct the BibTeX entry - bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n" - bibtex += f" author = {' and '.join(authorlist)},\n" - bibtex += f" title = {{{text_title}}},\n" - bibtex += f" year = {{{text_year}}},\n" - bibtex += f" eprint = {{{arxiv_id}}},\n" - bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n" - bibtex += "}" - - return bibtex - - -#from serpapi import GoogleSearch -#params = { -# "api_key": "os.getenv(SERPER_API_KEY)", -# "engine": "google_scholar", -# "q": "llm", -# "hl": "en", -# "as_ylo": "2023", -# "as_yhi": "2024" -#} -#search = GoogleSearch(params) -#results = search.get_dict() - -#from llmsherpa.readers import LayoutPDFReader - -#llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all" -#pdf_url = "https://arxiv.org/pdf/1910.13461.pdf" # also allowed is a file path e.g. /home/downloads/xyz.pdf -#pdf_reader = LayoutPDFReader(llmsherpa_api_url) -#doc = pdf_reader.read_pdf(pdf_url) - - - + logger.error(f"Error while generating BibTeX: {e}") + return "" def extract_arxiv_ids_from_line(line): """ Extract the arXiv ID from a given line of text. Args: - line (str): A line of text potentially containing an arXiv URL. + line (str): A line of text potentially containing an arXiv URL. Returns: - str: The extracted arXiv ID, or None if not found. + str: The extracted arXiv ID, or None if not found. """ arxiv_id_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?') match = arxiv_id_pattern.search(line) @@ -284,16 +252,15 @@ def extract_arxiv_ids_from_line(line): return match.group(1) + (match.group(2) if match.group(2) else '') return None - def read_written_ids(file_path): """ Read already written arXiv IDs from a file. Args: - file_path (str): Path to the file containing written IDs. + file_path (str): Path to the file containing written IDs. Returns: - set: A set of arXiv IDs. 
+ set: A set of arXiv IDs. """ written_ids = set() try: @@ -306,27 +273,22 @@ def read_written_ids(file_path): logger.error(f"Error while reading the file: {e}") return written_ids - def append_id_to_file(arxiv_id, output_file_path): """ Append a single arXiv ID to a file. Checks if the file exists and creates it if not. Args: - arxiv_id (str): The arXiv ID to append. - output_file_path (str): Path to the output file. + arxiv_id (str): The arXiv ID to append. + output_file_path (str): Path to the output file. """ try: - # Check if file exists if not os.path.exists(output_file_path): logger.info(f"File does not exist. Creating new file: {output_file_path}") - # Create a new file and append the ID with open(output_file_path, 'a', encoding="utf-8") as outfile: outfile.write(arxiv_id + '\n') else: logger.info(f"Appending to existing file: {output_file_path}") - # File exists, append the ID with open(output_file_path, 'a', encoding="utf-8") as outfile: outfile.write(arxiv_id + '\n') - except Exception as e: logger.error(f"Error while appending to file: {e}")