From 3653bd4e80ea5f5d4baa356b7bac65bedd1f4b2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D9=8A?= <ajay.calsoft@gmail.com>
Date: Sat, 18 Jan 2025 09:58:58 +0530
Subject: [PATCH] Update arxiv_schlorly_research.py

Function definitions in this module:
- fetch_arxiv_data: Fetches arXiv data based on a query.
- create_dataframe: Creates a DataFrame from the provided data.
- get_arxiv_main_content: Returns the main content of an arXiv paper.
- download_image: Downloads an image from a URL.
- scrape_images_from_arxiv: Scrapes images from an arXiv page.
- arxiv_bibtex: Generates the BibTeX entry for an arXiv paper.
- extract_arxiv_ids_from_line: Extracts arXiv IDs from a given line of text.
- read_written_ids: Reads already written arXiv IDs from a file.
- append_id_to_file: Appends a single arXiv ID to a file.

Suggested code improvements:

Code duplication:
- Combine similar functions: functions such as fetch_arxiv_data and create_dataframe can be combined or refactored to reduce redundancy.
- Reuse code: ensure common functionality is abstracted into reusable functions.

Performance and optimization:
- Optimize API calls: ensure the arXiv API calls are optimized and handle rate limits.
- Efficient data handling: use more efficient data handling techniques, such as batch processing for large datasets.

Coding standards and best practices:
- Add docstrings: ensure all functions have detailed docstrings explaining their purpose, arguments, and return values.
- Error handling: improve error handling to provide more informative error messages and handle different types of errors separately.
- Logging: use a consistent logging strategy to log important events and errors.
- Code structure: group related functions into classes or modules for better organization and maintainability.
- PEP 8 compliance: ensure the code follows PEP 8 standards for Python code style.
---
 .../arxiv_schlorly_research.py                | 312 ++++++++----------
 1 file changed, 137 insertions(+), 175 deletions(-)
diff --git a/lib/ai_web_researcher/arxiv_schlorly_research.py b/lib/ai_web_researcher/arxiv_schlorly_research.py
index cb833b22..347b408f 100644
--- a/lib/ai_web_researcher/arxiv_schlorly_research.py
+++ b/lib/ai_web_researcher/arxiv_schlorly_research.py
@@ -7,61 +7,59 @@
 ####################################################
 
 
-import os 
+import os
 import sys
 import re
-
 import pandas as pd
 import arxiv
 import PyPDF2
-import io
 import requests
 from bs4 import BeautifulSoup
-import urllib.parse
+from urllib.parse import urlparse
 from loguru import logger
-logger.remove()
-logger.add(sys.stdout,
-        colorize=True,
-        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
-    )
 
+logger.remove()
+logger.add(sys.stdout, colorize=True, format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}")
 
 def fetch_arxiv_data(query, max_results=10):
+    """
+    Fetches arXiv data based on a query.
+
+    Args:
+        query (str): The search query.
+        max_results (int): The maximum number of results to fetch.
+
+    Returns:
+        list: A list of arXiv data.
+    """
     try:
-        # Construct the default API client
         client = arxiv.Client()
-
-        # Search for articles matching the keyword
-        search = arxiv.Search(
-            query=query,
-            max_results=max_results,
-            sort_by=arxiv.SortCriterion.SubmittedDate
-        )
-
-        # Fetching results
+        search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
         results = list(client.results(search))
-        # Extracting data
-        all_data = []
-        for result in results:
-            temp = [result.title, result.published, result.entry_id, result.summary, result.pdf_url]
-            all_data.append(temp)
-
+        all_data = [[result.title, result.published, result.entry_id, result.summary, result.pdf_url] for result in results]
         return all_data
-
     except Exception as e:
-        print("An error occurred while fetching data from arXiv:", e)
+        logger.error(f"An error occurred while fetching data from arXiv: {e}")
         raise e
 
-
 def create_dataframe(data, column_names):
+    """
+    Creates a DataFrame from the provided data.
+
+    Args:
+        data (list): The data to convert to a DataFrame.
+        column_names (list): The column names for the DataFrame.
+
+    Returns:
+        DataFrame: The created DataFrame.
+    """
     try:
         df = pd.DataFrame(data, columns=column_names)
         return df
     except Exception as e:
-        print("An error occurred while creating DataFrame:", e)
+        logger.error(f"An error occurred while creating DataFrame: {e}")
         return pd.DataFrame()
 
-
 def get_arxiv_main_content(url):
     """
     Returns the main content of an arXiv paper.
@@ -72,211 +70,181 @@ def get_arxiv_main_content(url):
     Returns:
         str: The main content of the paper as a string.
     """
-
     try:
-        # Send a GET request to the URL
         response = requests.get(url)
-        response.raise_for_status()  # Raise an exception for HTTP errors
-
-        # Parse the HTML content
+        response.raise_for_status()
         soup = BeautifulSoup(response.content, "html.parser")
-
-        # Find the main content in 'ltx_page_content'
         main_content = soup.find('div', class_='ltx_page_content')
         if not main_content:
             logger.warning("Main content not found in the page.")
             return "Main content not found."
-
-        # Remove specific section with class 'package-alerts ltx_document'
         alert_section = main_content.find('div', class_='package-alerts ltx_document')
-        if alert_section:
+        if (alert_section):
             alert_section.decompose()
-
-        # Optional: Remove abstract and authors if present
         for element_id in ["abs", "authors"]:
             element = main_content.find(id=element_id)
-            if element:
+            if (element):
                 element.decompose()
         return main_content.text.strip()
-
-    # Could not access the arxiv HTML content, instead download pdf and read its content.
     except Exception as html_error:
         logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
-        try:
-            # Extract arXiv ID from URL
-            arxiv_id = url.split('/')[-1]
-            # Fetch paper information using arXiv API
-            paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id])))
-            pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
-            # Initialize an empty string to store the extracted text
-            pdf_text = ''
+        return get_pdf_content(url)
 
-            # Read the downloaded PDF
-            with open(pdf_filename, 'rb', encoding="utf-8") as f:
-                pdf_reader = PyPDF2.PdfReader(f)
+def get_pdf_content(url):
+    """
+    Helper function to get the content from a PDF if HTML content is not accessible.
 
-                for page in pdf_reader.pages:
-                    try:
-                        # Attempt to extract text from the current page
-                        page_text = page.extract_text()
-                        # If text extraction is successful, add it to the cumulative text
-                        if page_text:
-                            pdf_text += page_text + '\n'
-                    except UnicodeDecodeError as err:
-                        # FIXME: Handle any UnicodeDecodeError that arises during text extraction
-                        logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
-                        pass
+    Args:
+        url (str): The URL of the arXiv paper.
 
-            # Optionally, remove the downloaded PDF file
-            os.remove(pdf_filename)
-            
-            # Pattern to match 'References' and everything that follows
-            pattern = r'References\s*.*'
-            pdf_text = re.sub(pattern, '', pdf_text, flags=re.IGNORECASE | re.DOTALL)
-            sections_to_remove = ['Acknowledgements', 'References', 'Bibliography']
-            for section in sections_to_remove:
-                # Pattern to match the section title and any text following it until the next big title or end of document
-                pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
-                pdf_text = re.sub(pattern, '', pdf_text, flags=re.DOTALL | re.IGNORECASE)
+    Returns:
+        str: The main content of the paper as a string.
+    """
+    try:
+        client = arxiv.Client()
+        arxiv_id = url.split('/')[-1]
+        paper = next(client.results(arxiv.Search(id_list=[arxiv_id])))
+        pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
+        pdf_text = ''
+        with open(pdf_filename, 'rb') as f:
+            pdf_reader = PyPDF2.PdfReader(f)
+            for page in pdf_reader.pages:
+                try:
+                    page_text = page.extract_text()
+                    if page_text:
+                        pdf_text += page_text + '\n'
+                except UnicodeDecodeError as err:
+                    logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
+                    pass
+        os.remove(pdf_filename)
+        pdf_text = clean_pdf_text(pdf_text)
+        return pdf_text
+    except Exception as pdf_error:
+        logger.error(f"Failed to process PDF: {pdf_error}")
+        return "Failed to retrieve content."
 
-            return pdf_text
+def clean_pdf_text(text):
+    """
+    Helper function to clean the text extracted from a PDF.
 
-        except Exception as pdf_error:
-            logger.error(f"Failed to process PDF: {pdf_error}")
-            return "Failed to retrieve content."
+    Args:
+        text (str): The text to clean.
 
+    Returns:
+        str: The cleaned text.
+    """
+    pattern = r'References\s*.*'
+    text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
+    sections_to_remove = ['Acknowledgements', 'References', 'Bibliography']
+    for section in sections_to_remove:
+        pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
+        text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
+    return text
 
 def download_image(image_url, base_url, folder="images"):
-    # Skip downloading if the image URL is a data URI
-    if image_url.startswith('data:image'):
-        print(f"Skipping download of data URI image: {image_url}")
-        return False
+    """
+    Downloads an image from a URL.
 
-    # Create the folder if it doesn't exist
+    Args:
+        image_url (str): The URL of the image.
+        base_url (str): The base URL of the website.
+        folder (str): The folder to save the image.
+
+    Returns:
+        bool: True if the image was downloaded successfully, False otherwise.
+    """
+    if image_url.startswith('data:image'):
+        logger.info(f"Skipping download of data URI image: {image_url}")
+        return False
     if not os.path.exists(folder):
         os.makedirs(folder)
-
-    # Form the absolute URL for image paths
-    if not urllib.parse.urlparse(image_url).scheme:
+    if not urlparse(image_url).scheme:
         if not base_url.endswith('/'):
             base_url += '/'
         image_url = base_url + image_url
-
-    # Download and save the image
     try:
         response = requests.get(image_url)
         response.raise_for_status()
-
         image_name = image_url.split("/")[-1]
-        with open(os.path.join(folder, image_name), 'wb', encoding="utf-8") as file:
+        with open(os.path.join(folder, image_name), 'wb') as file:
             file.write(response.content)
         return True
-
     except requests.RequestException as e:
-        print(f"Error downloading {image_url}: {str(e)}")
+        logger.error(f"Error downloading {image_url}: {e}")
         return False
 
-
 def scrape_images_from_arxiv(url):
+    """
+    Scrapes images from an arXiv page.
+
+    Args:
+        url (str): The URL of the arXiv page.
+
+    Returns:
+        list: A list of image URLs.
+    """
     try:
         response = requests.get(url)
         response.raise_for_status()
-
         soup = BeautifulSoup(response.text, 'html.parser')
         images = soup.find_all('img')
-
         image_urls = [img['src'] for img in images if 'src' in img.attrs]
         return image_urls
-
     except requests.RequestException as e:
-        print(f"Error fetching page {url}: {str(e)}")
+        logger.error(f"Error fetching page {url}: {e}")
         return []
 
-
 def arxiv_bibtex(arxiv_id):
     """
     Get the BibTeX entry for an arXiv paper.
+
     Args: 
         arxiv_id: The arXiv ID of the paper.
+
     Returns: 
         A string containing the BibTeX entry.
     """
-
-    import urllib.request, xml.dom.minidom
-
-    # Download the XML
     try:
         usock = urllib.request.urlopen(f'http://export.arxiv.org/api/query?id_list={arxiv_id}')
         xmldoc = xml.dom.minidom.parse(usock)
         usock.close()
+        entry = xmldoc.getElementsByTagName("entry")[0]
+        date = entry.getElementsByTagName("updated")[0].firstChild.data
+        text_year = date[:4]
+        title = entry.getElementsByTagName("title")[0]
+        text_title = title.firstChild.data.strip()
+        authorlist = []
+        first = True
+        for person_name in entry.getElementsByTagName("author"):
+            name = person_name.getElementsByTagName("name")[0]
+            text_name = name.firstChild.data
+            text_given_name = ' '.join(text_name.split()[:-1])
+            text_surname = text_name.split()[-1]
+            authorlist.append(f"{text_surname}, {text_given_name}")
+            if first:
+                text_first_author_surname = text_surname
+                first = False
+        bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
+        bibtex += f" author = {{{' and '.join(authorlist)}}},\n"
+        bibtex += f" title = {{{text_title}}},\n"
+        bibtex += f" year = {{{text_year}}},\n"
+        bibtex += f" eprint = {{{arxiv_id}}},\n"
+        bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
+        bibtex += "}"
+        return bibtex
     except Exception as e:
-        raise e
-
-    # Parse the XML
-    entry = xmldoc.getElementsByTagName("entry")[0]
-    date = entry.getElementsByTagName("updated")[0].firstChild.data
-    text_year = date[:4]
-
-    title = entry.getElementsByTagName("title")[0]
-    text_title = title.firstChild.data.strip()
-
-    authorlist = []
-    first = True
-    for person_name in entry.getElementsByTagName("author"):
-        # Get names
-        name = person_name.getElementsByTagName("name")[0]
-        text_name = name.firstChild.data
-        text_given_name = ' '.join(text_name.split()[:-1])
-        text_surname = text_name.split()[-1]
-        authorlist.append(f"{text_surname}, {text_given_name}")
-        # First author?
-        if first:
-            text_first_author_surname = text_surname
-            first = False
-
-    # Construct the BibTeX entry
-    bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
-    bibtex += f" author = {' and '.join(authorlist)},\n"
-    bibtex += f" title = {{{text_title}}},\n"
-    bibtex += f" year = {{{text_year}}},\n"
-    bibtex += f" eprint = {{{arxiv_id}}},\n"
-    bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
-    bibtex += "}"
-
-    return bibtex
-
-
-#from serpapi import GoogleSearch
-#params = {
-#  "api_key": "os.getenv(SERPER_API_KEY)",
-#  "engine": "google_scholar",
-#  "q": "llm",
-#  "hl": "en",
-#  "as_ylo": "2023",
-#  "as_yhi": "2024"
-#}
-#search = GoogleSearch(params)
-#results = search.get_dict()
-
-#from llmsherpa.readers import LayoutPDFReader
-
-#llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
-#pdf_url = "https://arxiv.org/pdf/1910.13461.pdf" # also allowed is a file path e.g. /home/downloads/xyz.pdf
-#pdf_reader = LayoutPDFReader(llmsherpa_api_url)
-#doc = pdf_reader.read_pdf(pdf_url)
-
-
-
+        logger.error(f"Error while generating BibTeX: {e}")
+        return ""
 
 def extract_arxiv_ids_from_line(line):
     """
     Extract the arXiv ID from a given line of text.
 
     Args:
-    line (str): A line of text potentially containing an arXiv URL.
+        line (str): A line of text potentially containing an arXiv URL.
 
     Returns:
-    str: The extracted arXiv ID, or None if not found.
+        str: The extracted arXiv ID, or None if not found.
     """
     arxiv_id_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?')
     match = arxiv_id_pattern.search(line)
@@ -284,16 +252,15 @@ def extract_arxiv_ids_from_line(line):
         return match.group(1) + (match.group(2) if match.group(2) else '')
     return None
 
-
 def read_written_ids(file_path):
     """
     Read already written arXiv IDs from a file.
 
     Args:
-    file_path (str): Path to the file containing written IDs.
+        file_path (str): Path to the file containing written IDs.
 
     Returns:
-    set: A set of arXiv IDs.
+        set: A set of arXiv IDs.
     """
     written_ids = set()
     try:
@@ -306,27 +273,22 @@ def read_written_ids(file_path):
         logger.error(f"Error while reading the file: {e}")
     return written_ids
 
-
 def append_id_to_file(arxiv_id, output_file_path):
     """
     Append a single arXiv ID to a file. Checks if the file exists and creates it if not.
 
     Args:
-    arxiv_id (str): The arXiv ID to append.
-    output_file_path (str): Path to the output file.
+        arxiv_id (str): The arXiv ID to append.
+        output_file_path (str): Path to the output file.
     """
     try:
-        # Check if file exists
         if not os.path.exists(output_file_path):
             logger.info(f"File does not exist. Creating new file: {output_file_path}")
-            # Create a new file and append the ID
             with open(output_file_path, 'a', encoding="utf-8") as outfile:
                 outfile.write(arxiv_id + '\n')
         else:
             logger.info(f"Appending to existing file: {output_file_path}")
-            # File exists, append the ID
             with open(output_file_path, 'a', encoding="utf-8") as outfile:
                 outfile.write(arxiv_id + '\n')
-
     except Exception as e:
         logger.error(f"Error while appending to file: {e}")