# ALwrity/lib/arxiv_schlorly_research.py

####################################################
#
# FIXME: Switch this module over to one of these libraries:
# https://github.com/monk1337/resp/tree/main
# https://github.com/danielnsilva/semanticscholar
# https://github.com/shauryr/S2QA
#
####################################################


import os
import sys
import re

import pandas as pd
import arxiv
import PyPDF2
import io
import requests
from bs4 import BeautifulSoup
import urllib.parse
from loguru import logger
# Reconfigure loguru for this module: remove() is called with no arguments
# (per the loguru docs this drops every registered handler, including the
# default stderr one), then a single colorized stdout sink is added whose
# records read "LEVEL|file:line:function| message".
logger.remove()
logger.add(sys.stdout,
        colorize=True,
        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
    )


def fetch_arxiv_data(query, max_results=10):
    """
    Search arXiv and return one metadata row per matching paper.

    Args:
        query (str): Search expression handed to ``arxiv.Search``.
        max_results (int): Maximum number of results to request.

    Returns:
        list: One ``[title, published, entry_id, summary, pdf_url]`` list per
        result, in the order the client yields them (the search is sorted by
        submission date).

    Raises:
        Exception: Any error raised while searching is printed and re-raised.
    """
    try:
        request = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        # Drain the result generator completely before building any row.
        papers = list(arxiv.Client().results(request))
        return [
            [paper.title, paper.published, paper.entry_id, paper.summary, paper.pdf_url]
            for paper in papers
        ]
    except Exception as e:
        print("An error occurred while fetching data from arXiv:", e)
        raise e


def create_dataframe(data, column_names):
    """
    Build a pandas DataFrame from row data, falling back to an empty frame.

    Args:
        data: Row data accepted by ``pd.DataFrame``.
        column_names: Column labels for the resulting frame.

    Returns:
        pd.DataFrame: The populated frame, or an empty DataFrame when
        construction raised (the error is printed, not propagated).
    """
    try:
        return pd.DataFrame(data, columns=column_names)
    except Exception as build_error:
        print("An error occurred while creating DataFrame:", build_error)
    return pd.DataFrame()


# A "References"/"Bibliography" heading standing alone on its line, optionally
# preceded by a section number ("7", "7.", "7.1"). Anchoring to the whole line
# keeps ordinary prose such as "this references prior work" from matching.
_REFERENCES_HEADING_RE = re.compile(
    r'^[ \t]*(?:\d+(?:\.\d+)*\.?[ \t]+)?(?:References|Bibliography)[ \t]*$',
    flags=re.IGNORECASE | re.MULTILINE,
)

# An acknowledgements section: from its heading at the start of a line up to the
# next line that begins with two or more capital letters, or the end of the text.
# Only the heading word is case-insensitive; the look-ahead must stay
# case-sensitive, otherwise "[A-Z]{2,}" would match any two letters.
_ACKNOWLEDGEMENTS_SECTION_RE = re.compile(
    r'^[ \t]*(?:\d+(?:\.\d+)*\.?[ \t]+)?(?i:Acknowledge?ments?)\b'
    r'.*?(?=\n[A-Z]{2,}|\Z)',
    flags=re.MULTILINE | re.DOTALL,
)


def _strip_back_matter(text):
    """Drop the reference list (to the end of the text) and any acknowledgements section."""
    heading = _REFERENCES_HEADING_RE.search(text)
    if heading:
        text = text[:heading.start()]
    return _ACKNOWLEDGEMENTS_SECTION_RE.sub('', text)


def _read_pdf_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, one page per chunk."""
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        for page in PyPDF2.PdfReader(pdf_file).pages:
            try:
                page_text = page.extract_text()
            except UnicodeDecodeError as err:
                # FIXME: Handle any UnicodeDecodeError that arises during text extraction
                logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
                continue
            if page_text:
                page_texts.append(page_text + '\n')
    return ''.join(page_texts)


def get_arxiv_main_content(url, timeout=30):
    """
    Returns the main content of an arXiv paper.

    The HTML rendering at ``url`` is tried first. If fetching or parsing it
    raises, the paper's PDF is downloaded through the arXiv API, its text is
    extracted, and the reference list and acknowledgements are stripped.

    Args:
        url (str): The URL of the arXiv paper. Its last path segment is used
            as the arXiv ID for the PDF fallback.
        timeout (float): Seconds to wait for the HTML request before giving
            up and falling back to the PDF.

    Returns:
        str: The main content of the paper as a string. Two sentinel strings
        are returned instead of raising: "Main content not found." when the
        HTML page loads but has no ``ltx_page_content`` div, and
        "Failed to retrieve content." when the PDF fallback fails as well.
    """
    try:
        # A timeout keeps an unresponsive server from blocking the caller forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        soup = BeautifulSoup(response.content, "html.parser")

        # Find the main content in 'ltx_page_content'
        main_content = soup.find('div', class_='ltx_page_content')
        if not main_content:
            logger.warning("Main content not found in the page.")
            return "Main content not found."

        # Remove specific section with class 'package-alerts ltx_document'
        alert_section = main_content.find('div', class_='package-alerts ltx_document')
        if alert_section:
            alert_section.decompose()

        # Optional: Remove abstract and authors if present
        for element_id in ("abs", "authors"):
            element = main_content.find(id=element_id)
            if element:
                element.decompose()
        return main_content.text.strip()

    # Best-effort boundary: any failure on the HTML path triggers the PDF fallback.
    except Exception as html_error:
        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")

    pdf_path = None
    try:
        # Extract arXiv ID from URL; tolerate a trailing slash.
        arxiv_id = url.rstrip('/').split('/')[-1]
        # Fetch paper information using arXiv API
        paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id])))
        pdf_path = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
        return _strip_back_matter(_read_pdf_text(pdf_path))
    except Exception as pdf_error:
        logger.error(f"Failed to process PDF: {pdf_error}")
        return "Failed to retrieve content."
    finally:
        # Always remove the temporary PDF, even when extraction failed midway.
        if pdf_path and os.path.exists(pdf_path):
            try:
                os.remove(pdf_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary PDF {pdf_path}: {cleanup_error}")


def download_image(image_url, base_url, folder="images", timeout=30):
    """
    Download one image into ``folder``.

    Args:
        image_url (str): Absolute or relative image URL. ``data:image`` URIs
            are skipped.
        base_url (str): Page URL that relative ``image_url`` values are
            resolved against. It is treated as a directory, so a trailing
            slash is added when missing.
        folder (str): Destination directory; created if it does not exist.
        timeout (float): Seconds to wait for the HTTP request.

    Returns:
        bool: True when the image was written to disk, False when it was
        skipped, no file name could be derived, or the download/write failed.
    """
    # Skip downloading if the image URL is a data URI
    if image_url.startswith('data:image'):
        print(f"Skipping download of data URI image: {image_url}")
        return False

    # Form the absolute URL for image paths. urljoin (rather than string
    # concatenation) also resolves root-relative ("/static/a.png") and
    # protocol-relative ("//host/a.png") sources correctly.
    if not urllib.parse.urlparse(image_url).scheme:
        if not base_url.endswith('/'):
            base_url += '/'
        image_url = urllib.parse.urljoin(base_url, image_url)

    # Name the file after the URL *path* only, so a query string or fragment
    # never ends up in the file name. Reject names that are not a file.
    image_name = os.path.basename(urllib.parse.urlparse(image_url).path)
    if image_name in ('', '.', '..'):
        print(f"Error downloading {image_url}: cannot derive a file name from the URL")
        return False

    # Download and save the image
    try:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(folder, exist_ok=True)

        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()

        with open(os.path.join(folder, image_name), 'wb') as file:
            file.write(response.content)
        return True

    except (requests.RequestException, OSError) as e:
        # File-system errors are reported the same way as network errors so
        # the function keeps its True/False contract.
        print(f"Error downloading {image_url}: {str(e)}")
        return False


def scrape_images_from_arxiv(url, timeout=30):
    """
    Collect the ``src`` attribute of every ``<img>`` tag on a page.

    Args:
        url (str): URL of the page to fetch.
        timeout (float): Seconds to wait for the HTTP request, so an
            unresponsive server cannot block the caller indefinitely.

    Returns:
        list: The ``src`` values in document order, exactly as written in the
        page (they may be relative or data URIs). An empty list is returned
        when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        return [img['src'] for img in soup.find_all('img') if img.has_attr('src')]

    except requests.RequestException as e:
        print(f"Error fetching page {url}: {str(e)}")
        return []


def arxiv_bibtex(arxiv_id):
    """
    Get the BibTeX entry for an arXiv paper.

    The paper's metadata is read from the arXiv export API (Atom feed) and
    rendered as an ``@MISC`` entry keyed by the first author's surname plus
    the two-digit year of the entry's ``updated`` date.

    Args:
        arxiv_id: The arXiv ID of the paper.
    Returns:
        A string containing the BibTeX entry.
    Raises:
        IndexError: If the feed contains no ``entry`` element for the ID.
        Exception: Network and XML parsing errors propagate unchanged.
    """

    import urllib.request
    import xml.dom.minidom

    # Download the XML. The context manager closes the connection even when
    # parsing fails.
    query_url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
    with urllib.request.urlopen(query_url, timeout=30) as usock:
        xmldoc = xml.dom.minidom.parse(usock)

    # Parse the XML
    entries = xmldoc.getElementsByTagName("entry")
    if not entries:
        raise IndexError(f"No arXiv entry found for id {arxiv_id!r}")
    entry = entries[0]

    text_year = entry.getElementsByTagName("updated")[0].firstChild.data[:4]

    # Long titles are wrapped across lines in the feed; collapse all runs of
    # whitespace so the BibTeX title is a single line.
    raw_title = entry.getElementsByTagName("title")[0].firstChild.data
    text_title = ' '.join(raw_title.split())

    authorlist = []
    surnames = []
    for person_name in entry.getElementsByTagName("author"):
        name_parts = person_name.getElementsByTagName("name")[0].firstChild.data.split()
        if not name_parts:
            continue
        # The last token is taken as the surname, everything before it as given names.
        *given_names, text_surname = name_parts
        if given_names:
            authorlist.append(f"{text_surname}, {' '.join(given_names)}")
        else:
            # Single-token name: avoid emitting a dangling "Name, ".
            authorlist.append(text_surname)
        surnames.append(text_surname)

    # Fall back to a fixed key prefix when the entry lists no authors.
    key_prefix = surnames[0] if surnames else "arXiv"

    # Construct the BibTeX entry. Every field value, including the author
    # list, is wrapped in braces as BibTeX requires.
    bibtex = f"@MISC{{{key_prefix}{text_year[-2:]},\n"
    bibtex += f" author = {{{' and '.join(authorlist)}}},\n"
    bibtex += f" title = {{{text_title}}},\n"
    bibtex += f" year = {{{text_year}}},\n"
    bibtex += f" eprint = {{{arxiv_id}}},\n"
    bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
    bibtex += "}"

    return bibtex


#from serpapi import GoogleSearch
#params = {
#  "api_key": "os.getenv(SERPER_API_KEY)",
#  "engine": "google_scholar",
#  "q": "llm",
#  "hl": "en",
#  "as_ylo": "2023",
#  "as_yhi": "2024"
#}
#search = GoogleSearch(params)
#results = search.get_dict()


def extract_arxiv_ids_from_line(line):
    """
    Pull the first arXiv ID out of a line that contains an arXiv abstract URL.

    Only new-style numeric IDs (digits, a dot, digits) following
    ``arxiv.org/abs/`` are recognised; a version suffix such as ``v2`` is
    kept when present.

    Args:
    line (str): A line of text potentially containing an arXiv URL.

    Returns:
    str: The extracted arXiv ID (with version suffix if any), or None if not found.
    """
    found = re.search(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?', line)
    if found is None:
        return None
    # An absent version group is reported as '' so the pieces can be joined directly.
    base_id, version = found.groups(default='')
    return base_id + version


def read_written_ids(file_path):
    """
    Load the arXiv IDs recorded in a file, one per line.

    Each line is stripped of surrounding whitespace before being stored, so a
    blank line contributes an empty string. Read errors are logged rather
    than raised; whatever was collected up to that point is returned.

    Args:
    file_path (str): Path to the file containing written IDs.

    Returns:
    set: A set of arXiv IDs (empty when the file is missing).
    """
    collected = set()
    try:
        with open(file_path, 'r') as handle:
            # update() adds entries as it iterates, so IDs read before a
            # mid-file error are kept.
            collected.update(entry.strip() for entry in handle)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
    except Exception as e:
        logger.error(f"Error while reading the file: {e}")
    return collected


def append_id_to_file(arxiv_id, output_file_path):
    """
    Record one arXiv ID by appending it, newline-terminated, to a file.

    The file is opened in append mode, which creates it when missing; an
    info message notes beforehand whether the file already existed. Any
    error is logged and swallowed.

    Args:
    arxiv_id (str): The arXiv ID to append.
    output_file_path (str): Path to the output file.
    """
    try:
        # Only the log message depends on whether the file is already there.
        if os.path.exists(output_file_path):
            logger.info(f"Appending to existing file: {output_file_path}")
        else:
            logger.info(f"File does not exist. Creating new file: {output_file_path}")

        with open(output_file_path, 'a') as outfile:
            outfile.write(arxiv_id + '\n')

    except Exception as e:
        logger.error(f"Error while appending to file: {e}")