ALwrity/lib/ai_web_researcher/arxiv_schlorly_research.py

####################################################
#
# FIXME: Gotta use this lib: https://github.com/monk1337/resp/tree/main
# https://github.com/danielnsilva/semanticscholar
# https://github.com/shauryr/S2QA
#
####################################################


import os
import sys
import re
import pandas as pd
import arxiv
import PyPDF2
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from loguru import logger

logger.remove()
logger.add(sys.stdout, colorize=True, format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}")

def fetch_arxiv_data(query, max_results=10):
    """
    Fetches arXiv data based on a query.

    Args:
        query (str): The search query.
        max_results (int): The maximum number of results to fetch.

    Returns:
        list: A list of arXiv data.
    """
    try:
        client = arxiv.Client()
        search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
        results = list(client.results(search))
        all_data = [[result.title, result.published, result.entry_id, result.summary, result.pdf_url] for result in results]
        return all_data
    except Exception as e:
        logger.error(f"An error occurred while fetching data from arXiv: {e}")
        raise e

def create_dataframe(data, column_names):
    """
    Creates a DataFrame from the provided data.

    Args:
        data (list): The data to convert to a DataFrame.
        column_names (list): The column names for the DataFrame.

    Returns:
        DataFrame: The created DataFrame.
    """
    try:
        df = pd.DataFrame(data, columns=column_names)
        return df
    except Exception as e:
        logger.error(f"An error occurred while creating DataFrame: {e}")
        return pd.DataFrame()

def get_arxiv_main_content(url):
    """
    Returns the main content of an arXiv paper.

    Args:
        url (str): The URL of the arXiv paper.

    Returns:
        str: The main content of the paper as a string.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        main_content = soup.find('div', class_='ltx_page_content')
        if not main_content:
            logger.warning("Main content not found in the page.")
            return "Main content not found."
        alert_section = main_content.find('div', class_='package-alerts ltx_document')
        if (alert_section):
            alert_section.decompose()
        for element_id in ["abs", "authors"]:
            element = main_content.find(id=element_id)
            if (element):
                element.decompose()
        return main_content.text.strip()
    except Exception as html_error:
        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
        return get_pdf_content(url)

def get_pdf_content(url):
    """
    Helper function to get the content from a PDF if HTML content is not accessible.

    Args:
        url (str): The URL of the arXiv paper.

    Returns:
        str: The main content of the paper as a string.
    """
    try:
        client = arxiv.Client()
        arxiv_id = url.split('/')[-1]
        paper = next(client.results(arxiv.Search(id_list=[arxiv_id])))
        pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
        pdf_text = ''
        with open(pdf_filename, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                try:
                    page_text = page.extract_text()
                    if page_text:
                        pdf_text += page_text + '\n'
                except UnicodeDecodeError as err:
                    logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
                    pass
        os.remove(pdf_filename)
        pdf_text = clean_pdf_text(pdf_text)
        return pdf_text
    except Exception as pdf_error:
        logger.error(f"Failed to process PDF: {pdf_error}")
        return "Failed to retrieve content."

def clean_pdf_text(text):
    """
    Helper function to clean the text extracted from a PDF.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    pattern = r'References\s*.*'
    text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
    sections_to_remove = ['Acknowledgements', 'References', 'Bibliography']
    for section in sections_to_remove:
        pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
        text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
    return text

def download_image(image_url, base_url, folder="images"):
    """
    Downloads an image from a URL.

    Args:
        image_url (str): The URL of the image.
        base_url (str): The base URL of the website.
        folder (str): The folder to save the image.

    Returns:
        bool: True if the image was downloaded successfully, False otherwise.
    """
    if image_url.startswith('data:image'):
        logger.info(f"Skipping download of data URI image: {image_url}")
        return False
    if not os.path.exists(folder):
        os.makedirs(folder)
    if not urlparse(image_url).scheme:
        if not base_url.endswith('/'):
            base_url += '/'
        image_url = base_url + image_url
    try:
        response = requests.get(image_url)
        response.raise_for_status()
        image_name = image_url.split("/")[-1]
        with open(os.path.join(folder, image_name), 'wb') as file:
            file.write(response.content)
        return True
    except requests.RequestException as e:
        logger.error(f"Error downloading {image_url}: {e}")
        return False

def scrape_images_from_arxiv(url):
    """
    Scrapes images from an arXiv page.

    Args:
        url (str): The URL of the arXiv page.

    Returns:
        list: A list of image URLs.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        images = soup.find_all('img')
        image_urls = [img['src'] for img in images if 'src' in img.attrs]
        return image_urls
    except requests.RequestException as e:
        logger.error(f"Error fetching page {url}: {e}")
        return []

def arxiv_bibtex(arxiv_id):
    """
    Get the BibTeX entry for an arXiv paper.

    Args:
        arxiv_id: The arXiv ID of the paper.

    Returns:
        A string containing the BibTeX entry.
    """
    try:
        usock = urllib.request.urlopen(f'http://export.arxiv.org/api/query?id_list={arxiv_id}')
        xmldoc = xml.dom.minidom.parse(usock)
        usock.close()
        entry = xmldoc.getElementsByTagName("entry")[0]
        date = entry.getElementsByTagName("updated")[0].firstChild.data
        text_year = date[:4]
        title = entry.getElementsByTagName("title")[0]
        text_title = title.firstChild.data.strip()
        authorlist = []
        first = True
        for person_name in entry.getElementsByTagName("author"):
            name = person_name.getElementsByTagName("name")[0]
            text_name = name.firstChild.data
            text_given_name = ' '.join(text_name.split()[:-1])
            text_surname = text_name.split()[-1]
            authorlist.append(f"{text_surname}, {text_given_name}")
            if first:
                text_first_author_surname = text_surname
                first = False
        bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
        bibtex += f" author = {' and '.join(authorlist)},\n"
        bibtex += f" title = {{{text_title}}},\n"
        bibtex += f" year = {{{text_year}}},\n"
        bibtex += f" eprint = {{{arxiv_id}}},\n"
        bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
        bibtex += "}"
        return bibtex
    except Exception as e:
        logger.error(f"Error while generating BibTeX: {e}")
        return ""

def extract_arxiv_ids_from_line(line):
    """
    Extract the arXiv ID from a given line of text.

    Args:
        line (str): A line of text potentially containing an arXiv URL.

    Returns:
        str: The extracted arXiv ID, or None if not found.
    """
    arxiv_id_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?')
    match = arxiv_id_pattern.search(line)
    if match:
        return match.group(1) + (match.group(2) if match.group(2) else '')
    return None

def read_written_ids(file_path):
    """
    Read already written arXiv IDs from a file.

    Args:
        file_path (str): Path to the file containing written IDs.

    Returns:
        set: A set of arXiv IDs.
    """
    written_ids = set()
    try:
        with open(file_path, 'r', encoding="utf-8") as file:
            for line in file:
                written_ids.add(line.strip())
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
    except Exception as e:
        logger.error(f"Error while reading the file: {e}")
    return written_ids

def append_id_to_file(arxiv_id, output_file_path):
    """
    Append a single arXiv ID to a file. Checks if the file exists and creates it if not.

    Args:
        arxiv_id (str): The arXiv ID to append.
        output_file_path (str): Path to the output file.
    """
    try:
        if not os.path.exists(output_file_path):
            logger.info(f"File does not exist. Creating new file: {output_file_path}")
            with open(output_file_path, 'a', encoding="utf-8") as outfile:
                outfile.write(arxiv_id + '\n')
        else:
            logger.info(f"Appending to existing file: {output_file_path}")
            with open(output_file_path, 'a', encoding="utf-8") as outfile:
                outfile.write(arxiv_id + '\n')
    except Exception as e:
        logger.error(f"Error while appending to file: {e}")