diff --git a/lib/ai_web_researcher/arxiv_schlorly_research.py b/lib/ai_web_researcher/arxiv_schlorly_research.py
index cb833b22..347b408f 100644
--- a/lib/ai_web_researcher/arxiv_schlorly_research.py
+++ b/lib/ai_web_researcher/arxiv_schlorly_research.py
@@ -7,61 +7,59 @@
####################################################
-import os
+import os
import sys
import re
-
import pandas as pd
import arxiv
import PyPDF2
-import io
import requests
from bs4 import BeautifulSoup
-import urllib.parse
+from urllib.parse import urlparse
from loguru import logger
# Reset the logger's handlers, then log to stdout only, using a compact
# "level|file:line:function| message" layout with colours enabled.
logger.remove()
logger.add(
    sys.stdout,
    format="{level}|{file}:{line}:{function}| {message}",
    colorize=True,
)
def fetch_arxiv_data(query, max_results=10):
    """
    Search arXiv and return the matching papers as plain rows.

    Args:
        query (str): The search query passed to arXiv.
        max_results (int): The maximum number of results to request.

    Returns:
        list: One ``[title, published, entry_id, summary, pdf_url]`` list per
        result, in the order the client yields them (the search is sorted by
        submission date).

    Raises:
        Exception: Any error raised during the search is logged and re-raised.
    """
    try:
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        rows = []
        for paper in arxiv.Client().results(search):
            rows.append([
                paper.title,
                paper.published,
                paper.entry_id,
                paper.summary,
                paper.pdf_url,
            ])
        return rows
    except Exception as e:
        logger.error(f"An error occurred while fetching data from arXiv: {e}")
        raise e
-
def create_dataframe(data, column_names):
    """
    Build a pandas DataFrame from row data.

    Args:
        data (list): The rows to load into the DataFrame.
        column_names (list): The column labels, one per field in each row.

    Returns:
        DataFrame: The populated DataFrame, or an empty DataFrame if
        construction fails (the error is logged, not raised).
    """
    try:
        frame = pd.DataFrame(data, columns=column_names)
    except Exception as e:
        logger.error(f"An error occurred while creating DataFrame: {e}")
        frame = pd.DataFrame()
    return frame
-
def get_arxiv_main_content(url):
"""
Returns the main content of an arXiv paper.
@@ -72,211 +70,181 @@ def get_arxiv_main_content(url):
Returns:
str: The main content of the paper as a string.
"""
-
try:
- # Send a GET request to the URL
response = requests.get(url)
- response.raise_for_status() # Raise an exception for HTTP errors
-
- # Parse the HTML content
+ response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
-
- # Find the main content in 'ltx_page_content'
main_content = soup.find('div', class_='ltx_page_content')
if not main_content:
logger.warning("Main content not found in the page.")
return "Main content not found."
-
- # Remove specific section with class 'package-alerts ltx_document'
alert_section = main_content.find('div', class_='package-alerts ltx_document')
- if alert_section:
+ if (alert_section):
alert_section.decompose()
-
- # Optional: Remove abstract and authors if present
for element_id in ["abs", "authors"]:
element = main_content.find(id=element_id)
- if element:
+ if (element):
element.decompose()
return main_content.text.strip()
-
- # Could not access the arxiv HTML content, instead download pdf and read its content.
except Exception as html_error:
logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
- try:
- # Extract arXiv ID from URL
- arxiv_id = url.split('/')[-1]
- # Fetch paper information using arXiv API
- paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id])))
- pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
- # Initialize an empty string to store the extracted text
- pdf_text = ''
+ return get_pdf_content(url)
- # Read the downloaded PDF
- with open(pdf_filename, 'rb', encoding="utf-8") as f:
- pdf_reader = PyPDF2.PdfReader(f)
def get_pdf_content(url):
    """
    Download an arXiv paper as PDF and return its cleaned text.

    Used as the fallback when the HTML rendering of a paper is not accessible.
    The arXiv ID is taken from the last path segment of ``url``, the PDF is
    downloaded to ``downloaded-paper-<id>.pdf``, its pages are extracted with
    PyPDF2, and the result is passed through ``clean_pdf_text``.

    Args:
        url (str): The URL of the arXiv paper.

    Returns:
        str: The main content of the paper, or the literal string
        "Failed to retrieve content." if any step fails (the error is logged).
    """
    try:
        client = arxiv.Client()
        arxiv_id = url.split('/')[-1]
        paper = next(client.results(arxiv.Search(id_list=[arxiv_id])))
        pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
        page_texts = []
        try:
            with open(pdf_filename, 'rb') as f:
                for page in PyPDF2.PdfReader(f).pages:
                    try:
                        page_text = page.extract_text()
                    except UnicodeDecodeError as err:
                        # Best effort: skip a page that cannot be decoded, keep the rest.
                        logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
                        continue
                    if page_text:
                        page_texts.append(page_text + '\n')
        finally:
            # Always remove the temporary download, even when reading the PDF
            # fails, so broken papers do not leave files behind.
            if os.path.exists(pdf_filename):
                os.remove(pdf_filename)
        return clean_pdf_text(''.join(page_texts))
    except Exception as pdf_error:
        logger.error(f"Failed to process PDF: {pdf_error}")
        return "Failed to retrieve content."
- return pdf_text
def clean_pdf_text(text):
    """
    Strip back-matter sections from text extracted from a PDF.

    Two case-insensitive passes are applied:

    1. Everything from the first occurrence of "References" to the end of
       the text is dropped.
    2. For each of "Acknowledgements", "References" and "Bibliography", the
       span starting at that word is removed up to (but not including) the
       next newline that is followed by two or more letters, or up to the
       end of the text.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    flags = re.IGNORECASE | re.DOTALL

    # NOTE(review): this match is not anchored to a heading, so an inline word
    # such as "cross-references" also truncates the text - confirm this is intended.
    cleaned = re.sub(r'References\s*.*', '', text, flags=flags)

    for heading in ('Acknowledgements', 'References', 'Bibliography'):
        # IGNORECASE also applies to [A-Z], so the lookahead stops at any line
        # that begins with two letters, not only at upper-case headings.
        section_pattern = r'(' + re.escape(heading) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
        cleaned = re.sub(section_pattern, '', cleaned, flags=flags)
    return cleaned
def download_image(image_url, base_url, folder="images"):
    """
    Download a single image into ``folder``.

    Args:
        image_url (str): Absolute or relative URL of the image.
        base_url (str): The base URL that relative image paths are resolved against.
        folder (str): The folder to save the image in; created if missing.

    Returns:
        bool: True if the image was downloaded and written successfully.
        False if it was skipped (data URI, or no file name in the URL) or if
        the download or the write failed; failures are logged.
    """
    # Imported locally so the block does not depend on how the module itself
    # imports urllib.parse.
    from urllib.parse import urljoin, urlparse

    if image_url.startswith('data:image'):
        logger.info(f"Skipping download of data URI image: {image_url}")
        return False

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)

    if not urlparse(image_url).scheme:
        if not base_url.endswith('/'):
            base_url += '/'
        # urljoin also resolves root-relative ("/x.png") and protocol-relative
        # ("//host/x.png") paths, which plain concatenation turned into broken URLs.
        image_url = urljoin(base_url, image_url)

    # Take the name from the URL path only, so a query string or fragment
    # does not end up in the file name.
    image_name = os.path.basename(urlparse(image_url).path)
    if not image_name:
        logger.error(f"Error downloading {image_url}: no file name in URL")
        return False

    try:
        # A timeout keeps one unresponsive host from blocking the caller forever.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        with open(os.path.join(folder, image_name), 'wb') as file:
            file.write(response.content)
        return True
    except (requests.RequestException, OSError) as e:
        logger.error(f"Error downloading {image_url}: {e}")
        return False
-
def scrape_images_from_arxiv(url):
    """
    Collect the ``src`` of every ``<img>`` tag on an arXiv page.

    Args:
        url (str): The URL of the arXiv page.

    Returns:
        list: The ``src`` attribute values in document order, exactly as they
        appear in the HTML. An empty list is returned if the page cannot be
        fetched (the error is logged).
    """
    try:
        page = requests.get(url)
        page.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Error fetching page {url}: {e}")
        return []

    soup = BeautifulSoup(page.text, 'html.parser')
    image_urls = []
    for img in soup.find_all('img'):
        # <img> tags without a src attribute are ignored.
        if 'src' in img.attrs:
            image_urls.append(img['src'])
    return image_urls
-
def arxiv_bibtex(arxiv_id):
    """
    Get the BibTeX entry for an arXiv paper.

    The paper's metadata is fetched from the arXiv export API and turned into
    an ``@MISC`` entry. The citation key is the first author's surname followed
    by the last two digits of the year taken from the entry's ``updated`` field.

    Args:
        arxiv_id: The arXiv ID of the paper.

    Returns:
        A string containing the BibTeX entry, or an empty string if the
        metadata could not be fetched or parsed (the error is logged).
    """
    # Imported here because the visible module-level imports do not provide
    # urllib.request or xml.dom.minidom; without them every call fails with a
    # NameError that the handler below would silently turn into "".
    import urllib.request
    import xml.dom.minidom

    try:
        # The context manager closes the connection even if parsing raises.
        with urllib.request.urlopen(f'http://export.arxiv.org/api/query?id_list={arxiv_id}') as usock:
            xmldoc = xml.dom.minidom.parse(usock)

        entry = xmldoc.getElementsByTagName("entry")[0]
        text_year = entry.getElementsByTagName("updated")[0].firstChild.data[:4]
        text_title = entry.getElementsByTagName("title")[0].firstChild.data.strip()

        authorlist = []
        surnames = []
        for person_name in entry.getElementsByTagName("author"):
            name_parts = person_name.getElementsByTagName("name")[0].firstChild.data.split()
            if not name_parts:
                continue
            # The last word is treated as the surname, everything before it
            # as the given names.
            *given_names, text_surname = name_parts
            surnames.append(text_surname)
            if given_names:
                authorlist.append(f"{text_surname}, {' '.join(given_names)}")
            else:
                authorlist.append(text_surname)

        # Fall back to a fixed key prefix when the entry lists no authors.
        key_prefix = surnames[0] if surnames else "arxiv"
        authors = ' and '.join(authorlist)

        bibtex = f"@MISC{{{key_prefix}{text_year[-2:]},\n"
        # The author value must be brace-delimited to be valid BibTeX.
        bibtex += f" author = {{{authors}}},\n"
        bibtex += f" title = {{{text_title}}},\n"
        bibtex += f" year = {{{text_year}}},\n"
        bibtex += f" eprint = {{{arxiv_id}}},\n"
        bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
        bibtex += "}"
        return bibtex
    except Exception as e:
        logger.error(f"Error while generating BibTeX: {e}")
        return ""
def extract_arxiv_ids_from_line(line):
"""
Extract the arXiv ID from a given line of text.
Args:
- line (str): A line of text potentially containing an arXiv URL.
+ line (str): A line of text potentially containing an arXiv URL.
Returns:
- str: The extracted arXiv ID, or None if not found.
+ str: The extracted arXiv ID, or None if not found.
"""
arxiv_id_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?')
match = arxiv_id_pattern.search(line)
@@ -284,16 +252,15 @@ def extract_arxiv_ids_from_line(line):
return match.group(1) + (match.group(2) if match.group(2) else '')
return None
-
def read_written_ids(file_path):
"""
Read already written arXiv IDs from a file.
Args:
- file_path (str): Path to the file containing written IDs.
+ file_path (str): Path to the file containing written IDs.
Returns:
- set: A set of arXiv IDs.
+ set: A set of arXiv IDs.
"""
written_ids = set()
try:
@@ -306,27 +273,22 @@ def read_written_ids(file_path):
logger.error(f"Error while reading the file: {e}")
return written_ids
-
def append_id_to_file(arxiv_id, output_file_path):
    """
    Append one arXiv ID, followed by a newline, to a file.

    The file is opened in append mode, so it is created when it does not
    exist yet; which of the two cases applies is logged beforehand. Errors
    are logged and not raised.

    Args:
        arxiv_id (str): The arXiv ID to append.
        output_file_path (str): Path to the output file.
    """
    try:
        if os.path.exists(output_file_path):
            logger.info(f"Appending to existing file: {output_file_path}")
        else:
            logger.info(f"File does not exist. Creating new file: {output_file_path}")
        # Append mode covers both cases: it creates a missing file and
        # extends an existing one.
        with open(output_file_path, 'a', encoding="utf-8") as outfile:
            outfile.write(arxiv_id + '\n')
    except Exception as e:
        logger.error(f"Error while appending to file: {e}")