Update arxiv_schlorly_research.py

Function Definitions:
fetch_arxiv_data: Fetches arXiv data based on a query.
create_dataframe: Creates a DataFrame from the provided data.
get_arxiv_main_content: Returns the main content of an arXiv paper.
download_image: Downloads an image from a URL.
scrape_images_from_arxiv: Scrapes images from an arXiv page.
arxiv_bibtex: Generates the BibTeX entry for an arXiv paper.
extract_arxiv_ids_from_line: Extracts arXiv IDs from a given line of text.
read_written_ids: Reads already written arXiv IDs from a file.
append_id_to_file: Appends a single arXiv ID to a file.
Suggested Code Improvements:
Code Duplication:
Combine Similar Functions: Functions such as fetch_arxiv_data and create_dataframe can be combined or refactored to reduce redundancy.
Reuse Code: Ensure common functionality is abstracted into reusable functions.
Performance and Optimization:
Optimize API Calls: Ensure the arXiv API calls are optimized and handle rate limits.
Efficient Data Handling: Use more efficient data handling techniques, such as batch processing for large datasets.
Coding Standards and Best Practices:
Add Docstrings: Ensure all functions have detailed docstrings explaining their purpose, arguments, and return values.
Error Handling: Improve error handling to provide more informative error messages and handle different types of errors separately.
Logging: Use a consistent logging strategy to log important events and errors.
Code Structure: Group related functions into classes or modules for better organization and maintainability.
PEP 8 Compliance: Ensure the code follows PEP 8 standards for Python code style.
This commit is contained in:
ي
2025-01-18 09:58:58 +05:30
parent f75375eaaa
commit 3653bd4e80

View File

@@ -7,61 +7,59 @@
####################################################
import os
import os
import sys
import re
import pandas as pd
import arxiv
import PyPDF2
import io
import requests
from bs4 import BeautifulSoup
import urllib.parse
from urllib.parse import urlparse
from loguru import logger
# Reset the logger's handlers, then register a single colorized stdout sink
# whose lines read "<LEVEL>|<file>:<line>:<function>| <message>".
logger.remove()
logger.add(
    sys.stdout,
    colorize=True,
    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
)
def fetch_arxiv_data(query, max_results=10):
    """
    Fetches arXiv data based on a query.

    The search is issued with ``arxiv.SortCriterion.SubmittedDate`` as the
    sort criterion.

    Args:
        query (str): The search query.
        max_results (int): The maximum number of results to fetch.

    Returns:
        list: One ``[title, published, entry_id, summary, pdf_url]`` list per
            result returned by the arXiv client.

    Raises:
        Exception: Any error raised while querying arXiv is logged and then
            re-raised unchanged.
    """
    try:
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        return [
            [result.title, result.published, result.entry_id, result.summary, result.pdf_url]
            for result in client.results(search)
        ]
    except Exception as e:
        logger.error(f"An error occurred while fetching data from arXiv: {e}")
        # Bare raise keeps the original traceback intact for the caller.
        raise
def create_dataframe(data, column_names):
    """
    Creates a DataFrame from the provided data.

    Args:
        data (list): The data to convert to a DataFrame.
        column_names (list): The column names for the DataFrame.

    Returns:
        DataFrame: The created DataFrame, or an empty DataFrame if
            construction fails (the error is logged, not raised).
    """
    try:
        return pd.DataFrame(data, columns=column_names)
    except Exception as e:
        logger.error(f"An error occurred while creating DataFrame: {e}")
        return pd.DataFrame()
def get_arxiv_main_content(url, timeout=30):
    """
    Returns the main content of an arXiv paper.

    The page at ``url`` is fetched and the text of its
    ``<div class="ltx_page_content">`` element is returned, after removing the
    ``package-alerts ltx_document`` block and the elements with ids ``abs``
    and ``authors`` when they are present. If anything in that path raises
    (network error, HTTP error status, parsing error), the function falls
    back to ``get_pdf_content(url)``.

    Args:
        url (str): The URL of the arXiv page to read.
        timeout (float): Seconds to wait for the HTTP response before giving
            up and falling back to the PDF.

    Returns:
        str: The main content of the paper as a string. The literal
            ``"Main content not found."`` is returned when the page loads but
            has no ``ltx_page_content`` element.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        main_content = soup.find('div', class_='ltx_page_content')
        if not main_content:
            logger.warning("Main content not found in the page.")
            return "Main content not found."

        alert_section = main_content.find('div', class_='package-alerts ltx_document')
        if alert_section:
            alert_section.decompose()

        # Strip the abstract and author blocks so only the body text remains.
        for element_id in ["abs", "authors"]:
            element = main_content.find(id=element_id)
            if element:
                element.decompose()

        return main_content.text.strip()
    except Exception as html_error:
        # The HTML rendering could not be used; read the PDF instead.
        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
        return get_pdf_content(url)
def get_pdf_content(url):
    """
    Helper function to get the content from a PDF if HTML content is not accessible.

    The last path segment of ``url`` is used as the arXiv ID. The paper is
    looked up through the arXiv client, its PDF is downloaded to
    ``downloaded-paper-<id>.pdf``, the text of every page is extracted, and
    the result is passed through ``clean_pdf_text``. The downloaded file is
    removed afterwards, including when reading or cleaning fails.

    Args:
        url (str): The URL of the arXiv paper.

    Returns:
        str: The main content of the paper as a string, or the literal
            ``"Failed to retrieve content."`` if any step fails.
    """
    pdf_filename = None
    try:
        # rstrip so a trailing slash does not yield an empty ID.
        arxiv_id = url.rstrip('/').split('/')[-1]
        client = arxiv.Client()
        paper = next(client.results(arxiv.Search(id_list=[arxiv_id])))
        pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")

        page_texts = []
        with open(pdf_filename, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                try:
                    page_text = page.extract_text()
                except UnicodeDecodeError as err:
                    # Skip the unreadable page but keep the rest of the paper.
                    logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
                    continue
                if page_text:
                    page_texts.append(page_text + '\n')

        return clean_pdf_text(''.join(page_texts))
    except Exception as pdf_error:
        logger.error(f"Failed to process PDF: {pdf_error}")
        return "Failed to retrieve content."
    finally:
        # Always clean up the temporary download; a failed cleanup must not
        # mask the text that was already extracted.
        if pdf_filename and os.path.exists(pdf_filename):
            try:
                os.remove(pdf_filename)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove {pdf_filename}: {cleanup_error}")
def clean_pdf_text(text):
    """
    Helper function to clean the text extracted from a PDF.

    Two passes are applied, both case-insensitive and with ``.`` matching
    newlines:

    1. Everything from the first occurrence of ``References`` to the end of
       the text is removed.
    2. For each of ``Acknowledgements``, ``References`` and ``Bibliography``,
       the title and the text after it are removed up to (not including) the
       next newline that is followed by two or more letters, or up to the end
       of the text.

    NOTE(review): because matching is case-insensitive, pass 1 also truncates
    at an in-sentence "references", and ``[A-Z]{2,}`` in pass 2 matches
    lower-case letters too. Confirm whether that is intended.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    flags = re.DOTALL | re.IGNORECASE

    text = re.sub(r'References\s*.*', '', text, flags=flags)

    for section in ('Acknowledgements', 'References', 'Bibliography'):
        pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
        text = re.sub(pattern, '', text, flags=flags)

    return text
def download_image(image_url, base_url, folder="images", timeout=30):
    """
    Downloads an image from a URL.

    ``data:image`` URIs are skipped. A URL without a scheme is resolved
    against ``base_url`` (treated as a directory, so a trailing slash is
    added when missing). The file is saved in ``folder`` under the last path
    segment of the resolved URL; query strings are not part of the file name.

    Args:
        image_url (str): The URL of the image.
        base_url (str): The base URL of the website.
        folder (str): The folder to save the image. Created if missing.
        timeout (float): Seconds to wait for the HTTP response.

    Returns:
        bool: True if the image was downloaded successfully, False otherwise.
    """
    if image_url.startswith('data:image'):
        logger.info(f"Skipping download of data URI image: {image_url}")
        return False

    os.makedirs(folder, exist_ok=True)

    if not urllib.parse.urlparse(image_url).scheme:
        if not base_url.endswith('/'):
            base_url += '/'
        # urljoin also handles root-relative ("/x.png") and
        # protocol-relative ("//host/x.png") sources correctly.
        image_url = urllib.parse.urljoin(base_url, image_url)

    image_name = os.path.basename(urllib.parse.urlparse(image_url).path)
    if not image_name:
        # A URL ending in "/" has no file name to save under.
        logger.error(f"Error downloading {image_url}: no file name in URL")
        return False

    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        with open(os.path.join(folder, image_name), 'wb') as file:
            file.write(response.content)
        return True
    except requests.RequestException as e:
        logger.error(f"Error downloading {image_url}: {e}")
        return False
def scrape_images_from_arxiv(url, timeout=30):
    """
    Scrapes images from an arXiv page.

    Args:
        url (str): The URL of the arXiv page.
        timeout (float): Seconds to wait for the HTTP response.

    Returns:
        list: The ``src`` value of every ``<img>`` tag on the page that has
            one, in document order. An empty list is returned when the page
            cannot be fetched (the error is logged).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return [img['src'] for img in soup.find_all('img') if 'src' in img.attrs]
    except requests.RequestException as e:
        logger.error(f"Error fetching page {url}: {e}")
        return []
def arxiv_bibtex(arxiv_id, timeout=30):
    """
    Get the BibTeX entry for an arXiv paper.

    The paper's Atom entry is fetched from the arXiv export API and turned
    into an ``@MISC`` record. The citation key is the first author's surname
    followed by the last two digits of the year taken from the entry's
    ``updated`` field. Author names are written as ``Surname, Given`` and
    joined with ``and``.

    Args:
        arxiv_id: The arXiv ID of the paper.
        timeout: Seconds to wait for the API response.

    Returns:
        A string containing the BibTeX entry, or an empty string if the entry
        could not be fetched or parsed (the error is logged).
    """
    import urllib.request
    import xml.dom.minidom

    try:
        # The context manager closes the socket even if parsing fails.
        with urllib.request.urlopen(
            f'http://export.arxiv.org/api/query?id_list={arxiv_id}', timeout=timeout
        ) as usock:
            xmldoc = xml.dom.minidom.parse(usock)

        entry = xmldoc.getElementsByTagName("entry")[0]
        date = entry.getElementsByTagName("updated")[0].firstChild.data
        text_year = date[:4]
        title = entry.getElementsByTagName("title")[0]
        # Collapse internal line breaks/indentation so the title is one line.
        text_title = ' '.join(title.firstChild.data.split())

        authorlist = []
        text_first_author_surname = None
        for person_name in entry.getElementsByTagName("author"):
            name = person_name.getElementsByTagName("name")[0]
            name_parts = name.firstChild.data.split()
            text_surname = name_parts[-1]
            text_given_name = ' '.join(name_parts[:-1])
            # Single-word names have no given name; avoid a dangling comma.
            if text_given_name:
                authorlist.append(f"{text_surname}, {text_given_name}")
            else:
                authorlist.append(text_surname)
            if text_first_author_surname is None:
                text_first_author_surname = text_surname

        if text_first_author_surname is None:
            raise ValueError(f"no authors found for arXiv ID {arxiv_id}")

        text_authors = ' and '.join(authorlist)
        bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
        # Field values must be brace-delimited to be valid BibTeX.
        bibtex += f" author = {{{text_authors}}},\n"
        bibtex += f" title = {{{text_title}}},\n"
        bibtex += f" year = {{{text_year}}},\n"
        bibtex += f" eprint = {{{arxiv_id}}},\n"
        bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
        bibtex += "}"
        return bibtex
    except Exception as e:
        logger.error(f"Error while generating BibTeX: {e}")
        return ""
def extract_arxiv_ids_from_line(line):
    """
    Extract the arXiv ID from a given line of text.

    Only the first ``arxiv.org/abs/<digits>.<digits>`` occurrence in the line
    is considered; a trailing version suffix such as ``v2`` is kept.

    Args:
        line (str): A line of text potentially containing an arXiv URL.

    Returns:
        str: The extracted arXiv ID, or None if not found.
    """
    arxiv_id_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?')
    match = arxiv_id_pattern.search(line)
    if match:
        # group(2) is None when the URL carries no version suffix.
        return match.group(1) + (match.group(2) or '')
    return None
def read_written_ids(file_path):
"""
Read already written arXiv IDs from a file.
Args:
file_path (str): Path to the file containing written IDs.
file_path (str): Path to the file containing written IDs.
Returns:
set: A set of arXiv IDs.
set: A set of arXiv IDs.
"""
written_ids = set()
try:
@@ -306,27 +273,22 @@ def read_written_ids(file_path):
logger.error(f"Error while reading the file: {e}")
return written_ids
def append_id_to_file(arxiv_id, output_file_path):
    """
    Append a single arXiv ID to a file. Checks if the file exists and creates it if not.

    The ID is written on its own line, UTF-8 encoded. Errors are logged
    rather than raised.

    Args:
        arxiv_id (str): The arXiv ID to append.
        output_file_path (str): Path to the output file.
    """
    try:
        # The existence check only selects the log message; append mode
        # creates the file when it is missing.
        if not os.path.exists(output_file_path):
            logger.info(f"File does not exist. Creating new file: {output_file_path}")
        else:
            logger.info(f"Appending to existing file: {output_file_path}")
        with open(output_file_path, 'a', encoding="utf-8") as outfile:
            outfile.write(arxiv_id + '\n')
    except Exception as e:
        logger.error(f"Error while appending to file: {e}")