Files
ALwrity/lib/arxiv_schlorly_research.py
2024-01-13 16:27:28 +05:30

324 lines
10 KiB
Python

####################################################
#
# FIXME: Consider building on these libraries instead: https://github.com/monk1337/resp/tree/main
# https://github.com/danielnsilva/semanticscholar
# https://github.com/shauryr/S2QA
#
####################################################
import os
import sys
import re
import pandas as pd
import arxiv
import PyPDF2
import io
import requests
from bs4 import BeautifulSoup
import urllib.parse
from loguru import logger
# Logging setup: drop whatever handlers are currently registered, then send
# every record to stdout using a compact, colourised "LEVEL|file:line:function| message" layout.
logger.remove()
logger.add(
    sys.stdout,
    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
    colorize=True,
)
def fetch_arxiv_data(query, max_results=10):
    """
    Search arXiv and collect basic metadata for the matching papers.
    Args:
        query (str): The arXiv search query.
        max_results (int): Maximum number of results to request.
    Returns:
        list: One ``[title, published, entry_id, summary, pdf_url]`` list per
        result, with results sorted by submission date.
    Raises:
        Exception: Any error raised while querying arXiv is logged and then
        re-raised unchanged.
    """
    try:
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        # Consume the client's results directly; no intermediate list is needed.
        return [
            [result.title, result.published, result.entry_id, result.summary, result.pdf_url]
            for result in arxiv.Client().results(search)
        ]
    except Exception as e:
        # Report through the module logger (as the rest of this file does) and
        # use a bare ``raise`` so the original traceback is preserved.
        logger.error(f"An error occurred while fetching data from arXiv: {e}")
        raise
def create_dataframe(data, column_names):
    """
    Build a pandas DataFrame from row data.
    Args:
        data: Row data accepted by ``pd.DataFrame`` (e.g. a list of lists).
        column_names (list): Column labels for the frame.
    Returns:
        pd.DataFrame: The populated frame, or an empty DataFrame (no rows, no
        columns) when construction fails, e.g. because the row width does not
        match ``column_names``.
    """
    try:
        return pd.DataFrame(data, columns=column_names)
    except Exception as e:
        # Best-effort by design: log through the module logger and hand back
        # an empty frame instead of propagating the error.
        logger.error(f"An error occurred while creating DataFrame: {e}")
        return pd.DataFrame()
def get_arxiv_main_content(url, timeout=30):
    """
    Returns the main content of an arXiv paper.

    The HTML rendering at ``url`` is tried first. If it cannot be fetched or
    parsed, the paper's PDF is downloaded through the arXiv API, its text is
    extracted, and the reference/acknowledgement sections are stripped.
    Args:
        url (str): The URL of the arXiv paper. Its last path segment is used
            as the arXiv ID for the PDF fallback.
        timeout (float): Seconds to wait for the HTML request before falling
            back to the PDF.
    Returns:
        str: The main content of the paper as a string. The literal
        "Main content not found." is returned when the HTML page loads but has
        no ``ltx_page_content`` div, and "Failed to retrieve content." when
        the PDF fallback fails as well.
    """
    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, "html.parser")
        # The paper body lives in the 'ltx_page_content' div.
        main_content = soup.find('div', class_='ltx_page_content')
        if not main_content:
            logger.warning("Main content not found in the page.")
            return "Main content not found."
        # Remove the section with class 'package-alerts ltx_document'.
        alert_section = main_content.find('div', class_='package-alerts ltx_document')
        if alert_section:
            alert_section.decompose()
        # Remove abstract and authors if present.
        for element_id in ["abs", "authors"]:
            element = main_content.find(id=element_id)
            if element:
                element.decompose()
        return main_content.text.strip()
    except Exception as html_error:
        # Could not use the HTML rendering; fall through to the PDF below.
        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")

    try:
        # Tolerate a trailing slash, which would otherwise yield an empty ID.
        arxiv_id = url.rstrip('/').split('/')[-1]
        paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id])))
        pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
        try:
            pdf_text = _read_pdf_text(pdf_filename)
        finally:
            # Always clean up the temporary download, even if reading failed.
            os.remove(pdf_filename)
        return _strip_back_matter(pdf_text)
    except Exception as pdf_error:
        logger.error(f"Failed to process PDF: {pdf_error}")
        return "Failed to retrieve content."


def _read_pdf_text(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``."""
    page_texts = []
    with open(pdf_path, 'rb') as f:
        for page in PyPDF2.PdfReader(f).pages:
            try:
                page_text = page.extract_text()
            except UnicodeDecodeError as err:
                # FIXME: Handle any UnicodeDecodeError that arises during text extraction
                logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
                continue
            # Pages without extractable text are skipped.
            if page_text:
                page_texts.append(page_text + '\n')
    return ''.join(page_texts)


def _strip_back_matter(text):
    """Remove the reference list and acknowledgement section from paper text."""
    # Optional leading whitespace and section number ("7 " / "7. ").
    heading_prefix = r'^[ \t]*(?:\d+\.?[ \t]+)?'
    # Only a heading that stands alone on its line starts the reference list.
    # Matching the bare word anywhere would truncate the paper at the first
    # in-sentence mention of "references".
    text = re.sub(
        heading_prefix + r'(?:References|Bibliography)[ \t\r]*$.*',
        '',
        text,
        flags=re.IGNORECASE | re.MULTILINE | re.DOTALL,
    )
    # Acknowledgements run until the next ALL-CAPS title or the end of the
    # text. Case-insensitivity is scoped to the heading word so that the
    # upper-case lookahead really requires capitals.
    text = re.sub(
        heading_prefix + r'(?i:Acknowledge?ments?)[ \t\r]*$.*?(?=\n[A-Z]{2,}|\Z)',
        '',
        text,
        flags=re.MULTILINE | re.DOTALL,
    )
    return text
def download_image(image_url, base_url, folder="images", timeout=30):
    """
    Download one image into ``folder``.
    Args:
        image_url (str): Absolute or relative image URL. ``data:image`` URIs
            are skipped.
        base_url (str): Page URL that relative image URLs are resolved
            against; it is treated as a directory.
        folder (str): Destination directory, created if missing.
        timeout (float): Seconds to wait for the HTTP request.
    Returns:
        bool: True when the image was saved, False when it was skipped or the
        download or write failed.
    """
    # Skip downloading if the image URL is a data URI
    if image_url.startswith('data:image'):
        # Data URIs embed the whole image, so only a short prefix is logged.
        logger.info(f"Skipping download of data URI image: {image_url[:64]}...")
        return False
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)
    # Form the absolute URL for image paths
    if not urllib.parse.urlparse(image_url).scheme:
        directory_url = base_url if base_url.endswith('/') else base_url + '/'
        # urljoin also resolves root-relative ("/a.png") and protocol-relative
        # ("//host/a.png") sources, which plain concatenation got wrong.
        image_url = urllib.parse.urljoin(directory_url, image_url)
    # Download and save the image
    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        # Name the file after the URL path only (no query string); fall back
        # to a fixed name when the path ends in a slash.
        image_name = os.path.basename(urllib.parse.urlparse(image_url).path) or "image"
        with open(os.path.join(folder, image_name), 'wb') as file:
            file.write(response.content)
        return True
    except (requests.RequestException, OSError) as e:
        logger.error(f"Error downloading {image_url}: {str(e)}")
        return False
def scrape_images_from_arxiv(url, timeout=30):
    """
    Collect the ``src`` of every ``<img>`` tag on a page.
    Args:
        url (str): The page URL to fetch.
        timeout (float): Seconds to wait for the HTTP request.
    Returns:
        list: The ``src`` attribute values in document order, or an empty list
        when the page could not be fetched.
    """
    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Error fetching page {url}: {str(e)}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    # Tags without a src attribute are ignored.
    return [img['src'] for img in soup.find_all('img') if 'src' in img.attrs]
def arxiv_bibtex(arxiv_id):
    """
    Get the BibTeX entry for an arXiv paper.

    The paper's Atom record is fetched from the arXiv export API and turned
    into an ``@MISC`` entry keyed by the first author's surname plus the
    two-digit year of the record's ``updated`` date.
    Args:
        arxiv_id: The arXiv ID of the paper.
    Returns:
        A string containing the BibTeX entry.
    Raises:
        IndexError: If the API response contains no entry for ``arxiv_id`` or
            the entry lacks a required element.
        Exception: Network and XML parsing errors propagate unchanged.
    """
    import urllib.request
    import xml.dom.minidom

    def node_text(element):
        # Join the element's text children and collapse runs of whitespace;
        # the text may be wrapped across lines in the XML.
        raw = ''.join(
            child.data
            for child in element.childNodes
            if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
        )
        return ' '.join(raw.split())

    # Download the XML; the context manager closes the socket even when
    # parsing raises.
    query_url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
    with urllib.request.urlopen(query_url, timeout=30) as usock:
        xmldoc = xml.dom.minidom.parse(usock)

    # Parse the XML
    entries = xmldoc.getElementsByTagName("entry")
    if not entries:
        raise IndexError(f"No arXiv entry found for id {arxiv_id!r}")
    entry = entries[0]
    text_year = node_text(entry.getElementsByTagName("updated")[0])[:4]
    text_title = node_text(entry.getElementsByTagName("title")[0])

    authorlist = []
    text_first_author_surname = None
    for person_name in entry.getElementsByTagName("author"):
        name_nodes = person_name.getElementsByTagName("name")
        name_parts = node_text(name_nodes[0]).split() if name_nodes else []
        if not name_parts:
            continue
        # The last word is taken as the surname, everything before it as the
        # given name(s).
        text_surname = name_parts[-1]
        text_given_name = ' '.join(name_parts[:-1])
        # Single-word names are emitted as-is rather than as "Name, ".
        authorlist.append(
            f"{text_surname}, {text_given_name}" if text_given_name else text_surname
        )
        if text_first_author_surname is None:
            text_first_author_surname = text_surname
    if text_first_author_surname is None:
        # No usable author: still produce a citation key.
        text_first_author_surname = "Unknown"

    # Construct the BibTeX entry
    bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
    # The author value must be brace-delimited to be valid BibTeX.
    bibtex += f" author = {{{' and '.join(authorlist)}}},\n"
    bibtex += f" title = {{{text_title}}},\n"
    bibtex += f" year = {{{text_year}}},\n"
    bibtex += f" eprint = {{{arxiv_id}}},\n"
    bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
    bibtex += "}"
    return bibtex
#from serpapi import GoogleSearch
#params = {
# "api_key": "os.getenv(SERPER_API_KEY)",
# "engine": "google_scholar",
# "q": "llm",
# "hl": "en",
# "as_ylo": "2023",
# "as_yhi": "2024"
#}
#search = GoogleSearch(params)
#results = search.get_dict()
# Matches an arXiv abstract or PDF link and captures the paper ID (group 1)
# and the optional version suffix (group 2). Both new-style IDs ("2401.12345")
# and old-style IDs ("hep-th/9901001", "math.GT/0309136") are recognised.
_ARXIV_URL_ID_RE = re.compile(
    r'arxiv\.org/(?:abs|pdf)/'
    r'(\d+\.\d+|[a-z\-]+(?:\.[a-z\-]+)?/\d{7})'
    r'(v\d+)?',
    re.IGNORECASE,
)


def extract_arxiv_ids_from_line(line):
    """
    Extract the arXiv ID from a given line of text.

    Only the first arXiv link on the line is considered. The host name is
    matched case-insensitively and both ``/abs/`` and ``/pdf/`` links are
    accepted.
    Args:
        line (str): A line of text potentially containing an arXiv URL.
    Returns:
        str: The extracted arXiv ID, including its ``vN`` version suffix when
        the link has one, or None if not found.
    """
    match = _ARXIV_URL_ID_RE.search(line)
    if match is None:
        return None
    # Group 2 is None when the link carries no version suffix.
    return match.group(1) + (match.group(2) or '')
def read_written_ids(file_path):
    """
    Read already written arXiv IDs from a file.

    The file is expected to hold one ID per line. Surrounding whitespace is
    stripped and blank lines are ignored.
    Args:
        file_path (str): Path to the file containing written IDs.
    Returns:
        set: A set of arXiv IDs. The set is empty when the file does not
        exist, and holds whatever was read so far when reading fails midway.
    """
    written_ids = set()
    try:
        with open(file_path, 'r') as file:
            for line in file:
                arxiv_id = line.strip()
                # Skip blank lines so the empty string never counts as an ID.
                if arxiv_id:
                    written_ids.add(arxiv_id)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
    except Exception as e:
        logger.error(f"Error while reading the file: {e}")
    return written_ids
def append_id_to_file(arxiv_id, output_file_path):
    """
    Append a single arXiv ID, followed by a newline, to a file.

    Logs whether the file already existed. Any error raised while checking or
    writing is logged and suppressed, so the function never raises.
    Args:
        arxiv_id (str): The arXiv ID to append.
        output_file_path (str): Path to the output file.
    """
    try:
        # The existence check only decides which message is logged.
        if os.path.exists(output_file_path):
            logger.info(f"Appending to existing file: {output_file_path}")
        else:
            logger.info(f"File does not exist. Creating new file: {output_file_path}")
        # Mode 'a' creates a missing file and appends to an existing one.
        with open(output_file_path, 'a') as outfile:
            outfile.write(arxiv_id + '\n')
    except Exception as e:
        logger.error(f"Error while appending to file: {e}")