Update arxiv_schlorly_research.py
Function definitions:
- fetch_arxiv_data: Fetches arXiv data based on a query.
- create_dataframe: Creates a DataFrame from the provided data.
- get_arxiv_main_content: Returns the main content of an arXiv paper.
- download_image: Downloads an image from a URL.
- scrape_images_from_arxiv: Scrapes images from an arXiv page.
- arxiv_bibtex: Generates the BibTeX entry for an arXiv paper.
- extract_arxiv_ids_from_line: Extracts arXiv IDs from a given line of text.
- read_written_ids: Reads already written arXiv IDs from a file.
- append_id_to_file: Appends a single arXiv ID to a file.

Step 2: Suggest code improvements

Code duplication:
- Combine similar functions: Functions such as fetch_arxiv_data and create_dataframe can be combined or refactored to reduce redundancy.
- Reuse code: Ensure common functionality is abstracted into reusable functions.

Performance and optimization:
- Optimize API calls: Ensure the arXiv API calls are optimized and handle rate limits.
- Efficient data handling: Use more efficient data handling techniques, such as batch processing for large datasets.

Coding standards and best practices:
- Add docstrings: Ensure all functions have detailed docstrings explaining their purpose, arguments, and return values.
- Error handling: Improve error handling to provide more informative error messages and handle different types of errors separately.
- Logging: Use a consistent logging strategy to log important events and errors.
- Code structure: Group related functions into classes or modules for better organization and maintainability.
- PEP 8 compliance: Ensure the code follows PEP 8 standards for Python code style.
This commit is contained in:
@@ -7,61 +7,59 @@
|
|||||||
####################################################
|
####################################################
|
||||||
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import arxiv
|
import arxiv
|
||||||
import PyPDF2
|
import PyPDF2
|
||||||
import io
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import urllib.parse
|
from urllib.parse import urlparse
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
# Reset the logger's handlers, then route all log records to stdout using a
# compact, colorized "LEVEL|file:line:function| message" layout.
logger.remove()
logger.add(
    sys.stdout,
    colorize=True,
    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
)
|
||||||
|
|
||||||
def fetch_arxiv_data(query, max_results=10):
    """
    Fetches arXiv data based on a query, sorted by submission date.

    Args:
        query (str): The search query.
        max_results (int): The maximum number of results to fetch.

    Returns:
        list: One ``[title, published, entry_id, summary, pdf_url]`` list per
            search result, in the order yielded by the arXiv client.

    Raises:
        Exception: Any error raised while querying arXiv is logged and then
            re-raised unchanged.
    """
    try:
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        return [
            [paper.title, paper.published, paper.entry_id, paper.summary, paper.pdf_url]
            for paper in client.results(search)
        ]
    except Exception as e:
        logger.error(f"An error occurred while fetching data from arXiv: {e}")
        # Bare ``raise`` re-raises the active exception with its traceback intact.
        raise
|
||||||
|
|
||||||
|
|
||||||
def create_dataframe(data, column_names):
    """
    Creates a DataFrame from the provided data.

    Args:
        data (list): The rows to convert to a DataFrame.
        column_names (list): The column names for the DataFrame.

    Returns:
        DataFrame: The created DataFrame, or an empty DataFrame (no rows, no
            columns) when construction fails; the failure is logged.
    """
    try:
        return pd.DataFrame(data, columns=column_names)
    except Exception as e:
        # Best-effort by design: callers get an empty frame instead of an error.
        logger.error(f"An error occurred while creating DataFrame: {e}")
        return pd.DataFrame()
|
||||||
|
|
||||||
|
|
||||||
def get_arxiv_main_content(url, timeout=30):
    """
    Returns the main content of an arXiv paper.

    The HTML page at ``url`` is tried first. If fetching or parsing it raises,
    the function falls back to ``get_pdf_content(url)``.

    Args:
        url (str): The URL of the arXiv paper page.
        timeout (float): Seconds to wait for the HTTP response before giving
            up on the HTML page and falling back to the PDF.

    Returns:
        str: The main content of the paper as a string. The literal
            ``"Main content not found."`` is returned when the page has no
            ``div`` with class ``ltx_page_content``.
    """
    try:
        # A timeout keeps a stalled connection from blocking forever; a
        # timeout error is handled like any other HTML failure below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        main_content = soup.find('div', class_='ltx_page_content')
        if not main_content:
            logger.warning("Main content not found in the page.")
            return "Main content not found."

        # Drop the alert banner so its text does not end up in the content.
        alert_section = main_content.find('div', class_='package-alerts ltx_document')
        if alert_section:
            alert_section.decompose()

        # Drop the abstract and author elements when present.
        for element_id in ["abs", "authors"]:
            element = main_content.find(id=element_id)
            if element:
                element.decompose()

        return main_content.text.strip()
    except Exception as html_error:
        # Could not use the HTML page; read the paper's PDF instead.
        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
        return get_pdf_content(url)
def get_pdf_content(url):
    """
    Helper function to get the content from a PDF if HTML content is not accessible.

    The arXiv ID is taken from the last path segment of ``url``. The paper is
    looked up through the arXiv client, its PDF is downloaded to
    ``downloaded-paper-<id>.pdf``, the text of every page is extracted, and
    the result is passed through ``clean_pdf_text``. The downloaded file is
    removed afterwards, whether or not extraction succeeded.

    Args:
        url (str): The URL of the arXiv paper.

    Returns:
        str: The main content of the paper as a string, or the literal
            ``"Failed to retrieve content."`` when any step fails.
    """
    pdf_filename = None
    try:
        # Strip a trailing slash so ".../abs/1234.5678/" still yields the ID.
        arxiv_id = url.rstrip('/').split('/')[-1]
        client = arxiv.Client()
        paper = next(client.results(arxiv.Search(id_list=[arxiv_id])))
        pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")

        page_texts = []
        with open(pdf_filename, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                try:
                    page_text = page.extract_text()
                except UnicodeDecodeError as err:
                    # Skip the unreadable page but keep the rest of the paper.
                    logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
                    continue
                if page_text:
                    page_texts.append(page_text + '\n')

        return clean_pdf_text(''.join(page_texts))
    except Exception as pdf_error:
        logger.error(f"Failed to process PDF: {pdf_error}")
        return "Failed to retrieve content."
    finally:
        # Clean up in ``finally`` so a failed read does not leave the
        # downloaded PDF behind.
        if pdf_filename and os.path.exists(pdf_filename):
            try:
                os.remove(pdf_filename)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove {pdf_filename}: {cleanup_error}")
def clean_pdf_text(text):
    """
    Helper function to clean the text extracted from a PDF.

    Two passes are applied:

    1. Everything from the first occurrence of "References" (any case) to the
       end of the text is removed. Note this matches the word anywhere, not
       only as a heading.
    2. Each "Acknowledgements" / "Bibliography" section (title matched in any
       case) is removed up to, but not including, the next newline that is
       followed by at least two uppercase letters, or up to the end of the
       text when there is none.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    text = re.sub(r'References\s*.*', '', text, flags=re.IGNORECASE | re.DOTALL)

    # "References" needs no second pass: step 1 already removed every
    # occurrence together with everything after it.
    for section in ('Acknowledgements', 'Bibliography'):
        # Case-insensitivity is scoped to the title with (?i:...). Applying
        # re.IGNORECASE globally would let [A-Z]{2,} match lowercase letters,
        # ending the removal at the first line that starts with two letters.
        pattern = r'((?i:' + re.escape(section) + r')\s*.*?)(?=\n[A-Z]{2,}|$)'
        text = re.sub(pattern, '', text, flags=re.DOTALL)

    return text
|
||||||
|
|
||||||
def download_image(image_url, base_url, folder="images", timeout=30):
    """
    Downloads an image from a URL.

    Args:
        image_url (str): The URL of the image. A URL without a scheme is
            resolved against ``base_url``.
        base_url (str): The base URL of the website; treated as a directory
            (a trailing ``/`` is added if missing) when resolving
            ``image_url``.
        folder (str): The folder to save the image. Created if missing.
        timeout (float): Seconds to wait for the HTTP response.

    Returns:
        bool: True if the image was downloaded successfully, False if the
            URL is a ``data:image`` URI, no file name can be derived from the
            URL, or the request fails.
    """
    # Local import: keeps the block independent of which urllib import style
    # the module header uses.
    from urllib.parse import urljoin, urlparse

    if image_url.startswith('data:image'):
        logger.info(f"Skipping download of data URI image: {image_url}")
        return False

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder, exist_ok=True)

    if not urlparse(image_url).scheme:
        if not base_url.endswith('/'):
            base_url += '/'
        # urljoin also handles root-relative ("/img/x.png") and
        # protocol-relative ("//host/x.png") sources, which plain string
        # concatenation turns into invalid URLs.
        image_url = urljoin(base_url, image_url)

    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()

        # Name the file after the URL path only, so a query string or
        # fragment does not end up in the file name.
        image_name = os.path.basename(urlparse(image_url).path)
        if not image_name:
            logger.error(f"Could not derive a file name from {image_url}")
            return False

        with open(os.path.join(folder, image_name), 'wb') as file:
            file.write(response.content)
        return True
    except requests.RequestException as e:
        logger.error(f"Error downloading {image_url}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
def scrape_images_from_arxiv(url, timeout=30):
    """
    Scrapes images from an arXiv page.

    Args:
        url (str): The URL of the arXiv page.
        timeout (float): Seconds to wait for the HTTP response.

    Returns:
        list: The ``src`` attribute of every ``<img>`` tag on the page that
            has one, in document order. An empty list is returned when the
            request fails (the error is logged).
    """
    try:
        # Without a timeout a stalled server would block this call forever;
        # a timeout surfaces as a RequestException and is handled below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        return [img['src'] for img in soup.find_all('img') if 'src' in img.attrs]
    except requests.RequestException as e:
        logger.error(f"Error fetching page {url}: {e}")
        return []
|
||||||
|
|
||||||
|
|
||||||
def arxiv_bibtex(arxiv_id):
    """
    Get the BibTeX entry for an arXiv paper.

    The paper's metadata is fetched from the arXiv export API and turned into
    an ``@MISC`` entry. The citation key is the first author's surname
    followed by the last two digits of the year of the ``updated`` timestamp.

    Args:
        arxiv_id: The arXiv ID of the paper.

    Returns:
        A string containing the BibTeX entry, or an empty string when the
        metadata cannot be fetched or parsed (the error is logged).
    """
    # Imported locally: the visible module-level imports do not bring
    # ``urllib.request`` or ``xml.dom.minidom`` into scope, and without them
    # every call would fail with NameError and return "".
    import urllib.request
    import xml.dom.minidom

    try:
        # The context manager closes the connection even if parsing fails.
        with urllib.request.urlopen(f'http://export.arxiv.org/api/query?id_list={arxiv_id}') as usock:
            xmldoc = xml.dom.minidom.parse(usock)

        entry = xmldoc.getElementsByTagName("entry")[0]
        date = entry.getElementsByTagName("updated")[0].firstChild.data
        text_year = date[:4]
        text_title = entry.getElementsByTagName("title")[0].firstChild.data.strip()

        authorlist = []
        surnames = []
        for person_name in entry.getElementsByTagName("author"):
            text_name = person_name.getElementsByTagName("name")[0].firstChild.data
            name_parts = text_name.split()
            # The last whitespace-separated token is treated as the surname.
            text_surname = name_parts[-1]
            text_given_name = ' '.join(name_parts[:-1])
            authorlist.append(f"{text_surname}, {text_given_name}")
            surnames.append(text_surname)

        # An entry without authors raises IndexError here and is reported
        # through the except branch below.
        text_first_author_surname = surnames[0]

        bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
        # Field values must be brace-delimited to be valid BibTeX.
        bibtex += f" author = {{{' and '.join(authorlist)}}},\n"
        bibtex += f" title = {{{text_title}}},\n"
        bibtex += f" year = {{{text_year}}},\n"
        bibtex += f" eprint = {{{arxiv_id}}},\n"
        bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
        bibtex += "}"
        return bibtex
    except Exception as e:
        logger.error(f"Error while generating BibTeX: {e}")
        return ""
|
|
||||||
|
|
||||||
|
|
||||||
#from serpapi import GoogleSearch
|
|
||||||
#params = {
|
|
||||||
# "api_key": "os.getenv(SERPER_API_KEY)",
|
|
||||||
# "engine": "google_scholar",
|
|
||||||
# "q": "llm",
|
|
||||||
# "hl": "en",
|
|
||||||
# "as_ylo": "2023",
|
|
||||||
# "as_yhi": "2024"
|
|
||||||
#}
|
|
||||||
#search = GoogleSearch(params)
|
|
||||||
#results = search.get_dict()
|
|
||||||
|
|
||||||
#from llmsherpa.readers import LayoutPDFReader
|
|
||||||
|
|
||||||
#llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
|
|
||||||
#pdf_url = "https://arxiv.org/pdf/1910.13461.pdf" # also allowed is a file path e.g. /home/downloads/xyz.pdf
|
|
||||||
#pdf_reader = LayoutPDFReader(llmsherpa_api_url)
|
|
||||||
#doc = pdf_reader.read_pdf(pdf_url)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extract_arxiv_ids_from_line(line):
    """
    Extract the arXiv ID from a given line of text.

    Only the first ``arxiv.org/abs/<digits>.<digits>`` URL in the line is
    considered; an optional version suffix such as ``v2`` is kept.

    Args:
        line (str): A line of text potentially containing an arXiv URL.

    Returns:
        str: The extracted arXiv ID, or None if not found.
    """
    arxiv_id_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?')
    match = arxiv_id_pattern.search(line)
    # Guard first: ``search`` returns None when the line has no arXiv URL.
    if match is None:
        return None
    # Group 2 (the version suffix) is None when absent.
    return match.group(1) + (match.group(2) or '')
|
||||||
|
|
||||||
|
|
||||||
def read_written_ids(file_path):
|
def read_written_ids(file_path):
|
||||||
"""
|
"""
|
||||||
Read already written arXiv IDs from a file.
|
Read already written arXiv IDs from a file.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path (str): Path to the file containing written IDs.
|
file_path (str): Path to the file containing written IDs.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
set: A set of arXiv IDs.
|
set: A set of arXiv IDs.
|
||||||
"""
|
"""
|
||||||
written_ids = set()
|
written_ids = set()
|
||||||
try:
|
try:
|
||||||
@@ -306,27 +273,22 @@ def read_written_ids(file_path):
|
|||||||
logger.error(f"Error while reading the file: {e}")
|
logger.error(f"Error while reading the file: {e}")
|
||||||
return written_ids
|
return written_ids
|
||||||
|
|
||||||
|
|
||||||
def append_id_to_file(arxiv_id, output_file_path):
    """
    Append a single arXiv ID to a file. Checks if the file exists and creates it if not.

    The ID is written on its own line. Any error is logged and swallowed, so
    this function never raises to the caller.

    Args:
        arxiv_id (str): The arXiv ID to append.
        output_file_path (str): Path to the output file.
    """
    try:
        # The existence check only selects the log message; the write itself
        # is the same in both cases.
        if os.path.exists(output_file_path):
            logger.info(f"Appending to existing file: {output_file_path}")
        else:
            logger.info(f"File does not exist. Creating new file: {output_file_path}")

        # Mode 'a' creates the file when it is missing and appends otherwise.
        with open(output_file_path, 'a', encoding="utf-8") as outfile:
            outfile.write(arxiv_id + '\n')
    except Exception as e:
        logger.error(f"Error while appending to file: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user