ALwrity Version 0.5.1 (Fastapi + React)
This commit is contained in:
980
ToBeMigrated/ai_web_researcher/arxiv_schlorly_research.py
Normal file
980
ToBeMigrated/ai_web_researcher/arxiv_schlorly_research.py
Normal file
@@ -0,0 +1,980 @@
|
||||
####################################################
|
||||
#
|
||||
# FIXME: Gotta use this lib: https://github.com/monk1337/resp/tree/main
|
||||
# https://github.com/danielnsilva/semanticscholar
|
||||
# https://github.com/shauryr/S2QA
|
||||
#
|
||||
####################################################
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import pandas as pd
|
||||
import arxiv
|
||||
import PyPDF2
|
||||
import requests
|
||||
import networkx as nx
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
from ..gpt_providers.text_generation.main_text_generation import llm_text_gen
|
||||
import bibtexparser
|
||||
from pylatexenc.latex2text import LatexNodes2Text
|
||||
from matplotlib import pyplot as plt
|
||||
from collections import defaultdict
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from sklearn.cluster import KMeans
|
||||
import numpy as np
|
||||
|
||||
logger.remove()
|
||||
logger.add(sys.stdout, colorize=True, format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}")
|
||||
|
||||
def create_arxiv_client(page_size=100, delay_seconds=3.0, num_retries=3):
|
||||
"""
|
||||
Creates a reusable arXiv API client with custom configuration.
|
||||
|
||||
Args:
|
||||
page_size (int): Number of results per page (default: 100)
|
||||
delay_seconds (float): Delay between API requests (default: 3.0)
|
||||
num_retries (int): Number of retries for failed requests (default: 3)
|
||||
|
||||
Returns:
|
||||
arxiv.Client: Configured arXiv API client
|
||||
"""
|
||||
try:
|
||||
client = arxiv.Client(
|
||||
page_size=page_size,
|
||||
delay_seconds=delay_seconds,
|
||||
num_retries=num_retries
|
||||
)
|
||||
return client
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating arXiv client: {e}")
|
||||
raise e
|
||||
|
||||
def expand_search_query(query, research_interests=None):
|
||||
"""
|
||||
Uses AI to expand the search query based on user's research interests.
|
||||
|
||||
Args:
|
||||
query (str): Original search query
|
||||
research_interests (list): List of user's research interests
|
||||
|
||||
Returns:
|
||||
str: Expanded search query
|
||||
"""
|
||||
try:
|
||||
interests_context = "\n".join(research_interests) if research_interests else ""
|
||||
prompt = f"""Given the original arXiv search query: '{query}'
|
||||
{f'And considering these research interests:\n{interests_context}' if interests_context else ''}
|
||||
Generate an expanded arXiv search query that:
|
||||
1. Includes relevant synonyms and related concepts
|
||||
2. Uses appropriate arXiv search operators (AND, OR, etc.)
|
||||
3. Incorporates field-specific tags (ti:, abs:, au:, etc.)
|
||||
4. Maintains focus on the core topic
|
||||
Return only the expanded query without any explanation."""
|
||||
|
||||
expanded_query = llm_text_gen(prompt)
|
||||
logger.info(f"Expanded query: {expanded_query}")
|
||||
return expanded_query
|
||||
except Exception as e:
|
||||
logger.error(f"Error expanding search query: {e}")
|
||||
return query
|
||||
|
||||
def analyze_citation_network(papers):
|
||||
"""
|
||||
Analyzes citation relationships between papers using DOIs and references.
|
||||
|
||||
Args:
|
||||
papers (list): List of paper metadata dictionaries
|
||||
|
||||
Returns:
|
||||
dict: Citation network analysis results
|
||||
"""
|
||||
try:
|
||||
# Create a directed graph for citations
|
||||
G = nx.DiGraph()
|
||||
|
||||
# Add nodes and edges
|
||||
for paper in papers:
|
||||
paper_id = paper['entry_id']
|
||||
G.add_node(paper_id, title=paper['title'])
|
||||
|
||||
# Add edges based on DOIs and references
|
||||
if paper['doi']:
|
||||
for other_paper in papers:
|
||||
if other_paper['doi'] and other_paper['doi'] in paper['summary']:
|
||||
G.add_edge(paper_id, other_paper['entry_id'])
|
||||
|
||||
# Calculate network metrics
|
||||
analysis = {
|
||||
'influential_papers': sorted(nx.pagerank(G).items(), key=lambda x: x[1], reverse=True),
|
||||
'citation_clusters': list(nx.connected_components(G.to_undirected())),
|
||||
'citation_paths': dict(nx.all_pairs_shortest_path_length(G))
|
||||
}
|
||||
return analysis
|
||||
except Exception as e:
|
||||
logger.error(f"Error analyzing citation network: {e}")
|
||||
return {}
|
||||
|
||||
def categorize_papers(papers):
|
||||
"""
|
||||
Uses AI to categorize papers based on their metadata and content.
|
||||
|
||||
Args:
|
||||
papers (list): List of paper metadata dictionaries
|
||||
|
||||
Returns:
|
||||
dict: Paper categorization results
|
||||
"""
|
||||
try:
|
||||
categorized_papers = {}
|
||||
for paper in papers:
|
||||
prompt = f"""Analyze this research paper and provide detailed categorization:
|
||||
Title: {paper['title']}
|
||||
Abstract: {paper['summary']}
|
||||
Primary Category: {paper['primary_category']}
|
||||
Categories: {', '.join(paper['categories'])}
|
||||
|
||||
Provide a JSON response with these fields:
|
||||
1. main_theme: Primary research theme
|
||||
2. sub_themes: List of related sub-themes
|
||||
3. methodology: Research methodology used
|
||||
4. application_domains: Potential application areas
|
||||
5. technical_complexity: Level (Basic/Intermediate/Advanced)"""
|
||||
|
||||
categorization = llm_text_gen(prompt)
|
||||
categorized_papers[paper['entry_id']] = categorization
|
||||
|
||||
return categorized_papers
|
||||
except Exception as e:
|
||||
logger.error(f"Error categorizing papers: {e}")
|
||||
return {}
|
||||
|
||||
def get_paper_recommendations(papers, research_interests):
|
||||
"""
|
||||
Generates personalized paper recommendations based on user's research interests.
|
||||
|
||||
Args:
|
||||
papers (list): List of paper metadata dictionaries
|
||||
research_interests (list): User's research interests
|
||||
|
||||
Returns:
|
||||
dict: Personalized paper recommendations
|
||||
"""
|
||||
try:
|
||||
interests_text = "\n".join(research_interests)
|
||||
recommendations = {}
|
||||
|
||||
for paper in papers:
|
||||
prompt = f"""Evaluate this paper's relevance to the user's research interests:
|
||||
Paper:
|
||||
- Title: {paper['title']}
|
||||
- Abstract: {paper['summary']}
|
||||
- Categories: {', '.join(paper['categories'])}
|
||||
|
||||
User's Research Interests:
|
||||
{interests_text}
|
||||
|
||||
Provide a JSON response with:
|
||||
1. relevance_score: 0-100
|
||||
2. relevance_aspects: List of matching aspects
|
||||
3. potential_value: How this paper could benefit the user's research"""
|
||||
|
||||
evaluation = llm_text_gen(prompt)
|
||||
recommendations[paper['entry_id']] = evaluation
|
||||
|
||||
return recommendations
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating paper recommendations: {e}")
|
||||
return {}
|
||||
|
||||
def fetch_arxiv_data(query, max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate, sort_order=None, client=None, research_interests=None):
|
||||
"""
|
||||
Fetches arXiv data based on a query with advanced search options.
|
||||
|
||||
Args:
|
||||
query (str): The search query (supports advanced syntax, e.g., 'au:einstein AND cat:physics')
|
||||
max_results (int): The maximum number of results to fetch
|
||||
sort_by (arxiv.SortCriterion): Sorting criterion (default: SubmittedDate)
|
||||
sort_order (str): Sort order ('ascending' or 'descending', default: None)
|
||||
client (arxiv.Client): Optional custom client (default: None, creates new client)
|
||||
|
||||
Returns:
|
||||
list: A list of arXiv data with extended metadata
|
||||
"""
|
||||
try:
|
||||
if client is None:
|
||||
client = create_arxiv_client()
|
||||
|
||||
# Expand search query using AI if research interests are provided
|
||||
expanded_query = expand_search_query(query, research_interests) if research_interests else query
|
||||
logger.info(f"Using expanded query: {expanded_query}")
|
||||
|
||||
search = arxiv.Search(
|
||||
query=expanded_query,
|
||||
max_results=max_results,
|
||||
sort_by=sort_by,
|
||||
sort_order=sort_order
|
||||
)
|
||||
|
||||
results = list(client.results(search))
|
||||
all_data = [
|
||||
{
|
||||
'title': result.title,
|
||||
'published': result.published,
|
||||
'updated': result.updated,
|
||||
'entry_id': result.entry_id,
|
||||
'summary': result.summary,
|
||||
'authors': [str(author) for author in result.authors],
|
||||
'pdf_url': result.pdf_url,
|
||||
'journal_ref': getattr(result, 'journal_ref', None),
|
||||
'doi': getattr(result, 'doi', None),
|
||||
'primary_category': getattr(result, 'primary_category', None),
|
||||
'categories': getattr(result, 'categories', []),
|
||||
'links': [link.href for link in getattr(result, 'links', [])]
|
||||
}
|
||||
for result in results
|
||||
]
|
||||
|
||||
# Enhance results with AI-powered analysis
|
||||
if all_data:
|
||||
# Analyze citation network
|
||||
citation_analysis = analyze_citation_network(all_data)
|
||||
|
||||
# Categorize papers using AI
|
||||
paper_categories = categorize_papers(all_data)
|
||||
|
||||
# Generate recommendations if research interests are provided
|
||||
recommendations = get_paper_recommendations(all_data, research_interests) if research_interests else {}
|
||||
|
||||
# Perform content analysis
|
||||
content_analyses = [analyze_paper_content(paper['entry_id']) for paper in all_data]
|
||||
trend_analysis = analyze_research_trends(all_data)
|
||||
concept_mapping = map_cross_paper_concepts(all_data)
|
||||
|
||||
# Generate bibliography data
|
||||
bibliography_data = {
|
||||
'bibtex_entries': [generate_bibtex_entry(paper) for paper in all_data],
|
||||
'citations': {
|
||||
'apa': [convert_citation_format(generate_bibtex_entry(paper), 'apa') for paper in all_data],
|
||||
'mla': [convert_citation_format(generate_bibtex_entry(paper), 'mla') for paper in all_data],
|
||||
'chicago': [convert_citation_format(generate_bibtex_entry(paper), 'chicago') for paper in all_data]
|
||||
},
|
||||
'reference_graph': visualize_reference_graph(all_data),
|
||||
'citation_impact': analyze_citation_impact(all_data)
|
||||
}
|
||||
|
||||
# Add enhanced data to results
|
||||
enhanced_data = {
|
||||
'papers': all_data,
|
||||
'citation_analysis': citation_analysis,
|
||||
'paper_categories': paper_categories,
|
||||
'recommendations': recommendations,
|
||||
'content_analyses': content_analyses,
|
||||
'trend_analysis': trend_analysis,
|
||||
'concept_mapping': concept_mapping,
|
||||
'bibliography': bibliography_data
|
||||
}
|
||||
return enhanced_data
|
||||
|
||||
return {'papers': all_data}
|
||||
except Exception as e:
|
||||
logger.error(f"An error occurred while fetching data from arXiv: {e}")
|
||||
raise e
|
||||
|
||||
def create_dataframe(data, column_names):
|
||||
"""
|
||||
Creates a DataFrame from the provided data.
|
||||
|
||||
Args:
|
||||
data (list): The data to convert to a DataFrame.
|
||||
column_names (list): The column names for the DataFrame.
|
||||
|
||||
Returns:
|
||||
DataFrame: The created DataFrame.
|
||||
"""
|
||||
try:
|
||||
df = pd.DataFrame(data, columns=column_names)
|
||||
return df
|
||||
except Exception as e:
|
||||
logger.error(f"An error occurred while creating DataFrame: {e}")
|
||||
return pd.DataFrame()
|
||||
|
||||
def get_arxiv_main_content(url):
|
||||
"""
|
||||
Returns the main content of an arXiv paper.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the arXiv paper.
|
||||
|
||||
Returns:
|
||||
str: The main content of the paper as a string.
|
||||
"""
|
||||
try:
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
main_content = soup.find('div', class_='ltx_page_content')
|
||||
if not main_content:
|
||||
logger.warning("Main content not found in the page.")
|
||||
return "Main content not found."
|
||||
alert_section = main_content.find('div', class_='package-alerts ltx_document')
|
||||
if (alert_section):
|
||||
alert_section.decompose()
|
||||
for element_id in ["abs", "authors"]:
|
||||
element = main_content.find(id=element_id)
|
||||
if (element):
|
||||
element.decompose()
|
||||
return main_content.text.strip()
|
||||
except Exception as html_error:
|
||||
logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
|
||||
return get_pdf_content(url)
|
||||
|
||||
def download_paper(paper_id, output_dir="downloads", filename=None, get_source=False):
|
||||
"""
|
||||
Downloads a paper's PDF or source files with enhanced error handling.
|
||||
|
||||
Args:
|
||||
paper_id (str): The arXiv ID of the paper
|
||||
output_dir (str): Directory to save the downloaded file (default: 'downloads')
|
||||
filename (str): Custom filename (default: None, uses paper ID)
|
||||
get_source (bool): If True, downloads source files instead of PDF (default: False)
|
||||
|
||||
Returns:
|
||||
str: Path to the downloaded file or None if download fails
|
||||
"""
|
||||
try:
|
||||
# Create output directory if it doesn't exist
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# Get paper metadata
|
||||
client = create_arxiv_client()
|
||||
paper = next(client.results(arxiv.Search(id_list=[paper_id])))
|
||||
|
||||
# Set filename if not provided
|
||||
if not filename:
|
||||
safe_title = re.sub(r'[^\w\-_.]', '_', paper.title[:50])
|
||||
filename = f"{paper_id}_{safe_title}"
|
||||
filename += ".tar.gz" if get_source else ".pdf"
|
||||
|
||||
# Full path for the downloaded file
|
||||
file_path = os.path.join(output_dir, filename)
|
||||
|
||||
# Download the file
|
||||
if get_source:
|
||||
paper.download_source(dirpath=output_dir, filename=filename)
|
||||
else:
|
||||
paper.download_pdf(dirpath=output_dir, filename=filename)
|
||||
|
||||
logger.info(f"Successfully downloaded {'source' if get_source else 'PDF'} to {file_path}")
|
||||
return file_path
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading {'source' if get_source else 'PDF'} for {paper_id}: {e}")
|
||||
return None
|
||||
|
||||
def analyze_paper_content(url_or_id, cleanup=True):
|
||||
"""
|
||||
Analyzes paper content using AI to extract key information and insights.
|
||||
|
||||
Args:
|
||||
url_or_id (str): The arXiv URL or ID of the paper
|
||||
cleanup (bool): Whether to delete the PDF after extraction (default: True)
|
||||
|
||||
Returns:
|
||||
dict: Analysis results including summary, key findings, and concepts
|
||||
"""
|
||||
try:
|
||||
# Get paper content
|
||||
content = get_pdf_content(url_or_id, cleanup)
|
||||
if not content or 'Failed to' in content:
|
||||
return {'error': content}
|
||||
|
||||
# Generate paper summary
|
||||
summary_prompt = f"""Analyze this research paper and provide a comprehensive summary:
|
||||
{content[:8000]} # Limit content length for API
|
||||
|
||||
Provide a JSON response with:
|
||||
1. executive_summary: Brief overview (2-3 sentences)
|
||||
2. key_findings: List of main research findings
|
||||
3. methodology: Research methods used
|
||||
4. implications: Practical implications of the research
|
||||
5. limitations: Study limitations and constraints"""
|
||||
|
||||
summary_analysis = llm_text_gen(summary_prompt)
|
||||
|
||||
# Extract key concepts and relationships
|
||||
concepts_prompt = f"""Analyze this research paper and identify key concepts and relationships:
|
||||
{content[:8000]}
|
||||
|
||||
Provide a JSON response with:
|
||||
1. main_concepts: List of key technical concepts
|
||||
2. concept_relationships: How concepts are related
|
||||
3. novel_contributions: New ideas or approaches introduced
|
||||
4. technical_requirements: Required technologies or methods
|
||||
5. future_directions: Suggested future research"""
|
||||
|
||||
concept_analysis = llm_text_gen(concepts_prompt)
|
||||
|
||||
return {
|
||||
'summary_analysis': summary_analysis,
|
||||
'concept_analysis': concept_analysis,
|
||||
'full_text': content
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error analyzing paper content: {e}")
|
||||
return {'error': str(e)}
|
||||
|
||||
def analyze_research_trends(papers):
|
||||
"""
|
||||
Analyzes research trends across multiple papers.
|
||||
|
||||
Args:
|
||||
papers (list): List of paper metadata and content
|
||||
|
||||
Returns:
|
||||
dict: Trend analysis results
|
||||
"""
|
||||
try:
|
||||
# Collect paper information
|
||||
papers_info = []
|
||||
for paper in papers:
|
||||
content = get_pdf_content(paper['entry_id'], cleanup=True)
|
||||
if content and 'Failed to' not in content:
|
||||
papers_info.append({
|
||||
'title': paper['title'],
|
||||
'abstract': paper['summary'],
|
||||
'content': content[:8000], # Limit content length
|
||||
'year': paper['published'].year
|
||||
})
|
||||
|
||||
if not papers_info:
|
||||
return {'error': 'No valid paper content found for analysis'}
|
||||
|
||||
# Analyze trends
|
||||
trends_prompt = f"""Analyze these research papers and identify key trends:
|
||||
Papers:
|
||||
{str(papers_info)}
|
||||
|
||||
Provide a JSON response with:
|
||||
1. temporal_trends: How research focus evolved over time
|
||||
2. emerging_themes: New and growing research areas
|
||||
3. declining_themes: Decreasing research focus areas
|
||||
4. methodology_trends: Evolution of research methods
|
||||
5. technology_trends: Trends in technology usage
|
||||
6. research_gaps: Identified gaps and opportunities"""
|
||||
|
||||
trend_analysis = llm_text_gen(trends_prompt)
|
||||
return {'trend_analysis': trend_analysis}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error analyzing research trends: {e}")
|
||||
return {'error': str(e)}
|
||||
|
||||
def map_cross_paper_concepts(papers):
|
||||
"""
|
||||
Maps concepts and relationships across multiple papers.
|
||||
|
||||
Args:
|
||||
papers (list): List of paper metadata and content
|
||||
|
||||
Returns:
|
||||
dict: Concept mapping results
|
||||
"""
|
||||
try:
|
||||
# Analyze each paper
|
||||
paper_analyses = []
|
||||
for paper in papers:
|
||||
analysis = analyze_paper_content(paper['entry_id'])
|
||||
if 'error' not in analysis:
|
||||
paper_analyses.append({
|
||||
'paper_id': paper['entry_id'],
|
||||
'title': paper['title'],
|
||||
'analysis': analysis
|
||||
})
|
||||
|
||||
if not paper_analyses:
|
||||
return {'error': 'No valid paper analyses for concept mapping'}
|
||||
|
||||
# Generate cross-paper concept map
|
||||
mapping_prompt = f"""Analyze relationships between concepts across these papers:
|
||||
{str(paper_analyses)}
|
||||
|
||||
Provide a JSON response with:
|
||||
1. shared_concepts: Concepts appearing in multiple papers
|
||||
2. concept_evolution: How concepts developed across papers
|
||||
3. conflicting_views: Different interpretations of same concepts
|
||||
4. complementary_findings: How papers complement each other
|
||||
5. knowledge_gaps: Areas needing more research"""
|
||||
|
||||
concept_mapping = llm_text_gen(mapping_prompt)
|
||||
return {'concept_mapping': concept_mapping}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error mapping cross-paper concepts: {e}")
|
||||
return {'error': str(e)}
|
||||
|
||||
def generate_bibtex_entry(paper):
|
||||
"""
|
||||
Generates a BibTeX entry for a paper with complete metadata.
|
||||
|
||||
Args:
|
||||
paper (dict): Paper metadata dictionary
|
||||
|
||||
Returns:
|
||||
str: BibTeX entry string
|
||||
"""
|
||||
try:
|
||||
# Generate a unique citation key
|
||||
first_author = paper['authors'][0].split()[-1] if paper['authors'] else 'Unknown'
|
||||
year = paper['published'].year if paper['published'] else '0000'
|
||||
citation_key = f"{first_author}{year}{paper['entry_id'].split('/')[-1]}"
|
||||
|
||||
# Format authors for BibTeX
|
||||
authors = ' and '.join(paper['authors'])
|
||||
|
||||
# Create BibTeX entry
|
||||
bibtex = f"@article{{{citation_key},\n"
|
||||
bibtex += f" title = {{{paper['title']}}},\n"
|
||||
bibtex += f" author = {{{authors}}},\n"
|
||||
bibtex += f" year = {{{year}}},\n"
|
||||
bibtex += f" journal = {{arXiv preprint}},\n"
|
||||
bibtex += f" archivePrefix = {{arXiv}},\n"
|
||||
bibtex += f" eprint = {{{paper['entry_id'].split('/')[-1]}}},\n"
|
||||
if paper['doi']:
|
||||
bibtex += f" doi = {{{paper['doi']}}},\n"
|
||||
bibtex += f" url = {{{paper['entry_id']}}},\n"
|
||||
bibtex += f" abstract = {{{paper['summary']}}}\n"
|
||||
bibtex += "}"
|
||||
|
||||
return bibtex
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating BibTeX entry: {e}")
|
||||
return ""
|
||||
|
||||
def convert_citation_format(bibtex_str, target_format):
|
||||
"""
|
||||
Converts BibTeX citations to other formats and validates the output.
|
||||
|
||||
Args:
|
||||
bibtex_str (str): BibTeX entry string
|
||||
target_format (str): Target citation format ('apa', 'mla', 'chicago', etc.)
|
||||
|
||||
Returns:
|
||||
str: Formatted citation string
|
||||
"""
|
||||
try:
|
||||
# Parse BibTeX entry
|
||||
bib_database = bibtexparser.loads(bibtex_str)
|
||||
entry = bib_database.entries[0]
|
||||
|
||||
# Generate citation format prompt
|
||||
prompt = f"""Convert this bibliographic information to {target_format} format:
|
||||
Title: {entry.get('title', '')}
|
||||
Authors: {entry.get('author', '')}
|
||||
Year: {entry.get('year', '')}
|
||||
Journal: {entry.get('journal', '')}
|
||||
DOI: {entry.get('doi', '')}
|
||||
URL: {entry.get('url', '')}
|
||||
|
||||
Return only the formatted citation without any explanation."""
|
||||
|
||||
# Use AI to generate formatted citation
|
||||
formatted_citation = llm_text_gen(prompt)
|
||||
return formatted_citation.strip()
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting citation format: {e}")
|
||||
return ""
|
||||
|
||||
def visualize_reference_graph(papers):
|
||||
"""
|
||||
Creates a visual representation of the citation network.
|
||||
|
||||
Args:
|
||||
papers (list): List of paper metadata dictionaries
|
||||
|
||||
Returns:
|
||||
str: Path to the saved visualization file
|
||||
"""
|
||||
try:
|
||||
# Create directed graph
|
||||
G = nx.DiGraph()
|
||||
|
||||
# Add nodes and edges
|
||||
for paper in papers:
|
||||
paper_id = paper['entry_id']
|
||||
G.add_node(paper_id, title=paper['title'])
|
||||
|
||||
# Add citation edges
|
||||
if paper['doi']:
|
||||
for other_paper in papers:
|
||||
if other_paper['doi'] and other_paper['doi'] in paper['summary']:
|
||||
G.add_edge(paper_id, other_paper['entry_id'])
|
||||
|
||||
# Set up the visualization
|
||||
plt.figure(figsize=(12, 8))
|
||||
pos = nx.spring_layout(G)
|
||||
|
||||
# Draw the graph
|
||||
nx.draw(G, pos, with_labels=False, node_color='lightblue',
|
||||
node_size=1000, arrowsize=20)
|
||||
|
||||
# Add labels
|
||||
labels = nx.get_node_attributes(G, 'title')
|
||||
nx.draw_networkx_labels(G, pos, labels, font_size=8)
|
||||
|
||||
# Save the visualization
|
||||
output_path = 'reference_graph.png'
|
||||
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
||||
plt.close()
|
||||
|
||||
return output_path
|
||||
except Exception as e:
|
||||
logger.error(f"Error visualizing reference graph: {e}")
|
||||
return ""
|
||||
|
||||
def analyze_citation_impact(papers):
|
||||
"""
|
||||
Analyzes citation impact and influence patterns.
|
||||
|
||||
Args:
|
||||
papers (list): List of paper metadata dictionaries
|
||||
|
||||
Returns:
|
||||
dict: Citation impact analysis results
|
||||
"""
|
||||
try:
|
||||
# Create citation network
|
||||
G = nx.DiGraph()
|
||||
for paper in papers:
|
||||
G.add_node(paper['entry_id'], **paper)
|
||||
if paper['doi']:
|
||||
for other_paper in papers:
|
||||
if other_paper['doi'] and other_paper['doi'] in paper['summary']:
|
||||
G.add_edge(paper_id, other_paper['entry_id'])
|
||||
|
||||
# Calculate impact metrics
|
||||
impact_analysis = {
|
||||
'citation_counts': dict(G.in_degree()),
|
||||
'influence_scores': nx.pagerank(G),
|
||||
'authority_scores': nx.authority_matrix(G).diagonal(),
|
||||
'hub_scores': nx.hub_matrix(G).diagonal(),
|
||||
'citation_paths': dict(nx.all_pairs_shortest_path_length(G))
|
||||
}
|
||||
|
||||
# Add temporal analysis
|
||||
year_citations = defaultdict(int)
|
||||
for paper in papers:
|
||||
if paper['published']:
|
||||
year = paper['published'].year
|
||||
year_citations[year] += G.in_degree(paper['entry_id'])
|
||||
impact_analysis['temporal_trends'] = dict(year_citations)
|
||||
|
||||
return impact_analysis
|
||||
except Exception as e:
|
||||
logger.error(f"Error analyzing citation impact: {e}")
|
||||
return {}
|
||||
|
||||
def get_pdf_content(url_or_id, cleanup=True):
|
||||
"""
|
||||
Extracts text content from a paper's PDF with improved error handling.
|
||||
|
||||
Args:
|
||||
url_or_id (str): The arXiv URL or ID of the paper
|
||||
cleanup (bool): Whether to delete the PDF after extraction (default: True)
|
||||
|
||||
Returns:
|
||||
str: The extracted text content or error message
|
||||
"""
|
||||
try:
|
||||
# Extract arxiv ID from URL if needed
|
||||
arxiv_id = url_or_id.split('/')[-1] if '/' in url_or_id else url_or_id
|
||||
|
||||
# Download PDF
|
||||
pdf_path = download_paper(arxiv_id)
|
||||
if not pdf_path:
|
||||
return "Failed to download PDF."
|
||||
|
||||
# Extract text from PDF
|
||||
pdf_text = ''
|
||||
with open(pdf_path, 'rb') as f:
|
||||
pdf_reader = PyPDF2.PdfReader(f)
|
||||
for page_num, page in enumerate(pdf_reader.pages, 1):
|
||||
try:
|
||||
page_text = page.extract_text()
|
||||
if page_text:
|
||||
pdf_text += f"\n--- Page {page_num} ---\n{page_text}"
|
||||
except Exception as err:
|
||||
logger.error(f"Error extracting text from page {page_num}: {err}")
|
||||
continue
|
||||
|
||||
# Clean up
|
||||
if cleanup:
|
||||
try:
|
||||
os.remove(pdf_path)
|
||||
logger.debug(f"Cleaned up temporary PDF file: {pdf_path}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to cleanup PDF file {pdf_path}: {e}")
|
||||
|
||||
# Process and return text
|
||||
if not pdf_text.strip():
|
||||
return "No text content could be extracted from the PDF."
|
||||
|
||||
return clean_pdf_text(pdf_text)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process PDF: {e}")
|
||||
return f"Failed to retrieve content: {str(e)}"
|
||||
|
||||
def clean_pdf_text(text):
|
||||
"""
|
||||
Helper function to clean the text extracted from a PDF.
|
||||
|
||||
Args:
|
||||
text (str): The text to clean.
|
||||
|
||||
Returns:
|
||||
str: The cleaned text.
|
||||
"""
|
||||
pattern = r'References\s*.*'
|
||||
text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
|
||||
sections_to_remove = ['Acknowledgements', 'References', 'Bibliography']
|
||||
for section in sections_to_remove:
|
||||
pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
|
||||
text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
|
||||
return text
|
||||
|
||||
def download_image(image_url, base_url, folder="images"):
|
||||
"""
|
||||
Downloads an image from a URL.
|
||||
|
||||
Args:
|
||||
image_url (str): The URL of the image.
|
||||
base_url (str): The base URL of the website.
|
||||
folder (str): The folder to save the image.
|
||||
|
||||
Returns:
|
||||
bool: True if the image was downloaded successfully, False otherwise.
|
||||
"""
|
||||
if image_url.startswith('data:image'):
|
||||
logger.info(f"Skipping download of data URI image: {image_url}")
|
||||
return False
|
||||
if not os.path.exists(folder):
|
||||
os.makedirs(folder)
|
||||
if not urlparse(image_url).scheme:
|
||||
if not base_url.endswith('/'):
|
||||
base_url += '/'
|
||||
image_url = base_url + image_url
|
||||
try:
|
||||
response = requests.get(image_url)
|
||||
response.raise_for_status()
|
||||
image_name = image_url.split("/")[-1]
|
||||
with open(os.path.join(folder, image_name), 'wb') as file:
|
||||
file.write(response.content)
|
||||
return True
|
||||
except requests.RequestException as e:
|
||||
logger.error(f"Error downloading {image_url}: {e}")
|
||||
return False
|
||||
|
||||
def scrape_images_from_arxiv(url):
|
||||
"""
|
||||
Scrapes images from an arXiv page.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the arXiv page.
|
||||
|
||||
Returns:
|
||||
list: A list of image URLs.
|
||||
"""
|
||||
try:
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
images = soup.find_all('img')
|
||||
image_urls = [img['src'] for img in images if 'src' in img.attrs]
|
||||
return image_urls
|
||||
except requests.RequestException as e:
|
||||
logger.error(f"Error fetching page {url}: {e}")
|
||||
return []
|
||||
|
||||
def generate_bibtex(paper_id, client=None):
|
||||
"""
|
||||
Generate a BibTeX entry for an arXiv paper with enhanced metadata.
|
||||
|
||||
Args:
|
||||
paper_id (str): The arXiv ID of the paper
|
||||
client (arxiv.Client): Optional custom client (default: None)
|
||||
|
||||
Returns:
|
||||
str: BibTeX entry as a string
|
||||
"""
|
||||
try:
|
||||
if client is None:
|
||||
client = create_arxiv_client()
|
||||
|
||||
# Fetch paper metadata
|
||||
paper = next(client.results(arxiv.Search(id_list=[paper_id])))
|
||||
|
||||
# Extract author information
|
||||
authors = [str(author) for author in paper.authors]
|
||||
first_author = authors[0].split(', ')[0] if authors else 'Unknown'
|
||||
|
||||
# Format year
|
||||
year = paper.published.year if paper.published else 'Unknown'
|
||||
|
||||
# Create citation key
|
||||
citation_key = f"{first_author}{str(year)[-2:]}"
|
||||
|
||||
# Build BibTeX entry
|
||||
bibtex = [
|
||||
f"@article{{{citation_key},",
|
||||
f" author = {{{' and '.join(authors)}}},",
|
||||
f" title = {{{paper.title}}},",
|
||||
f" year = {{{year}}},",
|
||||
f" eprint = {{{paper_id}}},",
|
||||
f" archivePrefix = {{arXiv}},"
|
||||
]
|
||||
|
||||
# Add optional fields if available
|
||||
if paper.doi:
|
||||
bibtex.append(f" doi = {{{paper.doi}}},")
|
||||
if getattr(paper, 'journal_ref', None):
|
||||
bibtex.append(f" journal = {{{paper.journal_ref}}},")
|
||||
if getattr(paper, 'primary_category', None):
|
||||
bibtex.append(f" primaryClass = {{{paper.primary_category}}},")
|
||||
|
||||
# Add URL and close entry
|
||||
bibtex.extend([
|
||||
f" url = {{https://arxiv.org/abs/{paper_id}}}",
|
||||
"}"
|
||||
])
|
||||
|
||||
return '\n'.join(bibtex)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating BibTeX for {paper_id}: {e}")
|
||||
return ""
|
||||
|
||||
def batch_download_papers(paper_ids, output_dir="downloads", get_source=False):
|
||||
"""
|
||||
Download multiple papers in batch with progress tracking.
|
||||
|
||||
Args:
|
||||
paper_ids (list): List of arXiv IDs to download
|
||||
output_dir (str): Directory to save downloaded files (default: 'downloads')
|
||||
get_source (bool): If True, downloads source files instead of PDFs (default: False)
|
||||
|
||||
Returns:
|
||||
dict: Mapping of paper IDs to their download status and paths
|
||||
"""
|
||||
results = {}
|
||||
client = create_arxiv_client()
|
||||
|
||||
for paper_id in paper_ids:
|
||||
try:
|
||||
file_path = download_paper(paper_id, output_dir, get_source=get_source)
|
||||
results[paper_id] = {
|
||||
'success': bool(file_path),
|
||||
'path': file_path,
|
||||
'error': None
|
||||
}
|
||||
except Exception as e:
|
||||
results[paper_id] = {
|
||||
'success': False,
|
||||
'path': None,
|
||||
'error': str(e)
|
||||
}
|
||||
logger.error(f"Failed to download {paper_id}: {e}")
|
||||
|
||||
return results
|
||||
|
||||
def batch_generate_bibtex(paper_ids):
|
||||
"""
|
||||
Generate BibTeX entries for multiple papers.
|
||||
|
||||
Args:
|
||||
paper_ids (list): List of arXiv IDs
|
||||
|
||||
Returns:
|
||||
dict: Mapping of paper IDs to their BibTeX entries
|
||||
"""
|
||||
results = {}
|
||||
client = create_arxiv_client()
|
||||
|
||||
for paper_id in paper_ids:
|
||||
try:
|
||||
bibtex = generate_bibtex(paper_id, client)
|
||||
results[paper_id] = {
|
||||
'success': bool(bibtex),
|
||||
'bibtex': bibtex,
|
||||
'error': None
|
||||
}
|
||||
except Exception as e:
|
||||
results[paper_id] = {
|
||||
'success': False,
|
||||
'bibtex': '',
|
||||
'error': str(e)
|
||||
}
|
||||
logger.error(f"Failed to generate BibTeX for {paper_id}: {e}")
|
||||
|
||||
return results
|
||||
|
||||
def extract_arxiv_ids_from_line(line):
|
||||
"""
|
||||
Extract the arXiv ID from a given line of text.
|
||||
|
||||
Args:
|
||||
line (str): A line of text potentially containing an arXiv URL.
|
||||
|
||||
Returns:
|
||||
str: The extracted arXiv ID, or None if not found.
|
||||
"""
|
||||
arxiv_id_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?')
|
||||
match = arxiv_id_pattern.search(line)
|
||||
if match:
|
||||
return match.group(1) + (match.group(2) if match.group(2) else '')
|
||||
return None
|
||||
|
||||
def read_written_ids(file_path):
|
||||
"""
|
||||
Read already written arXiv IDs from a file.
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the file containing written IDs.
|
||||
|
||||
Returns:
|
||||
set: A set of arXiv IDs.
|
||||
"""
|
||||
written_ids = set()
|
||||
try:
|
||||
with open(file_path, 'r', encoding="utf-8") as file:
|
||||
for line in file:
|
||||
written_ids.add(line.strip())
|
||||
except FileNotFoundError:
|
||||
logger.error(f"File not found: {file_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error while reading the file: {e}")
|
||||
return written_ids
|
||||
|
||||
def append_id_to_file(arxiv_id, output_file_path):
|
||||
"""
|
||||
Append a single arXiv ID to a file. Checks if the file exists and creates it if not.
|
||||
|
||||
Args:
|
||||
arxiv_id (str): The arXiv ID to append.
|
||||
output_file_path (str): Path to the output file.
|
||||
"""
|
||||
try:
|
||||
if not os.path.exists(output_file_path):
|
||||
logger.info(f"File does not exist. Creating new file: {output_file_path}")
|
||||
with open(output_file_path, 'a', encoding="utf-8") as outfile:
|
||||
outfile.write(arxiv_id + '\n')
|
||||
else:
|
||||
logger.info(f"Appending to existing file: {output_file_path}")
|
||||
with open(output_file_path, 'a', encoding="utf-8") as outfile:
|
||||
outfile.write(arxiv_id + '\n')
|
||||
except Exception as e:
|
||||
logger.error(f"Error while appending to file: {e}")
|
||||
Reference in New Issue
Block a user