Added new features to the project

2025-06-30 07:49:48 +05:30
parent bbe56a364d
commit b21cbb68da
48 changed files with 19774 additions and 1889 deletions
--- a/lib/ai_web_researcher/arxiv_schlorly_research.py
+++ b/lib/ai_web_researcher/arxiv_schlorly_research.py
@@ -14,30 +14,273 @@ import pandas as pd
 import arxiv
 import PyPDF2
 import requests
+import networkx as nx
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 from loguru import logger
+from ..gpt_providers.text_generation.main_text_generation import llm_text_gen
+import bibtexparser
+from pylatexenc.latex2text import LatexNodes2Text
+from matplotlib import pyplot as plt
+from collections import defaultdict
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.cluster import KMeans
+import numpy as np

 logger.remove()
 logger.add(sys.stdout, colorize=True, format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}")

-def fetch_arxiv_data(query, max_results=10):
+def create_arxiv_client(page_size=100, delay_seconds=3.0, num_retries=3):
    """
-    Fetches arXiv data based on a query.
+    Creates a reusable arXiv API client with custom configuration.

    Args:
-        query (str): The search query.
-        max_results (int): The maximum number of results to fetch.
+        page_size (int): Number of results per page (default: 100)
+        delay_seconds (float): Delay between API requests (default: 3.0)
+        num_retries (int): Number of retries for failed requests (default: 3)

    Returns:
-        list: A list of arXiv data.
+        arxiv.Client: Configured arXiv API client
    """
    try:
-        client = arxiv.Client()
-        search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
+        client = arxiv.Client(
+            page_size=page_size,
+            delay_seconds=delay_seconds,
+            num_retries=num_retries
+        )
+        return client
+    except Exception as e:
+        logger.error(f"Error creating arXiv client: {e}")
+        raise e
+
+def expand_search_query(query, research_interests=None):
+    """
+    Uses AI to expand the search query based on user's research interests.
+
+    Args:
+        query (str): Original search query
+        research_interests (list): List of user's research interests
+
+    Returns:
+        str: Expanded search query
+    """
+    try:
+        interests_context = "\n".join(research_interests) if research_interests else ""
+        prompt = f"""Given the original arXiv search query: '{query}'
+        {f'And considering these research interests:\n{interests_context}' if interests_context else ''}
+        Generate an expanded arXiv search query that:
+        1. Includes relevant synonyms and related concepts
+        2. Uses appropriate arXiv search operators (AND, OR, etc.)
+        3. Incorporates field-specific tags (ti:, abs:, au:, etc.)
+        4. Maintains focus on the core topic
+        Return only the expanded query without any explanation."""
+        
+        expanded_query = llm_text_gen(prompt)
+        logger.info(f"Expanded query: {expanded_query}")
+        return expanded_query
+    except Exception as e:
+        logger.error(f"Error expanding search query: {e}")
+        return query
+
+def analyze_citation_network(papers):
+    """
+    Analyzes citation relationships between papers using DOIs and references.
+
+    Args:
+        papers (list): List of paper metadata dictionaries
+
+    Returns:
+        dict: Citation network analysis results
+    """
+    try:
+        # Create a directed graph for citations
+        G = nx.DiGraph()
+        
+        # Add nodes and edges
+        for paper in papers:
+            paper_id = paper['entry_id']
+            G.add_node(paper_id, title=paper['title'])
+            
+            # Add edges based on DOIs and references
+            if paper['doi']:
+                for other_paper in papers:
+                    if other_paper['doi'] and other_paper['doi'] in paper['summary']:
+                        G.add_edge(paper_id, other_paper['entry_id'])
+        
+        # Calculate network metrics
+        analysis = {
+            'influential_papers': sorted(nx.pagerank(G).items(), key=lambda x: x[1], reverse=True),
+            'citation_clusters': list(nx.connected_components(G.to_undirected())),
+            'citation_paths': dict(nx.all_pairs_shortest_path_length(G))
+        }
+        return analysis
+    except Exception as e:
+        logger.error(f"Error analyzing citation network: {e}")
+        return {}
+
+def categorize_papers(papers):
+    """
+    Uses AI to categorize papers based on their metadata and content.
+
+    Args:
+        papers (list): List of paper metadata dictionaries
+
+    Returns:
+        dict: Paper categorization results
+    """
+    try:
+        categorized_papers = {}
+        for paper in papers:
+            prompt = f"""Analyze this research paper and provide detailed categorization:
+            Title: {paper['title']}
+            Abstract: {paper['summary']}
+            Primary Category: {paper['primary_category']}
+            Categories: {', '.join(paper['categories'])}
+            
+            Provide a JSON response with these fields:
+            1. main_theme: Primary research theme
+            2. sub_themes: List of related sub-themes
+            3. methodology: Research methodology used
+            4. application_domains: Potential application areas
+            5. technical_complexity: Level (Basic/Intermediate/Advanced)"""
+            
+            categorization = llm_text_gen(prompt)
+            categorized_papers[paper['entry_id']] = categorization
+        
+        return categorized_papers
+    except Exception as e:
+        logger.error(f"Error categorizing papers: {e}")
+        return {}
+
+def get_paper_recommendations(papers, research_interests):
+    """
+    Generates personalized paper recommendations based on user's research interests.
+
+    Args:
+        papers (list): List of paper metadata dictionaries
+        research_interests (list): User's research interests
+
+    Returns:
+        dict: Personalized paper recommendations
+    """
+    try:
+        interests_text = "\n".join(research_interests)
+        recommendations = {}
+        
+        for paper in papers:
+            prompt = f"""Evaluate this paper's relevance to the user's research interests:
+            Paper:
+            - Title: {paper['title']}
+            - Abstract: {paper['summary']}
+            - Categories: {', '.join(paper['categories'])}
+            
+            User's Research Interests:
+            {interests_text}
+            
+            Provide a JSON response with:
+            1. relevance_score: 0-100
+            2. relevance_aspects: List of matching aspects
+            3. potential_value: How this paper could benefit the user's research"""
+            
+            evaluation = llm_text_gen(prompt)
+            recommendations[paper['entry_id']] = evaluation
+        
+        return recommendations
+    except Exception as e:
+        logger.error(f"Error generating paper recommendations: {e}")
+        return {}
+
+def fetch_arxiv_data(query, max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate, sort_order=None, client=None, research_interests=None):
+    """
+    Fetches arXiv data based on a query with advanced search options.
+
+    Args:
+        query (str): The search query (supports advanced syntax, e.g., 'au:einstein AND cat:physics')
+        max_results (int): The maximum number of results to fetch
+        sort_by (arxiv.SortCriterion): Sorting criterion (default: SubmittedDate)
+        sort_order (str): Sort order ('ascending' or 'descending', default: None)
+        client (arxiv.Client): Optional custom client (default: None, creates new client)
+
+    Returns:
+        list: A list of arXiv data with extended metadata
+    """
+    try:
+        if client is None:
+            client = create_arxiv_client()
+
+        # Expand search query using AI if research interests are provided
+        expanded_query = expand_search_query(query, research_interests) if research_interests else query
+        logger.info(f"Using expanded query: {expanded_query}")
+
+        search = arxiv.Search(
+            query=expanded_query,
+            max_results=max_results,
+            sort_by=sort_by,
+            sort_order=sort_order
+        )
+
        results = list(client.results(search))
-        all_data = [[result.title, result.published, result.entry_id, result.summary, result.pdf_url] for result in results]
-        return all_data
+        all_data = [
+            {
+                'title': result.title,
+                'published': result.published,
+                'updated': result.updated,
+                'entry_id': result.entry_id,
+                'summary': result.summary,
+                'authors': [str(author) for author in result.authors],
+                'pdf_url': result.pdf_url,
+                'journal_ref': getattr(result, 'journal_ref', None),
+                'doi': getattr(result, 'doi', None),
+                'primary_category': getattr(result, 'primary_category', None),
+                'categories': getattr(result, 'categories', []),
+                'links': [link.href for link in getattr(result, 'links', [])]
+            }
+            for result in results
+        ]
+
+        # Enhance results with AI-powered analysis
+        if all_data:
+            # Analyze citation network
+            citation_analysis = analyze_citation_network(all_data)
+            
+            # Categorize papers using AI
+            paper_categories = categorize_papers(all_data)
+            
+            # Generate recommendations if research interests are provided
+            recommendations = get_paper_recommendations(all_data, research_interests) if research_interests else {}
+            
+            # Perform content analysis
+            content_analyses = [analyze_paper_content(paper['entry_id']) for paper in all_data]
+            trend_analysis = analyze_research_trends(all_data)
+            concept_mapping = map_cross_paper_concepts(all_data)
+            
+            # Generate bibliography data
+            bibliography_data = {
+                'bibtex_entries': [generate_bibtex_entry(paper) for paper in all_data],
+                'citations': {
+                    'apa': [convert_citation_format(generate_bibtex_entry(paper), 'apa') for paper in all_data],
+                    'mla': [convert_citation_format(generate_bibtex_entry(paper), 'mla') for paper in all_data],
+                    'chicago': [convert_citation_format(generate_bibtex_entry(paper), 'chicago') for paper in all_data]
+                },
+                'reference_graph': visualize_reference_graph(all_data),
+                'citation_impact': analyze_citation_impact(all_data)
+            }
+            
+            # Add enhanced data to results
+            enhanced_data = {
+                'papers': all_data,
+                'citation_analysis': citation_analysis,
+                'paper_categories': paper_categories,
+                'recommendations': recommendations,
+                'content_analyses': content_analyses,
+                'trend_analysis': trend_analysis,
+                'concept_mapping': concept_mapping,
+                'bibliography': bibliography_data
+            }
+            return enhanced_data
+        
+        return {'papers': all_data}
    except Exception as e:
        logger.error(f"An error occurred while fetching data from arXiv: {e}")
        raise e
@@ -90,38 +333,401 @@ def get_arxiv_main_content(url):
        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
        return get_pdf_content(url)

-def get_pdf_content(url):
+def download_paper(paper_id, output_dir="downloads", filename=None, get_source=False):
    """
-    Helper function to get the content from a PDF if HTML content is not accessible.
+    Downloads a paper's PDF or source files with enhanced error handling.

    Args:
-        url (str): The URL of the arXiv paper.
+        paper_id (str): The arXiv ID of the paper
+        output_dir (str): Directory to save the downloaded file (default: 'downloads')
+        filename (str): Custom filename (default: None, uses paper ID)
+        get_source (bool): If True, downloads source files instead of PDF (default: False)

    Returns:
-        str: The main content of the paper as a string.
+        str: Path to the downloaded file or None if download fails
    """
    try:
-        client = arxiv.Client()
-        arxiv_id = url.split('/')[-1]
-        paper = next(client.results(arxiv.Search(id_list=[arxiv_id])))
-        pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
+        # Create output directory if it doesn't exist
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Get paper metadata
+        client = create_arxiv_client()
+        paper = next(client.results(arxiv.Search(id_list=[paper_id])))
+
+        # Set filename if not provided
+        if not filename:
+            safe_title = re.sub(r'[^\w\-_.]', '_', paper.title[:50])
+            filename = f"{paper_id}_{safe_title}"
+            filename += ".tar.gz" if get_source else ".pdf"
+
+        # Full path for the downloaded file
+        file_path = os.path.join(output_dir, filename)
+
+        # Download the file
+        if get_source:
+            paper.download_source(dirpath=output_dir, filename=filename)
+        else:
+            paper.download_pdf(dirpath=output_dir, filename=filename)
+
+        logger.info(f"Successfully downloaded {'source' if get_source else 'PDF'} to {file_path}")
+        return file_path
+
+    except Exception as e:
+        logger.error(f"Error downloading {'source' if get_source else 'PDF'} for {paper_id}: {e}")
+        return None
+
+def analyze_paper_content(url_or_id, cleanup=True):
+    """
+    Analyzes paper content using AI to extract key information and insights.
+
+    Args:
+        url_or_id (str): The arXiv URL or ID of the paper
+        cleanup (bool): Whether to delete the PDF after extraction (default: True)
+
+    Returns:
+        dict: Analysis results including summary, key findings, and concepts
+    """
+    try:
+        # Get paper content
+        content = get_pdf_content(url_or_id, cleanup)
+        if not content or 'Failed to' in content:
+            return {'error': content}
+
+        # Generate paper summary
+        summary_prompt = f"""Analyze this research paper and provide a comprehensive summary:
+        {content[:8000]}  # Limit content length for API
+        
+        Provide a JSON response with:
+        1. executive_summary: Brief overview (2-3 sentences)
+        2. key_findings: List of main research findings
+        3. methodology: Research methods used
+        4. implications: Practical implications of the research
+        5. limitations: Study limitations and constraints"""
+        
+        summary_analysis = llm_text_gen(summary_prompt)
+
+        # Extract key concepts and relationships
+        concepts_prompt = f"""Analyze this research paper and identify key concepts and relationships:
+        {content[:8000]}
+        
+        Provide a JSON response with:
+        1. main_concepts: List of key technical concepts
+        2. concept_relationships: How concepts are related
+        3. novel_contributions: New ideas or approaches introduced
+        4. technical_requirements: Required technologies or methods
+        5. future_directions: Suggested future research"""
+        
+        concept_analysis = llm_text_gen(concepts_prompt)
+
+        return {
+            'summary_analysis': summary_analysis,
+            'concept_analysis': concept_analysis,
+            'full_text': content
+        }
+    except Exception as e:
+        logger.error(f"Error analyzing paper content: {e}")
+        return {'error': str(e)}
+
+def analyze_research_trends(papers):
+    """
+    Analyzes research trends across multiple papers.
+
+    Args:
+        papers (list): List of paper metadata and content
+
+    Returns:
+        dict: Trend analysis results
+    """
+    try:
+        # Collect paper information
+        papers_info = []
+        for paper in papers:
+            content = get_pdf_content(paper['entry_id'], cleanup=True)
+            if content and 'Failed to' not in content:
+                papers_info.append({
+                    'title': paper['title'],
+                    'abstract': paper['summary'],
+                    'content': content[:8000],  # Limit content length
+                    'year': paper['published'].year
+                })
+
+        if not papers_info:
+            return {'error': 'No valid paper content found for analysis'}
+
+        # Analyze trends
+        trends_prompt = f"""Analyze these research papers and identify key trends:
+        Papers:
+        {str(papers_info)}
+        
+        Provide a JSON response with:
+        1. temporal_trends: How research focus evolved over time
+        2. emerging_themes: New and growing research areas
+        3. declining_themes: Decreasing research focus areas
+        4. methodology_trends: Evolution of research methods
+        5. technology_trends: Trends in technology usage
+        6. research_gaps: Identified gaps and opportunities"""
+
+        trend_analysis = llm_text_gen(trends_prompt)
+        return {'trend_analysis': trend_analysis}
+
+    except Exception as e:
+        logger.error(f"Error analyzing research trends: {e}")
+        return {'error': str(e)}
+
+def map_cross_paper_concepts(papers):
+    """
+    Maps concepts and relationships across multiple papers.
+
+    Args:
+        papers (list): List of paper metadata and content
+
+    Returns:
+        dict: Concept mapping results
+    """
+    try:
+        # Analyze each paper
+        paper_analyses = []
+        for paper in papers:
+            analysis = analyze_paper_content(paper['entry_id'])
+            if 'error' not in analysis:
+                paper_analyses.append({
+                    'paper_id': paper['entry_id'],
+                    'title': paper['title'],
+                    'analysis': analysis
+                })
+
+        if not paper_analyses:
+            return {'error': 'No valid paper analyses for concept mapping'}
+
+        # Generate cross-paper concept map
+        mapping_prompt = f"""Analyze relationships between concepts across these papers:
+        {str(paper_analyses)}
+        
+        Provide a JSON response with:
+        1. shared_concepts: Concepts appearing in multiple papers
+        2. concept_evolution: How concepts developed across papers
+        3. conflicting_views: Different interpretations of same concepts
+        4. complementary_findings: How papers complement each other
+        5. knowledge_gaps: Areas needing more research"""
+
+        concept_mapping = llm_text_gen(mapping_prompt)
+        return {'concept_mapping': concept_mapping}
+
+    except Exception as e:
+        logger.error(f"Error mapping cross-paper concepts: {e}")
+        return {'error': str(e)}
+
+def generate_bibtex_entry(paper):
+    """
+    Generates a BibTeX entry for a paper with complete metadata.
+
+    Args:
+        paper (dict): Paper metadata dictionary
+
+    Returns:
+        str: BibTeX entry string
+    """
+    try:
+        # Generate a unique citation key
+        first_author = paper['authors'][0].split()[-1] if paper['authors'] else 'Unknown'
+        year = paper['published'].year if paper['published'] else '0000'
+        citation_key = f"{first_author}{year}{paper['entry_id'].split('/')[-1]}"
+
+        # Format authors for BibTeX
+        authors = ' and '.join(paper['authors'])
+
+        # Create BibTeX entry
+        bibtex = f"@article{{{citation_key},\n"
+        bibtex += f"  title = {{{paper['title']}}},\n"
+        bibtex += f"  author = {{{authors}}},\n"
+        bibtex += f"  year = {{{year}}},\n"
+        bibtex += f"  journal = {{arXiv preprint}},\n"
+        bibtex += f"  archivePrefix = {{arXiv}},\n"
+        bibtex += f"  eprint = {{{paper['entry_id'].split('/')[-1]}}},\n"
+        if paper['doi']:
+            bibtex += f"  doi = {{{paper['doi']}}},\n"
+        bibtex += f"  url = {{{paper['entry_id']}}},\n"
+        bibtex += f"  abstract = {{{paper['summary']}}}\n"
+        bibtex += "}"
+
+        return bibtex
+    except Exception as e:
+        logger.error(f"Error generating BibTeX entry: {e}")
+        return ""
+
+def convert_citation_format(bibtex_str, target_format):
+    """
+    Converts BibTeX citations to other formats and validates the output.
+
+    Args:
+        bibtex_str (str): BibTeX entry string
+        target_format (str): Target citation format ('apa', 'mla', 'chicago', etc.)
+
+    Returns:
+        str: Formatted citation string
+    """
+    try:
+        # Parse BibTeX entry
+        bib_database = bibtexparser.loads(bibtex_str)
+        entry = bib_database.entries[0]
+
+        # Generate citation format prompt
+        prompt = f"""Convert this bibliographic information to {target_format} format:
+        Title: {entry.get('title', '')}
+        Authors: {entry.get('author', '')}
+        Year: {entry.get('year', '')}
+        Journal: {entry.get('journal', '')}
+        DOI: {entry.get('doi', '')}
+        URL: {entry.get('url', '')}
+        
+        Return only the formatted citation without any explanation."""
+
+        # Use AI to generate formatted citation
+        formatted_citation = llm_text_gen(prompt)
+        return formatted_citation.strip()
+    except Exception as e:
+        logger.error(f"Error converting citation format: {e}")
+        return ""
+
+def visualize_reference_graph(papers):
+    """
+    Creates a visual representation of the citation network.
+
+    Args:
+        papers (list): List of paper metadata dictionaries
+
+    Returns:
+        str: Path to the saved visualization file
+    """
+    try:
+        # Create directed graph
+        G = nx.DiGraph()
+        
+        # Add nodes and edges
+        for paper in papers:
+            paper_id = paper['entry_id']
+            G.add_node(paper_id, title=paper['title'])
+            
+            # Add citation edges
+            if paper['doi']:
+                for other_paper in papers:
+                    if other_paper['doi'] and other_paper['doi'] in paper['summary']:
+                        G.add_edge(paper_id, other_paper['entry_id'])
+        
+        # Set up the visualization
+        plt.figure(figsize=(12, 8))
+        pos = nx.spring_layout(G)
+        
+        # Draw the graph
+        nx.draw(G, pos, with_labels=False, node_color='lightblue', 
+                node_size=1000, arrowsize=20)
+        
+        # Add labels
+        labels = nx.get_node_attributes(G, 'title')
+        nx.draw_networkx_labels(G, pos, labels, font_size=8)
+        
+        # Save the visualization
+        output_path = 'reference_graph.png'
+        plt.savefig(output_path, dpi=300, bbox_inches='tight')
+        plt.close()
+        
+        return output_path
+    except Exception as e:
+        logger.error(f"Error visualizing reference graph: {e}")
+        return ""
+
+def analyze_citation_impact(papers):
+    """
+    Analyzes citation impact and influence patterns.
+
+    Args:
+        papers (list): List of paper metadata dictionaries
+
+    Returns:
+        dict: Citation impact analysis results
+    """
+    try:
+        # Create citation network
+        G = nx.DiGraph()
+        for paper in papers:
+            G.add_node(paper['entry_id'], **paper)
+            if paper['doi']:
+                for other_paper in papers:
+                    if other_paper['doi'] and other_paper['doi'] in paper['summary']:
+                        G.add_edge(paper_id, other_paper['entry_id'])
+
+        # Calculate impact metrics
+        impact_analysis = {
+            'citation_counts': dict(G.in_degree()),
+            'influence_scores': nx.pagerank(G),
+            'authority_scores': nx.authority_matrix(G).diagonal(),
+            'hub_scores': nx.hub_matrix(G).diagonal(),
+            'citation_paths': dict(nx.all_pairs_shortest_path_length(G))
+        }
+
+        # Add temporal analysis
+        year_citations = defaultdict(int)
+        for paper in papers:
+            if paper['published']:
+                year = paper['published'].year
+                year_citations[year] += G.in_degree(paper['entry_id'])
+        impact_analysis['temporal_trends'] = dict(year_citations)
+
+        return impact_analysis
+    except Exception as e:
+        logger.error(f"Error analyzing citation impact: {e}")
+        return {}
+
+def get_pdf_content(url_or_id, cleanup=True):
+    """
+    Extracts text content from a paper's PDF with improved error handling.
+
+    Args:
+        url_or_id (str): The arXiv URL or ID of the paper
+        cleanup (bool): Whether to delete the PDF after extraction (default: True)
+
+    Returns:
+        str: The extracted text content or error message
+    """
+    try:
+        # Extract arxiv ID from URL if needed
+        arxiv_id = url_or_id.split('/')[-1] if '/' in url_or_id else url_or_id
+        
+        # Download PDF
+        pdf_path = download_paper(arxiv_id)
+        if not pdf_path:
+            return "Failed to download PDF."
+
+        # Extract text from PDF
        pdf_text = ''
-        with open(pdf_filename, 'rb') as f:
+        with open(pdf_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
-            for page in pdf_reader.pages:
+            for page_num, page in enumerate(pdf_reader.pages, 1):
                try:
                    page_text = page.extract_text()
                    if page_text:
-                        pdf_text += page_text + '\n'
-                except UnicodeDecodeError as err:
-                    logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
-                    pass
-        os.remove(pdf_filename)
-        pdf_text = clean_pdf_text(pdf_text)
-        return pdf_text
-    except Exception as pdf_error:
-        logger.error(f"Failed to process PDF: {pdf_error}")
-        return "Failed to retrieve content."
+                        pdf_text += f"\n--- Page {page_num} ---\n{page_text}"
+                except Exception as err:
+                    logger.error(f"Error extracting text from page {page_num}: {err}")
+                    continue
+
+        # Clean up
+        if cleanup:
+            try:
+                os.remove(pdf_path)
+                logger.debug(f"Cleaned up temporary PDF file: {pdf_path}")
+            except Exception as e:
+                logger.warning(f"Failed to cleanup PDF file {pdf_path}: {e}")
+
+        # Process and return text
+        if not pdf_text.strip():
+            return "No text content could be extracted from the PDF."
+            
+        return clean_pdf_text(pdf_text)
+
+    except Exception as e:
+        logger.error(f"Failed to process PDF: {e}")
+        return f"Failed to retrieve content: {str(e)}"

 def clean_pdf_text(text):
    """
@@ -194,48 +800,128 @@ def scrape_images_from_arxiv(url):
        logger.error(f"Error fetching page {url}: {e}")
        return []

-def arxiv_bibtex(arxiv_id):
+def generate_bibtex(paper_id, client=None):
    """
-    Get the BibTeX entry for an arXiv paper.
+    Generate a BibTeX entry for an arXiv paper with enhanced metadata.

-    Args: 
-        arxiv_id: The arXiv ID of the paper.
+    Args:
+        paper_id (str): The arXiv ID of the paper
+        client (arxiv.Client): Optional custom client (default: None)

-    Returns: 
-        A string containing the BibTeX entry.
+    Returns:
+        str: BibTeX entry as a string
    """
    try:
-        usock = urllib.request.urlopen(f'http://export.arxiv.org/api/query?id_list={arxiv_id}')
-        xmldoc = xml.dom.minidom.parse(usock)
-        usock.close()
-        entry = xmldoc.getElementsByTagName("entry")[0]
-        date = entry.getElementsByTagName("updated")[0].firstChild.data
-        text_year = date[:4]
-        title = entry.getElementsByTagName("title")[0]
-        text_title = title.firstChild.data.strip()
-        authorlist = []
-        first = True
-        for person_name in entry.getElementsByTagName("author"):
-            name = person_name.getElementsByTagName("name")[0]
-            text_name = name.firstChild.data
-            text_given_name = ' '.join(text_name.split()[:-1])
-            text_surname = text_name.split()[-1]
-            authorlist.append(f"{text_surname}, {text_given_name}")
-            if first:
-                text_first_author_surname = text_surname
-                first = False
-        bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
-        bibtex += f" author = {' and '.join(authorlist)},\n"
-        bibtex += f" title = {{{text_title}}},\n"
-        bibtex += f" year = {{{text_year}}},\n"
-        bibtex += f" eprint = {{{arxiv_id}}},\n"
-        bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
-        bibtex += "}"
-        return bibtex
+        if client is None:
+            client = create_arxiv_client()
+
+        # Fetch paper metadata
+        paper = next(client.results(arxiv.Search(id_list=[paper_id])))
+        
+        # Extract author information
+        authors = [str(author) for author in paper.authors]
+        first_author = authors[0].split(', ')[0] if authors else 'Unknown'
+        
+        # Format year
+        year = paper.published.year if paper.published else 'Unknown'
+        
+        # Create citation key
+        citation_key = f"{first_author}{str(year)[-2:]}"
+        
+        # Build BibTeX entry
+        bibtex = [
+            f"@article{{{citation_key},",
+            f"  author = {{{' and '.join(authors)}}},",
+            f"  title = {{{paper.title}}},",
+            f"  year = {{{year}}},",
+            f"  eprint = {{{paper_id}}},",
+            f"  archivePrefix = {{arXiv}},"
+        ]
+        
+        # Add optional fields if available
+        if paper.doi:
+            bibtex.append(f"  doi = {{{paper.doi}}},")
+        if getattr(paper, 'journal_ref', None):
+            bibtex.append(f"  journal = {{{paper.journal_ref}}},")
+        if getattr(paper, 'primary_category', None):
+            bibtex.append(f"  primaryClass = {{{paper.primary_category}}},")
+            
+        # Add URL and close entry
+        bibtex.extend([
+            f"  url = {{https://arxiv.org/abs/{paper_id}}}",
+            "}"
+        ])
+        
+        return '\n'.join(bibtex)
+        
    except Exception as e:
-        logger.error(f"Error while generating BibTeX: {e}")
+        logger.error(f"Error generating BibTeX for {paper_id}: {e}")
        return ""

+def batch_download_papers(paper_ids, output_dir="downloads", get_source=False):
+    """
+    Download multiple papers in batch with progress tracking.
+
+    Args:
+        paper_ids (list): List of arXiv IDs to download
+        output_dir (str): Directory to save downloaded files (default: 'downloads')
+        get_source (bool): If True, downloads source files instead of PDFs (default: False)
+
+    Returns:
+        dict: Mapping of paper IDs to their download status and paths
+    """
+    results = {}
+    client = create_arxiv_client()
+
+    for paper_id in paper_ids:
+        try:
+            file_path = download_paper(paper_id, output_dir, get_source=get_source)
+            results[paper_id] = {
+                'success': bool(file_path),
+                'path': file_path,
+                'error': None
+            }
+        except Exception as e:
+            results[paper_id] = {
+                'success': False,
+                'path': None,
+                'error': str(e)
+            }
+            logger.error(f"Failed to download {paper_id}: {e}")
+
+    return results
+
+def batch_generate_bibtex(paper_ids):
+    """
+    Generate BibTeX entries for multiple papers.
+
+    Args:
+        paper_ids (list): List of arXiv IDs
+
+    Returns:
+        dict: Mapping of paper IDs to their BibTeX entries
+    """
+    results = {}
+    client = create_arxiv_client()
+
+    for paper_id in paper_ids:
+        try:
+            bibtex = generate_bibtex(paper_id, client)
+            results[paper_id] = {
+                'success': bool(bibtex),
+                'bibtex': bibtex,
+                'error': None
+            }
+        except Exception as e:
+            results[paper_id] = {
+                'success': False,
+                'bibtex': '',
+                'error': str(e)
+            }
+            logger.error(f"Failed to generate BibTeX for {paper_id}: {e}")
+
+    return results
+
 def extract_arxiv_ids_from_line(line):
    """
    Extract the arXiv ID from a given line of text.