Base code

Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions


@@ -0,0 +1,2 @@
1. Replace Firecrawl with Scrapy or Crawlee: https://crawlee.dev/python/docs/introduction
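A minimal Scrapy sketch of what the replacement crawler could look like (the spider name, seed URL, selectors, and limits below are placeholders, not part of the existing code):

import scrapy

class SiteSpider(scrapy.Spider):
    """Placeholder spider: crawl from a seed URL and yield title/body text per page."""
    name = "site_spider"                  # hypothetical spider name
    start_urls = ["https://example.com"]  # placeholder seed URL
    custom_settings = {"DEPTH_LIMIT": 2, "CLOSESPIDER_PAGECOUNT": 10}

    def parse(self, response):
        # Yield basic page data; the selectors here are illustrative only.
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
            "text": " ".join(response.css("p::text").getall()),
        }
        # Follow links to keep crawling within the configured depth/page limits.
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)

Run with, for example: scrapy runspider site_spider.py -o pages.json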


@@ -0,0 +1,980 @@
####################################################
#
# FIXME: Gotta use this lib: https://github.com/monk1337/resp/tree/main
# https://github.com/danielnsilva/semanticscholar
# https://github.com/shauryr/S2QA
#
####################################################
import os
import sys
import re
import pandas as pd
import arxiv
import PyPDF2
import requests
import networkx as nx
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from loguru import logger
from ..gpt_providers.text_generation.main_text_generation import llm_text_gen
import bibtexparser
from pylatexenc.latex2text import LatexNodes2Text
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import numpy as np
logger.remove()
logger.add(sys.stdout, colorize=True, format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}")
def create_arxiv_client(page_size=100, delay_seconds=3.0, num_retries=3):
"""
Creates a reusable arXiv API client with custom configuration.
Args:
page_size (int): Number of results per page (default: 100)
delay_seconds (float): Delay between API requests (default: 3.0)
num_retries (int): Number of retries for failed requests (default: 3)
Returns:
arxiv.Client: Configured arXiv API client
"""
try:
client = arxiv.Client(
page_size=page_size,
delay_seconds=delay_seconds,
num_retries=num_retries
)
return client
except Exception as e:
logger.error(f"Error creating arXiv client: {e}")
raise e
def expand_search_query(query, research_interests=None):
"""
Uses AI to expand the search query based on user's research interests.
Args:
query (str): Original search query
research_interests (list): List of user's research interests
Returns:
str: Expanded search query
"""
try:
interests_context = "\n".join(research_interests) if research_interests else ""
# Build the optional interests block outside the f-string, since backslashes are not
# allowed inside f-string expressions before Python 3.12.
interests_block = f"And considering these research interests:\n{interests_context}" if interests_context else ""
prompt = f"""Given the original arXiv search query: '{query}'
{interests_block}
Generate an expanded arXiv search query that:
1. Includes relevant synonyms and related concepts
2. Uses appropriate arXiv search operators (AND, OR, etc.)
3. Incorporates field-specific tags (ti:, abs:, au:, etc.)
4. Maintains focus on the core topic
Return only the expanded query without any explanation."""
expanded_query = llm_text_gen(prompt)
logger.info(f"Expanded query: {expanded_query}")
return expanded_query
except Exception as e:
logger.error(f"Error expanding search query: {e}")
return query
def analyze_citation_network(papers):
"""
Analyzes citation relationships between papers using DOIs and references.
Args:
papers (list): List of paper metadata dictionaries
Returns:
dict: Citation network analysis results
"""
try:
# Create a directed graph for citations
G = nx.DiGraph()
# Add nodes and edges
for paper in papers:
paper_id = paper['entry_id']
G.add_node(paper_id, title=paper['title'])
# Add edges based on DOIs and references
if paper['doi']:
for other_paper in papers:
if other_paper['doi'] and other_paper['doi'] in paper['summary']:
G.add_edge(paper_id, other_paper['entry_id'])
# Calculate network metrics
analysis = {
'influential_papers': sorted(nx.pagerank(G).items(), key=lambda x: x[1], reverse=True),
'citation_clusters': list(nx.connected_components(G.to_undirected())),
'citation_paths': dict(nx.all_pairs_shortest_path_length(G))
}
return analysis
except Exception as e:
logger.error(f"Error analyzing citation network: {e}")
return {}
def categorize_papers(papers):
"""
Uses AI to categorize papers based on their metadata and content.
Args:
papers (list): List of paper metadata dictionaries
Returns:
dict: Paper categorization results
"""
try:
categorized_papers = {}
for paper in papers:
prompt = f"""Analyze this research paper and provide detailed categorization:
Title: {paper['title']}
Abstract: {paper['summary']}
Primary Category: {paper['primary_category']}
Categories: {', '.join(paper['categories'])}
Provide a JSON response with these fields:
1. main_theme: Primary research theme
2. sub_themes: List of related sub-themes
3. methodology: Research methodology used
4. application_domains: Potential application areas
5. technical_complexity: Level (Basic/Intermediate/Advanced)"""
categorization = llm_text_gen(prompt)
categorized_papers[paper['entry_id']] = categorization
return categorized_papers
except Exception as e:
logger.error(f"Error categorizing papers: {e}")
return {}
def get_paper_recommendations(papers, research_interests):
"""
Generates personalized paper recommendations based on user's research interests.
Args:
papers (list): List of paper metadata dictionaries
research_interests (list): User's research interests
Returns:
dict: Personalized paper recommendations
"""
try:
interests_text = "\n".join(research_interests)
recommendations = {}
for paper in papers:
prompt = f"""Evaluate this paper's relevance to the user's research interests:
Paper:
- Title: {paper['title']}
- Abstract: {paper['summary']}
- Categories: {', '.join(paper['categories'])}
User's Research Interests:
{interests_text}
Provide a JSON response with:
1. relevance_score: 0-100
2. relevance_aspects: List of matching aspects
3. potential_value: How this paper could benefit the user's research"""
evaluation = llm_text_gen(prompt)
recommendations[paper['entry_id']] = evaluation
return recommendations
except Exception as e:
logger.error(f"Error generating paper recommendations: {e}")
return {}
def fetch_arxiv_data(query, max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate, sort_order=arxiv.SortOrder.Descending, client=None, research_interests=None):
"""
Fetches arXiv data based on a query with advanced search options.
Args:
query (str): The search query (supports advanced syntax, e.g., 'au:einstein AND cat:physics')
max_results (int): The maximum number of results to fetch
sort_by (arxiv.SortCriterion): Sorting criterion (default: SubmittedDate)
sort_order (arxiv.SortOrder): Sort order (default: arxiv.SortOrder.Descending)
client (arxiv.Client): Optional custom client (default: None, creates new client)
research_interests (list): Optional research interests used to expand the query via AI (default: None)
Returns:
list: A list of arXiv data with extended metadata
"""
try:
if client is None:
client = create_arxiv_client()
# Expand search query using AI if research interests are provided
expanded_query = expand_search_query(query, research_interests) if research_interests else query
logger.info(f"Using expanded query: {expanded_query}")
search = arxiv.Search(
query=expanded_query,
max_results=max_results,
sort_by=sort_by,
sort_order=sort_order
)
results = list(client.results(search))
all_data = [
{
'title': result.title,
'published': result.published,
'updated': result.updated,
'entry_id': result.entry_id,
'summary': result.summary,
'authors': [str(author) for author in result.authors],
'pdf_url': result.pdf_url,
'journal_ref': getattr(result, 'journal_ref', None),
'doi': getattr(result, 'doi', None),
'primary_category': getattr(result, 'primary_category', None),
'categories': getattr(result, 'categories', []),
'links': [link.href for link in getattr(result, 'links', [])]
}
for result in results
]
# Enhance results with AI-powered analysis
if all_data:
# Analyze citation network
citation_analysis = analyze_citation_network(all_data)
# Categorize papers using AI
paper_categories = categorize_papers(all_data)
# Generate recommendations if research interests are provided
recommendations = get_paper_recommendations(all_data, research_interests) if research_interests else {}
# Perform content analysis
content_analyses = [analyze_paper_content(paper['entry_id']) for paper in all_data]
trend_analysis = analyze_research_trends(all_data)
concept_mapping = map_cross_paper_concepts(all_data)
# Generate bibliography data
bibliography_data = {
'bibtex_entries': [generate_bibtex_entry(paper) for paper in all_data],
'citations': {
'apa': [convert_citation_format(generate_bibtex_entry(paper), 'apa') for paper in all_data],
'mla': [convert_citation_format(generate_bibtex_entry(paper), 'mla') for paper in all_data],
'chicago': [convert_citation_format(generate_bibtex_entry(paper), 'chicago') for paper in all_data]
},
'reference_graph': visualize_reference_graph(all_data),
'citation_impact': analyze_citation_impact(all_data)
}
# Add enhanced data to results
enhanced_data = {
'papers': all_data,
'citation_analysis': citation_analysis,
'paper_categories': paper_categories,
'recommendations': recommendations,
'content_analyses': content_analyses,
'trend_analysis': trend_analysis,
'concept_mapping': concept_mapping,
'bibliography': bibliography_data
}
return enhanced_data
return {'papers': all_data}
except Exception as e:
logger.error(f"An error occurred while fetching data from arXiv: {e}")
raise e
def create_dataframe(data, column_names):
"""
Creates a DataFrame from the provided data.
Args:
data (list): The data to convert to a DataFrame.
column_names (list): The column names for the DataFrame.
Returns:
DataFrame: The created DataFrame.
"""
try:
df = pd.DataFrame(data, columns=column_names)
return df
except Exception as e:
logger.error(f"An error occurred while creating DataFrame: {e}")
return pd.DataFrame()
def get_arxiv_main_content(url):
"""
Returns the main content of an arXiv paper.
Args:
url (str): The URL of the arXiv paper.
Returns:
str: The main content of the paper as a string.
"""
try:
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
main_content = soup.find('div', class_='ltx_page_content')
if not main_content:
logger.warning("Main content not found in the page.")
return "Main content not found."
alert_section = main_content.find('div', class_='package-alerts ltx_document')
if alert_section:
alert_section.decompose()
for element_id in ["abs", "authors"]:
element = main_content.find(id=element_id)
if element:
element.decompose()
return main_content.text.strip()
except Exception as html_error:
logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
return get_pdf_content(url)
def download_paper(paper_id, output_dir="downloads", filename=None, get_source=False):
"""
Downloads a paper's PDF or source files with enhanced error handling.
Args:
paper_id (str): The arXiv ID of the paper
output_dir (str): Directory to save the downloaded file (default: 'downloads')
filename (str): Custom filename (default: None, uses paper ID)
get_source (bool): If True, downloads source files instead of PDF (default: False)
Returns:
str: Path to the downloaded file or None if download fails
"""
try:
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Get paper metadata
client = create_arxiv_client()
paper = next(client.results(arxiv.Search(id_list=[paper_id])))
# Set filename if not provided
if not filename:
safe_title = re.sub(r'[^\w\-_.]', '_', paper.title[:50])
filename = f"{paper_id}_{safe_title}"
filename += ".tar.gz" if get_source else ".pdf"
# Full path for the downloaded file
file_path = os.path.join(output_dir, filename)
# Download the file
if get_source:
paper.download_source(dirpath=output_dir, filename=filename)
else:
paper.download_pdf(dirpath=output_dir, filename=filename)
logger.info(f"Successfully downloaded {'source' if get_source else 'PDF'} to {file_path}")
return file_path
except Exception as e:
logger.error(f"Error downloading {'source' if get_source else 'PDF'} for {paper_id}: {e}")
return None
def analyze_paper_content(url_or_id, cleanup=True):
"""
Analyzes paper content using AI to extract key information and insights.
Args:
url_or_id (str): The arXiv URL or ID of the paper
cleanup (bool): Whether to delete the PDF after extraction (default: True)
Returns:
dict: Analysis results including summary, key findings, and concepts
"""
try:
# Get paper content
content = get_pdf_content(url_or_id, cleanup)
if not content or 'Failed to' in content:
return {'error': content}
# Generate paper summary
summary_prompt = f"""Analyze this research paper and provide a comprehensive summary:
{content[:8000]} # Limit content length for API
Provide a JSON response with:
1. executive_summary: Brief overview (2-3 sentences)
2. key_findings: List of main research findings
3. methodology: Research methods used
4. implications: Practical implications of the research
5. limitations: Study limitations and constraints"""
summary_analysis = llm_text_gen(summary_prompt)
# Extract key concepts and relationships
concepts_prompt = f"""Analyze this research paper and identify key concepts and relationships:
{content[:8000]}
Provide a JSON response with:
1. main_concepts: List of key technical concepts
2. concept_relationships: How concepts are related
3. novel_contributions: New ideas or approaches introduced
4. technical_requirements: Required technologies or methods
5. future_directions: Suggested future research"""
concept_analysis = llm_text_gen(concepts_prompt)
return {
'summary_analysis': summary_analysis,
'concept_analysis': concept_analysis,
'full_text': content
}
except Exception as e:
logger.error(f"Error analyzing paper content: {e}")
return {'error': str(e)}
def analyze_research_trends(papers):
"""
Analyzes research trends across multiple papers.
Args:
papers (list): List of paper metadata and content
Returns:
dict: Trend analysis results
"""
try:
# Collect paper information
papers_info = []
for paper in papers:
content = get_pdf_content(paper['entry_id'], cleanup=True)
if content and 'Failed to' not in content:
papers_info.append({
'title': paper['title'],
'abstract': paper['summary'],
'content': content[:8000], # Limit content length
'year': paper['published'].year
})
if not papers_info:
return {'error': 'No valid paper content found for analysis'}
# Analyze trends
trends_prompt = f"""Analyze these research papers and identify key trends:
Papers:
{str(papers_info)}
Provide a JSON response with:
1. temporal_trends: How research focus evolved over time
2. emerging_themes: New and growing research areas
3. declining_themes: Decreasing research focus areas
4. methodology_trends: Evolution of research methods
5. technology_trends: Trends in technology usage
6. research_gaps: Identified gaps and opportunities"""
trend_analysis = llm_text_gen(trends_prompt)
return {'trend_analysis': trend_analysis}
except Exception as e:
logger.error(f"Error analyzing research trends: {e}")
return {'error': str(e)}
def map_cross_paper_concepts(papers):
"""
Maps concepts and relationships across multiple papers.
Args:
papers (list): List of paper metadata and content
Returns:
dict: Concept mapping results
"""
try:
# Analyze each paper
paper_analyses = []
for paper in papers:
analysis = analyze_paper_content(paper['entry_id'])
if 'error' not in analysis:
paper_analyses.append({
'paper_id': paper['entry_id'],
'title': paper['title'],
'analysis': analysis
})
if not paper_analyses:
return {'error': 'No valid paper analyses for concept mapping'}
# Generate cross-paper concept map
mapping_prompt = f"""Analyze relationships between concepts across these papers:
{str(paper_analyses)}
Provide a JSON response with:
1. shared_concepts: Concepts appearing in multiple papers
2. concept_evolution: How concepts developed across papers
3. conflicting_views: Different interpretations of same concepts
4. complementary_findings: How papers complement each other
5. knowledge_gaps: Areas needing more research"""
concept_mapping = llm_text_gen(mapping_prompt)
return {'concept_mapping': concept_mapping}
except Exception as e:
logger.error(f"Error mapping cross-paper concepts: {e}")
return {'error': str(e)}
def generate_bibtex_entry(paper):
"""
Generates a BibTeX entry for a paper with complete metadata.
Args:
paper (dict): Paper metadata dictionary
Returns:
str: BibTeX entry string
"""
try:
# Generate a unique citation key
first_author = paper['authors'][0].split()[-1] if paper['authors'] else 'Unknown'
year = paper['published'].year if paper['published'] else '0000'
citation_key = f"{first_author}{year}{paper['entry_id'].split('/')[-1]}"
# Format authors for BibTeX
authors = ' and '.join(paper['authors'])
# Create BibTeX entry
bibtex = f"@article{{{citation_key},\n"
bibtex += f" title = {{{paper['title']}}},\n"
bibtex += f" author = {{{authors}}},\n"
bibtex += f" year = {{{year}}},\n"
bibtex += f" journal = {{arXiv preprint}},\n"
bibtex += f" archivePrefix = {{arXiv}},\n"
bibtex += f" eprint = {{{paper['entry_id'].split('/')[-1]}}},\n"
if paper['doi']:
bibtex += f" doi = {{{paper['doi']}}},\n"
bibtex += f" url = {{{paper['entry_id']}}},\n"
bibtex += f" abstract = {{{paper['summary']}}}\n"
bibtex += "}"
return bibtex
except Exception as e:
logger.error(f"Error generating BibTeX entry: {e}")
return ""
def convert_citation_format(bibtex_str, target_format):
"""
Converts BibTeX citations to other formats and validates the output.
Args:
bibtex_str (str): BibTeX entry string
target_format (str): Target citation format ('apa', 'mla', 'chicago', etc.)
Returns:
str: Formatted citation string
"""
try:
# Parse BibTeX entry
bib_database = bibtexparser.loads(bibtex_str)
entry = bib_database.entries[0]
# Generate citation format prompt
prompt = f"""Convert this bibliographic information to {target_format} format:
Title: {entry.get('title', '')}
Authors: {entry.get('author', '')}
Year: {entry.get('year', '')}
Journal: {entry.get('journal', '')}
DOI: {entry.get('doi', '')}
URL: {entry.get('url', '')}
Return only the formatted citation without any explanation."""
# Use AI to generate formatted citation
formatted_citation = llm_text_gen(prompt)
return formatted_citation.strip()
except Exception as e:
logger.error(f"Error converting citation format: {e}")
return ""
def visualize_reference_graph(papers):
"""
Creates a visual representation of the citation network.
Args:
papers (list): List of paper metadata dictionaries
Returns:
str: Path to the saved visualization file
"""
try:
# Create directed graph
G = nx.DiGraph()
# Add nodes and edges
for paper in papers:
paper_id = paper['entry_id']
G.add_node(paper_id, title=paper['title'])
# Add citation edges
if paper['doi']:
for other_paper in papers:
if other_paper['doi'] and other_paper['doi'] in paper['summary']:
G.add_edge(paper_id, other_paper['entry_id'])
# Set up the visualization
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G)
# Draw the graph
nx.draw(G, pos, with_labels=False, node_color='lightblue',
node_size=1000, arrowsize=20)
# Add labels
labels = nx.get_node_attributes(G, 'title')
nx.draw_networkx_labels(G, pos, labels, font_size=8)
# Save the visualization
output_path = 'reference_graph.png'
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.close()
return output_path
except Exception as e:
logger.error(f"Error visualizing reference graph: {e}")
return ""
def analyze_citation_impact(papers):
"""
Analyzes citation impact and influence patterns.
Args:
papers (list): List of paper metadata dictionaries
Returns:
dict: Citation impact analysis results
"""
try:
# Create citation network
G = nx.DiGraph()
for paper in papers:
G.add_node(paper['entry_id'], **paper)
if paper['doi']:
for other_paper in papers:
if other_paper['doi'] and other_paper['doi'] in paper['summary']:
G.add_edge(paper['entry_id'], other_paper['entry_id'])
# Calculate impact metrics
# HITS yields per-node hub and authority scores for the citation graph
hub_scores, authority_scores = nx.hits(G)
impact_analysis = {
'citation_counts': dict(G.in_degree()),
'influence_scores': nx.pagerank(G),
'authority_scores': authority_scores,
'hub_scores': hub_scores,
'citation_paths': dict(nx.all_pairs_shortest_path_length(G))
}
# Add temporal analysis
year_citations = defaultdict(int)
for paper in papers:
if paper['published']:
year = paper['published'].year
year_citations[year] += G.in_degree(paper['entry_id'])
impact_analysis['temporal_trends'] = dict(year_citations)
return impact_analysis
except Exception as e:
logger.error(f"Error analyzing citation impact: {e}")
return {}
def get_pdf_content(url_or_id, cleanup=True):
"""
Extracts text content from a paper's PDF with improved error handling.
Args:
url_or_id (str): The arXiv URL or ID of the paper
cleanup (bool): Whether to delete the PDF after extraction (default: True)
Returns:
str: The extracted text content or error message
"""
try:
# Extract arxiv ID from URL if needed
arxiv_id = url_or_id.split('/')[-1] if '/' in url_or_id else url_or_id
# Download PDF
pdf_path = download_paper(arxiv_id)
if not pdf_path:
return "Failed to download PDF."
# Extract text from PDF
pdf_text = ''
with open(pdf_path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
for page_num, page in enumerate(pdf_reader.pages, 1):
try:
page_text = page.extract_text()
if page_text:
pdf_text += f"\n--- Page {page_num} ---\n{page_text}"
except Exception as err:
logger.error(f"Error extracting text from page {page_num}: {err}")
continue
# Clean up
if cleanup:
try:
os.remove(pdf_path)
logger.debug(f"Cleaned up temporary PDF file: {pdf_path}")
except Exception as e:
logger.warning(f"Failed to cleanup PDF file {pdf_path}: {e}")
# Process and return text
if not pdf_text.strip():
return "No text content could be extracted from the PDF."
return clean_pdf_text(pdf_text)
except Exception as e:
logger.error(f"Failed to process PDF: {e}")
return f"Failed to retrieve content: {str(e)}"
def clean_pdf_text(text):
"""
Helper function to clean the text extracted from a PDF.
Args:
text (str): The text to clean.
Returns:
str: The cleaned text.
"""
pattern = r'References\s*.*'
text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
sections_to_remove = ['Acknowledgements', 'References', 'Bibliography']
for section in sections_to_remove:
pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
return text
def download_image(image_url, base_url, folder="images"):
"""
Downloads an image from a URL.
Args:
image_url (str): The URL of the image.
base_url (str): The base URL of the website.
folder (str): The folder to save the image.
Returns:
bool: True if the image was downloaded successfully, False otherwise.
"""
if image_url.startswith('data:image'):
logger.info(f"Skipping download of data URI image: {image_url}")
return False
if not os.path.exists(folder):
os.makedirs(folder)
if not urlparse(image_url).scheme:
if not base_url.endswith('/'):
base_url += '/'
image_url = base_url + image_url
try:
response = requests.get(image_url)
response.raise_for_status()
image_name = image_url.split("/")[-1]
with open(os.path.join(folder, image_name), 'wb') as file:
file.write(response.content)
return True
except requests.RequestException as e:
logger.error(f"Error downloading {image_url}: {e}")
return False
def scrape_images_from_arxiv(url):
"""
Scrapes images from an arXiv page.
Args:
url (str): The URL of the arXiv page.
Returns:
list: A list of image URLs.
"""
try:
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
images = soup.find_all('img')
image_urls = [img['src'] for img in images if 'src' in img.attrs]
return image_urls
except requests.RequestException as e:
logger.error(f"Error fetching page {url}: {e}")
return []
def generate_bibtex(paper_id, client=None):
"""
Generate a BibTeX entry for an arXiv paper with enhanced metadata.
Args:
paper_id (str): The arXiv ID of the paper
client (arxiv.Client): Optional custom client (default: None)
Returns:
str: BibTeX entry as a string
"""
try:
if client is None:
client = create_arxiv_client()
# Fetch paper metadata
paper = next(client.results(arxiv.Search(id_list=[paper_id])))
# Extract author information
authors = [str(author) for author in paper.authors]
first_author = authors[0].split(', ')[0] if authors else 'Unknown'
# Format year
year = paper.published.year if paper.published else 'Unknown'
# Create citation key
citation_key = f"{first_author}{str(year)[-2:]}"
# Build BibTeX entry
bibtex = [
f"@article{{{citation_key},",
f" author = {{{' and '.join(authors)}}},",
f" title = {{{paper.title}}},",
f" year = {{{year}}},",
f" eprint = {{{paper_id}}},",
f" archivePrefix = {{arXiv}},"
]
# Add optional fields if available
if paper.doi:
bibtex.append(f" doi = {{{paper.doi}}},")
if getattr(paper, 'journal_ref', None):
bibtex.append(f" journal = {{{paper.journal_ref}}},")
if getattr(paper, 'primary_category', None):
bibtex.append(f" primaryClass = {{{paper.primary_category}}},")
# Add URL and close entry
bibtex.extend([
f" url = {{https://arxiv.org/abs/{paper_id}}}",
"}"
])
return '\n'.join(bibtex)
except Exception as e:
logger.error(f"Error generating BibTeX for {paper_id}: {e}")
return ""
def batch_download_papers(paper_ids, output_dir="downloads", get_source=False):
"""
Download multiple papers in batch with progress tracking.
Args:
paper_ids (list): List of arXiv IDs to download
output_dir (str): Directory to save downloaded files (default: 'downloads')
get_source (bool): If True, downloads source files instead of PDFs (default: False)
Returns:
dict: Mapping of paper IDs to their download status and paths
"""
results = {}
client = create_arxiv_client()
for paper_id in paper_ids:
try:
file_path = download_paper(paper_id, output_dir, get_source=get_source)
results[paper_id] = {
'success': bool(file_path),
'path': file_path,
'error': None
}
except Exception as e:
results[paper_id] = {
'success': False,
'path': None,
'error': str(e)
}
logger.error(f"Failed to download {paper_id}: {e}")
return results
def batch_generate_bibtex(paper_ids):
"""
Generate BibTeX entries for multiple papers.
Args:
paper_ids (list): List of arXiv IDs
Returns:
dict: Mapping of paper IDs to their BibTeX entries
"""
results = {}
client = create_arxiv_client()
for paper_id in paper_ids:
try:
bibtex = generate_bibtex(paper_id, client)
results[paper_id] = {
'success': bool(bibtex),
'bibtex': bibtex,
'error': None
}
except Exception as e:
results[paper_id] = {
'success': False,
'bibtex': '',
'error': str(e)
}
logger.error(f"Failed to generate BibTeX for {paper_id}: {e}")
return results
def extract_arxiv_ids_from_line(line):
"""
Extract the arXiv ID from a given line of text.
Args:
line (str): A line of text potentially containing an arXiv URL.
Returns:
str: The extracted arXiv ID, or None if not found.
"""
arxiv_id_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?')
match = arxiv_id_pattern.search(line)
if match:
return match.group(1) + (match.group(2) if match.group(2) else '')
return None
def read_written_ids(file_path):
"""
Read already written arXiv IDs from a file.
Args:
file_path (str): Path to the file containing written IDs.
Returns:
set: A set of arXiv IDs.
"""
written_ids = set()
try:
with open(file_path, 'r', encoding="utf-8") as file:
for line in file:
written_ids.add(line.strip())
except FileNotFoundError:
logger.error(f"File not found: {file_path}")
except Exception as e:
logger.error(f"Error while reading the file: {e}")
return written_ids
def append_id_to_file(arxiv_id, output_file_path):
"""
Append a single arXiv ID to a file. Checks if the file exists and creates it if not.
Args:
arxiv_id (str): The arXiv ID to append.
output_file_path (str): Path to the output file.
"""
try:
if not os.path.exists(output_file_path):
logger.info(f"File does not exist. Creating new file: {output_file_path}")
with open(output_file_path, 'a', encoding="utf-8") as outfile:
outfile.write(arxiv_id + '\n')
else:
logger.info(f"Appending to existing file: {output_file_path}")
with open(output_file_path, 'a', encoding="utf-8") as outfile:
outfile.write(arxiv_id + '\n')
except Exception as e:
logger.error(f"Error while appending to file: {e}")
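# Illustrative usage sketch (not part of the original code). Because of the relative
# llm_text_gen import above, run it as a module (python -m <package>.<this_module>);
# the arXiv ID below is a hypothetical placeholder chosen only for illustration.
if __name__ == "__main__":
    demo_ids = ["2301.00001"]  # hypothetical arXiv ID
    for demo_id, entry in batch_generate_bibtex(demo_ids).items():
        print(f"--- {demo_id} ---")
        print(entry["bibtex"] if entry["success"] else entry["error"])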


@@ -0,0 +1,100 @@
# Common utils for web_researcher
import os
import sys
import re
import json
from pathlib import Path
from datetime import datetime, timedelta
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def cfg_search_param(flag):
"""
Read values from the main_config.json file and return them as variables and a dictionary.
Args:
flag (str): A flag to determine which configuration values to return.
Returns:
various: The values read from the config file based on the flag.
"""
try:
file_path = Path(os.environ.get("ALWRITY_CONFIG", ""))
if not file_path.is_file():
raise FileNotFoundError(f"Configuration file not found: {file_path}")
logger.info(f"Reading search config params from {file_path}")
with open(file_path, 'r', encoding='utf-8') as file:
config = json.load(file)
web_research_section = config["Search Engine Parameters"]
if 'serperdev' in flag:
# Get values as variables
geo_location = web_research_section.get("Geographic Location")
search_language = web_research_section.get("Search Language")
num_results = web_research_section.get("Number of Results")
return geo_location, search_language, num_results
elif 'tavily' in flag:
include_urls = web_research_section.get("Include Domains")
pattern = re.compile(r"^(https?://[^\s,]+)(,\s*https?://[^\s,]+)*$")
if include_urls and pattern.match(include_urls):
include_urls = [url.strip() for url in include_urls.split(',')]
else:
include_urls = None
return include_urls
elif 'exa' in flag:
include_urls = web_research_section.get("Include Domains")
pattern = re.compile(r"^(https?://\w+)(,\s*https?://\w+)*$")
if include_urls and pattern.match(include_urls) is not None:
include_urls = include_urls.split(',')
elif include_urls and re.match(r"^https?://\w+$", include_urls) is not None:
include_urls = include_urls.split(" ")
else:
include_urls = None
num_results = web_research_section.get("Number of Results")
similar_url = web_research_section.get("Similar URL")
time_range = web_research_section.get("Time Range")
if time_range == "past day":
start_published_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
elif time_range == "past week":
start_published_date = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d")
elif time_range == "past month":
start_published_date = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
elif time_range == "past year":
start_published_date = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')
else:
# "anytime", empty, or unrecognized values mean no start-date filter
start_published_date = None
time_range = start_published_date
return include_urls, time_range, num_results, similar_url
except FileNotFoundError:
logger.error(f"Error: Config file '{file_path}' not found.")
return {}, None, None, None
except KeyError as e:
logger.error(f"Error: Missing section or option in config file: {e}")
return {}, None, None, None
except ValueError as e:
logger.error(f"Error: Invalid value in config file: {e}")
return {}, None, None, None
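# Illustrative main_config.json fragment that cfg_search_param expects; the key names mirror
# the lookups above, the values are placeholders only:
# {
#   "Search Engine Parameters": {
#     "Geographic Location": "us",
#     "Search Language": "en",
#     "Number of Results": 10,
#     "Include Domains": "https://example.com, https://example.org",
#     "Similar URL": "",
#     "Time Range": "past month"
#   }
# }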
def save_in_file(table_content):
""" Helper function to save search analysis in a file. """
file_path = os.environ.get('SEARCH_SAVE_FILE')
try:
# Save the content to the file
with open(file_path, "a+", encoding="utf-8") as file:
file.write(table_content)
file.write("\n" * 3) # Add three newlines at the end
logger.info(f"Search content saved to {file_path}")
return file_path
except Exception as e:
logger.error(f"Error occurred while writing to the file: {e}")


@@ -0,0 +1,256 @@
import matplotlib.pyplot as plt
import pandas as pd
import yfinance as yf
import pandas_ta as ta
from yahoo_fin import options, stock_info  # used by the options-analysis helpers below
import matplotlib.dates as mdates
from datetime import datetime, timedelta
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def calculate_technical_indicators(data: pd.DataFrame) -> pd.DataFrame:
"""
Calculates a suite of technical indicators using pandas_ta.
Args:
data (pd.DataFrame): DataFrame containing historical stock price data.
Returns:
pd.DataFrame: DataFrame with added technical indicators.
"""
try:
# Moving Averages
data.ta.macd(append=True)
data.ta.sma(length=20, append=True)
data.ta.ema(length=50, append=True)
# Momentum Indicators
data.ta.rsi(append=True)
data.ta.stoch(append=True)
# Volatility Indicators
data.ta.bbands(append=True)
data.ta.adx(append=True)
# Other Indicators
data.ta.obv(append=True)
data.ta.willr(append=True)
data.ta.cmf(append=True)
data.ta.psar(append=True)
# Custom Calculations
data['OBV_in_million'] = data['OBV'] / 1e6
data['MACD_histogram_12_26_9'] = data['MACDh_12_26_9']
logging.info("Technical indicators calculated successfully.")
return data
except KeyError as e:
logging.error(f"Missing key in data: {e}")
except ValueError as e:
logging.error(f"Value error: {e}")
except Exception as e:
logging.error(f"Error during technical indicator calculation: {e}")
return None
def get_last_day_summary(data: pd.DataFrame) -> pd.Series:
"""
Extracts and summarizes technical indicators for the last trading day.
Args:
data (pd.DataFrame): DataFrame with calculated technical indicators.
Returns:
pd.Series: Summary of technical indicators for the last day.
"""
try:
last_day_summary = data.iloc[-1][[
'Adj Close', 'MACD_12_26_9', 'MACD_histogram_12_26_9', 'RSI_14',
'BBL_5_2.0', 'BBM_5_2.0', 'BBU_5_2.0', 'SMA_20', 'EMA_50',
'OBV_in_million', 'STOCHk_14_3_3', 'STOCHd_14_3_3', 'ADX_14',
'WILLR_14', 'CMF_20', 'PSARl_0.02_0.2', 'PSARs_0.02_0.2'
]]
logging.info("Last day summary extracted.")
return last_day_summary
except KeyError as e:
logging.error(f"Missing columns in data: {e}")
except Exception as e:
logging.error(f"Error extracting last day summary: {e}")
return None
def analyze_stock(ticker_symbol: str, start_date: datetime, end_date: datetime) -> pd.Series:
"""
Fetches stock data, calculates technical indicators, and provides a summary.
Args:
ticker_symbol (str): The stock symbol.
start_date (datetime): Start date for data retrieval.
end_date (datetime): End date for data retrieval.
Returns:
pd.Series: Summary of technical indicators for the last day.
"""
try:
# Fetch stock data
stock_data = yf.download(ticker_symbol, start=start_date, end=end_date, auto_adjust=False)  # keep the 'Adj Close' column used in the summary
logging.info(f"Stock data fetched for {ticker_symbol} from {start_date} to {end_date}")
# Calculate technical indicators
stock_data = calculate_technical_indicators(stock_data)
# Get last day summary
if stock_data is not None:
last_day_summary = get_last_day_summary(stock_data)
if last_day_summary is not None:
print("Summary of Technical Indicators for the Last Day:")
print(last_day_summary)
return last_day_summary
else:
logging.error("Stock data is None, unable to calculate indicators.")
except Exception as e:
logging.error(f"Error during analysis: {e}")
return None
def get_finance_data(symbol: str) -> pd.Series:
"""
Fetches financial data for a given stock symbol.
Args:
symbol (str): The stock symbol.
Returns:
pd.Series: Summary of technical indicators for the last day.
"""
end_date = datetime.today()
start_date = end_date - timedelta(days=120)
# Perform analysis
last_day_summary = analyze_stock(symbol, start_date, end_date)
return last_day_summary
def analyze_options_data(ticker: str, expiry_date: str) -> tuple:
"""
Analyzes option data for a given ticker and expiry date.
Args:
ticker (str): The stock ticker symbol.
expiry_date (str): The option expiry date.
Returns:
tuple: A tuple containing calculated metrics for call and put options.
"""
call_df = options.get_calls(ticker, expiry_date)
put_df = options.get_puts(ticker, expiry_date)
# Implied Volatility Analysis:
avg_call_iv = call_df["Implied Volatility"].str.rstrip("%").astype(float).mean()
avg_put_iv = put_df["Implied Volatility"].str.rstrip("%").astype(float).mean()
logging.info(f"Average Implied Volatility for Call Options: {avg_call_iv}%")
logging.info(f"Average Implied Volatility for Put Options: {avg_put_iv}%")
# Option Prices Analysis:
avg_call_last_price = call_df["Last Price"].mean()
avg_put_last_price = put_df["Last Price"].mean()
logging.info(f"Average Last Price for Call Options: {avg_call_last_price}")
logging.info(f"Average Last Price for Put Options: {avg_put_last_price}")
# Strike Price Analysis:
min_call_strike = call_df["Strike"].min()
max_call_strike = call_df["Strike"].max()
min_put_strike = put_df["Strike"].min()
max_put_strike = put_df["Strike"].max()
logging.info(f"Minimum Strike Price for Call Options: {min_call_strike}")
logging.info(f"Maximum Strike Price for Call Options: {max_call_strike}")
logging.info(f"Minimum Strike Price for Put Options: {min_put_strike}")
logging.info(f"Maximum Strike Price for Put Options: {max_put_strike}")
# Volume Analysis:
total_call_volume = call_df["Volume"].str.replace('-', '0').astype(float).sum()
total_put_volume = put_df["Volume"].str.replace('-', '0').astype(float).sum()
logging.info(f"Total Volume for Call Options: {total_call_volume}")
logging.info(f"Total Volume for Put Options: {total_put_volume}")
# Open Interest Analysis:
call_df['Open Interest'] = call_df['Open Interest'].str.replace('-', '0').astype(float)
put_df['Open Interest'] = put_df['Open Interest'].str.replace('-', '0').astype(float)
total_call_open_interest = call_df["Open Interest"].sum()
total_put_open_interest = put_df["Open Interest"].sum()
logging.info(f"Total Open Interest for Call Options: {total_call_open_interest}")
logging.info(f"Total Open Interest for Put Options: {total_put_open_interest}")
# Convert Implied Volatility to float
call_df['Implied Volatility'] = call_df['Implied Volatility'].str.replace('%', '').astype(float)
put_df['Implied Volatility'] = put_df['Implied Volatility'].str.replace('%', '').astype(float)
# Calculate Put-Call Ratio
put_call_ratio = total_put_volume / total_call_volume
logging.info(f"Put-Call Ratio: {put_call_ratio}")
# Calculate Implied Volatility Percentile
call_iv_percentile = (call_df['Implied Volatility'] > call_df['Implied Volatility'].mean()).mean() * 100
put_iv_percentile = (put_df['Implied Volatility'] > put_df['Implied Volatility'].mean()).mean() * 100
logging.info(f"Call Option Implied Volatility Percentile: {call_iv_percentile}")
logging.info(f"Put Option Implied Volatility Percentile: {put_iv_percentile}")
# Calculate Implied Volatility Skew
implied_vol_skew = call_df['Implied Volatility'].mean() - put_df['Implied Volatility'].mean()
logging.info(f"Implied Volatility Skew: {implied_vol_skew}")
# Determine market sentiment
is_bullish_sentiment = call_df['Implied Volatility'].mean() > put_df['Implied Volatility'].mean()
sentiment = "bullish" if is_bullish_sentiment else "bearish"
logging.info(f"The overall sentiment of {ticker} is {sentiment}.")
return (avg_call_iv, avg_put_iv, avg_call_last_price, avg_put_last_price,
min_call_strike, max_call_strike, min_put_strike, max_put_strike,
total_call_volume, total_put_volume, total_call_open_interest, total_put_open_interest,
put_call_ratio, call_iv_percentile, put_iv_percentile, implied_vol_skew, sentiment)
def get_fin_options_data(ticker: str) -> list:
"""
Fetches and analyzes options data for a given stock ticker.
Args:
ticker (str): The stock ticker symbol.
Returns:
list: A list of sentences summarizing the options data.
"""
current_price = round(stock_info.get_live_price(ticker), 3)
option_expiry_dates = options.get_expiration_dates(ticker)
nearest_expiry = option_expiry_dates[0]
results = analyze_options_data(ticker, nearest_expiry)
# Unpack the results tuple
(avg_call_iv, avg_put_iv, avg_call_last_price, avg_put_last_price,
min_call_strike, max_call_strike, min_put_strike, max_put_strike,
total_call_volume, total_put_volume, total_call_open_interest, total_put_open_interest,
put_call_ratio, call_iv_percentile, put_iv_percentile, implied_vol_skew, sentiment) = results
# Create a list of complete sentences with the results
results_sentences = [
f"Average Implied Volatility for Call Options: {avg_call_iv}%",
f"Average Implied Volatility for Put Options: {avg_put_iv}%",
f"Average Last Price for Call Options: {avg_call_last_price}",
f"Average Last Price for Put Options: {avg_put_last_price}",
f"Minimum Strike Price for Call Options: {min_call_strike}",
f"Maximum Strike Price for Call Options: {max_call_strike}",
f"Minimum Strike Price for Put Options: {min_put_strike}",
f"Maximum Strike Price for Put Options: {max_put_strike}",
f"Total Volume for Call Options: {total_call_volume}",
f"Total Volume for Put Options: {total_put_volume}",
f"Total Open Interest for Call Options: {total_call_open_interest}",
f"Total Open Interest for Put Options: {total_put_open_interest}",
f"Put-Call Ratio: {put_call_ratio}",
f"Call Option Implied Volatility Percentile: {call_iv_percentile}",
f"Put Option Implied Volatility Percentile: {put_iv_percentile}",
f"Implied Volatility Skew: {implied_vol_skew}",
f"The overall sentiment of {ticker} is {sentiment}."
]
# Print each sentence
for sentence in results_sentences:
logging.info(sentence)
return results_sentences
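# Illustrative usage sketch (not part of the original code); "MSFT" is only a sample ticker
# and both helpers call live Yahoo Finance endpoints.
if __name__ == "__main__":
    print(get_finance_data("MSFT"))
    for sentence in get_fin_options_data("MSFT"):
        print(sentence)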


@@ -0,0 +1,96 @@
import os
from pathlib import Path
from firecrawl import FirecrawlApp
import logging
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv(Path('../../.env'))
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def initialize_client() -> FirecrawlApp:
"""
Initialize and return a Firecrawl client.
Returns:
FirecrawlApp: An instance of the Firecrawl client.
"""
return FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
def scrape_website(website_url: str, depth: int = 1, max_pages: int = 10) -> dict:
"""
Scrape a website starting from the given URL.
Args:
website_url (str): The URL of the website to scrape.
depth (int, optional): The depth of crawling. Default is 1.
max_pages (int, optional): The maximum number of pages to scrape. Default is 10.
Returns:
dict: The result of the website scraping, or None if an error occurred.
"""
client = initialize_client()
try:
# Firecrawl's crawl_url takes the URL as its first argument; the option names below follow
# the original call and may need mapping onto the current Firecrawl crawl parameters.
result = client.crawl_url(website_url, params={
'depth': depth,
'max_pages': max_pages
})
return result
except KeyError as e:
logging.error(f"Missing key in data: {e}")
except ValueError as e:
logging.error(f"Value error: {e}")
except Exception as e:
logging.error(f"Error scraping website: {e}")
return None
def scrape_url(url: str) -> dict:
"""
Scrape a specific URL.
Args:
url (str): The URL to scrape.
Returns:
dict: The result of the URL scraping, or None if an error occurred.
"""
client = initialize_client()
try:
result = client.scrape_url(url)
return result
except KeyError as e:
logging.error(f"Missing key in data: {e}")
except ValueError as e:
logging.error(f"Value error: {e}")
except Exception as e:
logging.error(f"Error scraping URL: {e}")
return None
def extract_data(url: str, schema: dict) -> dict:
"""
Extract structured data from a URL using the provided schema.
Args:
url (str): The URL to extract data from.
schema (dict): The schema to use for data extraction.
Returns:
dict: The extracted data, or None if an error occurred.
"""
client = initialize_client()
try:
result = client.extract({
'url': url,
'schema': schema
})
return result
except KeyError as e:
logging.error(f"Missing key in data: {e}")
except ValueError as e:
logging.error(f"Value error: {e}")
except Exception as e:
logging.error(f"Error extracting data: {e}")
return None
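# Illustrative usage sketch (not part of the original code); requires FIRECRAWL_API_KEY in the
# environment, and the URL and schema below are placeholders only.
if __name__ == "__main__":
    demo_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    print(scrape_url("https://example.com"))
    print(extract_data("https://example.com", demo_schema))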


@@ -0,0 +1,339 @@
"""
This Python script performs Google searches using various services such as SerpApi, Serper.dev, and more. It displays the search results, including organic results, People Also Ask, and Related Searches, in formatted tables. The script also utilizes GPT to generate titles and FAQs for the Google search results.
Features:
- Utilizes SerpApi, Serper.dev, and other services for Google searches.
- Displays organic search results, including position, title, link, and snippet.
- Presents People Also Ask questions and snippets in a formatted table.
- Includes Related Searches in the combined table with People Also Ask.
- Configures logging with Loguru for informative messages.
- Uses Rich and Tabulate for visually appealing and formatted tables.
Usage:
- Ensure the necessary API keys are set in the .env file.
- Run the script to perform a Google search with the specified query.
- View the displayed tables with organic results, People Also Ask, and Related Searches.
- Additional information, such as generated titles and FAQs using GPT, is presented.
Modifications:
- Update the environment variables in the .env file with the required API keys.
- Customize the search parameters, such as location and language, in the functions as needed.
- Adjust logging configurations, table formatting, and other aspects based on preferences.
"""
import os
from pathlib import Path
import sys
import configparser
import pandas as pd
import json
import requests
from clint.textui import progress
import streamlit as st
#from serpapi import GoogleSearch
from loguru import logger
from tabulate import tabulate
#from GoogleNews import GoogleNews
# Configure logger
logger.remove()
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv(Path('../../.env'))
logger.add(
sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from .common_utils import save_in_file, cfg_search_param
from tenacity import retry, stop_after_attempt, wait_random_exponential
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def google_search(query):
"""
Perform a Google search for the given query.
Args:
query (str): The search query.
Returns:
dict: The Serper.dev search results, or None if the search fails.
"""
#try:
# perform_serpapi_google_search(query)
# logger.info(f"FIXME: Google serapi: {query}")
# #return process_search_results(search_result)
#except Exception as err:
# logger.error(f"ERROR: Check Here: https://serpapi.com/. Your requests may be over. {err}")
# Retry with serper.dev
try:
logger.info("Trying Google search with Serper.dev: https://serper.dev/api-key")
search_result = perform_serperdev_google_search(query)
if search_result:
process_search_results(search_result)
return search_result
except Exception as err:
logger.error(f"Failed Google search with serper.dev: {err}")
return None
# # Retry with BROWSERLESS API
# try:
# search_result = perform_browserless_google_search(query)
# #return process_search_results(search_result, flag)
# except Exception as err:
# logger.error("FIXME: Failed to do Google search with BROWSERLESS API.")
# logger.debug("FIXME: Trying with dataforSEO API.")
def perform_serpapi_google_search(query):
"""
Perform a Google search using the SerpApi service.
Args:
query (str): The search query.
location (str, optional): The location for the search (default is "Austin, Texas").
api_key (str, optional): Your secret API key for SerpApi.
Returns:
dict: A dictionary containing the search results.
"""
try:
logger.info("Reading Web search config values from main_config")
geo_location, search_language, num_results, time_range, include_domains, similar_url = read_return_config_section('web_research')
except Exception as err:
logger.error(f"Failed to read web research params: {err}")
return
try:
# Check if API key is provided
if not os.getenv("SERPAPI_KEY"):
#raise ValueError("SERPAPI_KEY key is required for SerpApi")
logger.error("SERPAPI_KEY key is required for SerpApi")
return
# Create a GoogleSearch instance
search = GoogleSearch({
"q": query,
"location": geo_location,
"api_key": os.getenv("SERPAPI_KEY")
})
# Get search results as a dictionary
result = search.get_dict()
return result
except ValueError as ve:
# Handle missing API key error
logger.info(f"SERPAPI ValueError: {ve}")
except Exception as e:
# Handle other exceptions
logger.info(f"SERPAPI An error occurred: {e}")
def perform_serperdev_google_search(query):
"""
Perform a Google search using the Serper API.
Args:
query (str): The search query.
Returns:
dict: The JSON response from the Serper API.
"""
# Get the Serper API key from environment variables
logger.info("Doing serper.dev google search.")
serper_api_key = os.getenv('SERPER_API_KEY')
# Check if the API key is available
if not serper_api_key:
raise ValueError("SERPER_API_KEY is missing. Set it in the .env file.")
# Serper API endpoint URL
url = "https://google.serper.dev/search"
try:
geo_loc, lang, num_results = cfg_search_param('serperdev')
except Exception as err:
logger.error(f"Failed to read config {err}")
return None
# Build payload as end user or main_config
payload = json.dumps({
"q": query,
"gl": geo_loc,
"hl": lang,
"num": num_results,
"autocorrect": True,
})
# Request headers with API key
headers = {
'X-API-KEY': serper_api_key,
'Content-Type': 'application/json'
}
# Send a POST request to the Serper API with progress bar
with progress.Bar(label="Searching", expected_size=100) as bar:
response = requests.post(url, headers=headers, data=payload, stream=True)
# Check if the request was successful
if response.status_code == 200:
# Parse and return the JSON response
return response.json()
else:
# Print an error message if the request fails
logger.error(f"Error: {response.status_code}, {response.text}")
return None
def perform_serper_news_search(news_keywords, news_country, news_language):
""" Function for Serper.dev News google search """
# Get the Serper API key from environment variables
logger.info(f"Doing serper.dev google search. {news_keywords} - {news_country} - {news_language}")
serper_api_key = os.getenv('SERPER_API_KEY')
# Check if the API key is available
if not serper_api_key:
raise ValueError("SERPER_API_KEY is missing. Set it in the .env file.")
# Serper API endpoint URL
url = "https://google.serper.dev/news"
payload = json.dumps({
"q": news_keywords,
"gl": news_country,
"hl": news_language,
})
# Request headers with API key
headers = {
'X-API-KEY': serper_api_key,
'Content-Type': 'application/json'
}
# Send a POST request to the Serper API with progress bar
with progress.Bar(label="Searching News", expected_size=100) as bar:
response = requests.post(url, headers=headers, data=payload, stream=True)
# Check if the request was successful
if response.status_code == 200:
# Parse and return the JSON response
#process_search_results(response, "news")
return response.json()
else:
# Print an error message if the request fails
logger.error(f"Error: {response.status_code}, {response.text}")
return None
def perform_browserless_google_search():
return
def perform_dataforseo_google_search():
return
def google_news(search_keywords, news_period="7d", region="IN"):
""" Get news articles from google_news"""
# Local import, since the module-level GoogleNews import is commented out above.
from GoogleNews import GoogleNews
googlenews = GoogleNews(lang='en', region=region, period=news_period)
googlenews.enableException(True)
print(googlenews.get_news(search_keywords))
print(googlenews.search(search_keywords))
def process_search_results(search_results, search_type="general"):
"""
Create a Pandas DataFrame from the search results.
Args:
search_results (dict): The search results JSON.
Returns:
pd.DataFrame: Pandas DataFrame containing the search results.
"""
data = []
logger.info(f"Google Search Parameters: {search_results.get('searchParameters', {})}")
if 'news' in search_type:
organic_results = search_results.get("news", [])
else:
organic_results = search_results.get("organic", [])
# Displaying Organic Results
organic_data = []
for result in organic_results:
position = result.get("position", "")
title = result.get("title", "")
link = result.get("link", "")
snippet = result.get("snippet", "")
organic_data.append([position, title, link, snippet])
organic_headers = ["Rank", "Title", "Link", "Snippet"]
organic_table = tabulate(organic_data,
headers=organic_headers,
tablefmt="fancy_grid",
colalign=["center", "left", "left", "left"],
maxcolwidths=[5, 25, 35, 50])
# Print the tables
print("\n\n📢❗🚨 Google search Organic Results:")
print(organic_table)
# Displaying People Also Ask and Related Searches combined
combined_data = []
try:
people_also_ask_data = []
if "peopleAlsoAsk" in search_results:
for question in search_results["peopleAlsoAsk"]:
title = question.get("title", "")
snippet = question.get("snippet", "")
link = question.get("link", "")
people_also_ask_data.append([title, snippet, link])
except Exception as people_also_ask_err:
logger.error(f"Error processing 'peopleAlsoAsk': {people_also_ask_err}")
people_also_ask_data = []
related_searches_data = []
for query in search_results.get("relatedSearches", []):
related_searches_data.append([query.get("query", "")])
related_searches_headers = ["Related Search"]
if people_also_ask_data:
# Add Related Searches as a column to People Also Ask
combined_data = [
row + [related_searches_data[i][0] if i < len(related_searches_data) else ""]
for i, row in enumerate(people_also_ask_data)
]
combined_headers = ["Question", "Snippet", "Link", "Related Search"]
# Display the combined table
combined_table = tabulate(
combined_data,
headers=combined_headers,
tablefmt="fancy_grid",
colalign=["left", "left", "left", "left"],
maxcolwidths=[20, 50, 20, 30]
)
else:
combined_table = tabulate(
related_searches_data,
headers=related_searches_headers,
tablefmt="fancy_grid",
colalign=["left"],
maxcolwidths=[60]
)
print("\n\n📢❗🚨 People Also Ask & Related Searches:")
print(combined_table)
# Save the combined table to a file
try:
# Display on Alwrity UI
st.write(organic_table)
st.write(combined_table)
save_in_file(organic_table)
save_in_file(combined_table)
except Exception as save_results_err:
logger.error(f"Failed to save search results: {save_results_err}")
return search_results
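# Illustrative usage sketch (not part of the original code); requires SERPER_API_KEY in the .env
# file plus the ALWRITY_CONFIG / SEARCH_SAVE_FILE environment variables used by common_utils,
# and the query string below is a placeholder.
if __name__ == "__main__":
    demo_results = google_search("open source content writing tools")
    if demo_results:
        print(f"Fetched {len(demo_results.get('organic', []))} organic results.")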


@@ -0,0 +1,500 @@
"""
This Python script analyzes Google search keywords by fetching auto-suggestions, performing keyword clustering, and visualizing Google Trends data. It uses various libraries such as pytrends, requests_html, tqdm, and more.
Features:
- Fetches auto-suggestions for a given search keyword from Google.
- Performs keyword clustering using K-means algorithm based on TF-IDF vectors.
- Visualizes Google Trends data, including interest over time and interest by region.
- Retrieves related queries and topics for a set of search keywords.
- Utilizes visualization libraries such as Matplotlib, Plotly, and Rich for displaying results.
- Incorporates Loguru logging for error handling and informative messages.
Usage:
- Provide a search term or a list of search terms for analysis.
- Run the script to fetch auto-suggestions, perform clustering, and visualize Google Trends data.
- Explore the displayed results, including top keywords in each cluster and related topics.
Modifications:
- Customize the search terms in the 'do_google_trends_analysis' function.
- Adjust the number of clusters for keyword clustering and other parameters as needed.
- Explore further visualizations and analyses based on the generated data.
Note: Ensure that the required libraries are installed using 'pip install pytrends requests_html tqdm tabulate plotly rich'.
"""
import os
import time # I wish
import random
import requests
import numpy as np
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, silhouette_samples
from rich.console import Console
from rich.progress import Progress
import urllib
import json
import pandas as pd
import plotly.express as px
import plotly.io as pio
from requests_html import HTML, HTMLSession
from urllib.parse import quote_plus
from tqdm import tqdm
from tabulate import tabulate
from pytrends.request import TrendReq
from loguru import logger
# Configure logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def fetch_google_trends_interest_overtime(keyword):
try:
pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload([keyword], timeframe='today 1-y', geo='US')
# 1. Interest Over Time
data = pytrends.interest_over_time()
data = data.reset_index()
# Visualization using Matplotlib
plt.figure(figsize=(10, 6))
plt.plot(data['date'], data[keyword], label=keyword)
plt.title(f'Interest Over Time for "{keyword}"')
plt.xlabel('Date')
plt.ylabel('Interest')
plt.legend()
plt.show()
return data
except Exception as e:
logger.error(f"Error in fetch_google_trends_data: {e}")
return pd.DataFrame()
def plot_interest_by_region(kw_list):
try:
from pytrends.request import TrendReq
import matplotlib.pyplot as plt
trends = TrendReq()
trends.build_payload(kw_list=kw_list)
kw_list = ' '.join(kw_list)
data = trends.interest_by_region()
data = data.sort_values(by=f"{kw_list}", ascending=False)  # sort regions by interest, descending
print("\n📢❗🚨 ")
print(f"Top 10 regions with highest interest for keyword: {kw_list}")
data = data.head(10) #Top 10
print(data)
data.reset_index().plot(x="geoName", y=f"{kw_list}",
figsize=(20,15), kind="bar")
plt.style.use('fivethirtyeight')
plt.show()
# FIXME: Send this image to vision GPT for analysis.
except Exception as e:
print(f"Error plotting interest by region: {e}")
return None
def get_related_topics_and_save_csv(search_keywords):
search_keywords = [f"{search_keywords}"]
try:
pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload(kw_list=search_keywords, timeframe='today 12-m')
# Get related topics - this returns a dictionary
topics_data = pytrends.related_topics()
# Extract data for the first keyword
if topics_data and search_keywords[0] in topics_data:
keyword_data = topics_data[search_keywords[0]]
# Create two separate dataframes for top and rising
top_df = keyword_data.get('top', pd.DataFrame())
rising_df = keyword_data.get('rising', pd.DataFrame())
return {
'top': top_df[['topic_title', 'value']] if not top_df.empty else pd.DataFrame(),
'rising': rising_df[['topic_title', 'value']] if not rising_df.empty else pd.DataFrame()
            }
        # Fall back to empty frames when the keyword is missing from the response
        return {'top': pd.DataFrame(), 'rising': pd.DataFrame()}
except Exception as e:
logger.error(f"Error in related topics: {e}")
return {'top': pd.DataFrame(), 'rising': pd.DataFrame()}
def get_related_queries_and_save_csv(search_keywords):
search_keywords = [f"{search_keywords}"]
try:
pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload(kw_list=search_keywords, timeframe='today 12-m')
# Get related queries - this returns a dictionary
queries_data = pytrends.related_queries()
# Extract data for the first keyword
if queries_data and search_keywords[0] in queries_data:
keyword_data = queries_data[search_keywords[0]]
# Create two separate dataframes for top and rising
top_df = keyword_data.get('top', pd.DataFrame())
rising_df = keyword_data.get('rising', pd.DataFrame())
return {
'top': top_df if not top_df.empty else pd.DataFrame(),
'rising': rising_df if not rising_df.empty else pd.DataFrame()
            }
        # Fall back to empty frames when the keyword is missing from the response
        return {'top': pd.DataFrame(), 'rising': pd.DataFrame()}
except Exception as e:
logger.error(f"Error in related queries: {e}")
return {'top': pd.DataFrame(), 'rising': pd.DataFrame()}
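# Illustrative usage sketch: both pytrends helpers above return a dict of the form
# {'top': DataFrame, 'rising': DataFrame}, and either frame may be empty, so callers
# should check before use. The sample keyword is an assumption for demonstration and
# the call requires network access to Google Trends.
def _example_related_queries(keyword="standing desk"):
    queries = get_related_queries_and_save_csv(keyword)
    if not queries['top'].empty:
        print(queries['top'].head())
    if not queries['rising'].empty:
        print(queries['rising'].head())
    return queries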
def get_source(url):
try:
session = HTMLSession()
response = session.get(url)
response.raise_for_status() # Raise an HTTPError for bad responses
return response
except requests.exceptions.RequestException as e:
logger.error(f"Error during HTTP request: {e}")
return None
def get_results(query):
try:
query = urllib.parse.quote_plus(query)
response = get_source(f"https://suggestqueries.google.com/complete/search?output=chrome&hl=en&q={query}")
time.sleep(random.uniform(0.1, 0.6))
if response:
response.raise_for_status()
results = json.loads(response.text)
return results
else:
return None
except json.JSONDecodeError as e:
logger.error(f"Error decoding JSON response: {e}")
return None
except requests.exceptions.RequestException as e:
logger.error(f"Error during HTTP request: {e}")
return None
def format_results(results):
try:
suggestions = []
for index, value in enumerate(results[1]):
suggestion = {'term': value, 'relevance': results[4]['google:suggestrelevance'][index]}
suggestions.append(suggestion)
return suggestions
except (KeyError, IndexError) as e:
logger.error(f"Error parsing search results: {e}")
return []
def get_expanded_term_suffixes():
return ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm','n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
def get_expanded_term_prefixes():
# For shopping, review type blogs.
    #return ['discount *', 'pricing *', 'cheap', 'best price *', 'lowest price', 'best value', 'sale', 'affordable', 'promo', 'budget', 'what *', 'where *', 'how to *', 'why *', 'buy*', 'how much*', 'best *', 'worse *', 'rent*', 'sale*', 'offer*', 'vs*', 'or*']
return ['what *', 'where *', 'how to *', 'why *','best *', 'vs*', 'or*']
def get_expanded_terms(query):
try:
expanded_term_prefixes = get_expanded_term_prefixes()
expanded_term_suffixes = get_expanded_term_suffixes()
terms = [query]
for term in expanded_term_prefixes:
terms.append(f"{term} {query}")
for term in expanded_term_suffixes:
terms.append(f"{query} {term}")
return terms
except Exception as e:
logger.error(f"Error in get_expanded_terms: {e}")
return []
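# Illustrative sketch (not used by the pipeline): shows what get_expanded_terms()
# produces for a sample seed keyword. The seed "standing desk" is an assumption
# chosen purely for demonstration; no network access is required.
def _example_expanded_terms(seed="standing desk"):
    """Print a handful of expanded terms for a sample seed keyword."""
    terms = get_expanded_terms(seed)
    # Expect the seed itself, prefixed variants ("what * standing desk", ...)
    # and suffixed variants ("standing desk a", "standing desk b", ...).
    for term in terms[:10]:
        print(term)
    return terms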
def get_expanded_suggestions(query):
try:
all_results = []
expanded_terms = get_expanded_terms(query)
for term in tqdm(expanded_terms, desc="📢❗🚨 Fetching Google AutoSuggestions", unit="term"):
results = get_results(term)
if results:
formatted_results = format_results(results)
all_results += formatted_results
all_results = sorted(all_results, key=lambda k: k.get('relevance', 0), reverse=True)
return all_results
except Exception as e:
logger.error(f"Error in get_expanded_suggestions: {e}")
return []
def get_suggestions_for_keyword(search_term):
""" """
try:
expanded_results = get_expanded_suggestions(search_term)
expanded_results_df = pd.DataFrame(expanded_results)
expanded_results_df.columns = ['Keywords', 'Relevance']
#expanded_results_df.to_csv('results.csv', index=False)
pd.set_option('display.max_rows', expanded_results_df.shape[0]+1)
expanded_results_df.drop_duplicates('Keywords', inplace=True)
table = tabulate(expanded_results_df, headers=['Keywords', 'Relevance'], tablefmt='fancy_grid')
# FIXME: Too much data for LLM context window. We will need to embed it.
#try:
# save_in_file(table)
#except Exception as save_results_err:
# logger.error(f"Failed to save search results: {save_results_err}")
return expanded_results_df
except Exception as e:
logger.error(f"get_suggestions_for_keyword: Error in main: {e}")
def perform_keyword_clustering(expanded_results_df, num_clusters=5):
try:
# Preprocessing: Convert the keywords to lowercase
expanded_results_df['Keywords'] = expanded_results_df['Keywords'].str.lower()
# Vectorization: Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# Fit the vectorizer to the keywords
tfidf_vectors = vectorizer.fit_transform(expanded_results_df['Keywords'])
# Applying K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(tfidf_vectors)
# Add cluster labels to the DataFrame
expanded_results_df['cluster_label'] = cluster_labels
# Assessing cluster quality through silhouette score
silhouette_avg = silhouette_score(tfidf_vectors, cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")
# Visualize cluster quality using a silhouette plot
#visualize_silhouette(tfidf_vectors, cluster_labels)
return expanded_results_df
except Exception as e:
logger.error(f"Error in perform_keyword_clustering: {e}")
return pd.DataFrame()
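# Illustrative sketch: clusters a tiny, hand-made keyword DataFrame so the expected
# input columns ('Keywords', 'Relevance') and the added 'cluster_label' column are
# easy to see. The sample keywords and num_clusters=2 are assumptions for demonstration.
def _example_keyword_clustering():
    sample = pd.DataFrame({
        'Keywords': [
            'best standing desk', 'standing desk review', 'cheap standing desk',
            'ergonomic office chair', 'office chair review', 'adjustable desk price'
        ],
        'Relevance': [900, 850, 800, 700, 650, 600]
    })
    clustered = perform_keyword_clustering(sample, num_clusters=2)
    if not clustered.empty:
        print(clustered[['Keywords', 'cluster_label']])
    return clustered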
def visualize_silhouette(X, labels):
try:
silhouette_avg = silhouette_score(X, labels)
print(f"Silhouette Score: {silhouette_avg}")
# Create a subplot with 1 row and 2 columns
fig, ax1 = plt.subplots(1, 1, figsize=(8, 6))
# The 1st subplot is the silhouette plot
ax1.set_xlim([-0.1, 1])
ax1.set_ylim([0, X.shape[0] + (len(set(labels)) + 1) * 10])
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, labels)
y_lower = 10
for i in set(labels):
# Aggregate the silhouette scores for samples belonging to the cluster
ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = plt.cm.nipy_spectral(float(i) / len(set(labels)))
ax1.fill_betweenx(np.arange(y_lower, y_upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
# Label the silhouette plots with their cluster numbers at the middle
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for the next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax1.set_title("Silhouette plot for KMeans clustering")
ax1.set_xlabel("Silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# The vertical line for the average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
plt.show()
except Exception as e:
logger.error(f"Error in visualize_silhouette: {e}")
def print_and_return_top_keywords(expanded_results_df, num_clusters=5):
"""
Display and return top keywords in each cluster.
Args:
expanded_results_df (pd.DataFrame): DataFrame containing expanded keywords, relevance, and cluster labels.
num_clusters (int or str): Number of clusters or 'all'.
Returns:
pd.DataFrame: DataFrame with top keywords for each cluster.
"""
top_keywords_df = pd.DataFrame()
if num_clusters == 'all':
unique_clusters = expanded_results_df['cluster_label'].unique()
else:
unique_clusters = range(int(num_clusters))
for i in unique_clusters:
cluster_df = expanded_results_df[expanded_results_df['cluster_label'] == i]
top_keywords = cluster_df.sort_values(by='Relevance', ascending=False).head(5)
top_keywords_df = pd.concat([top_keywords_df, top_keywords])
print(f"\n📢❗🚨 GTop Keywords for All Clusters:")
table = tabulate(top_keywords_df, headers='keys', tablefmt='fancy_grid')
# Save the combined table to a file
try:
save_in_file(table)
except Exception as save_results_err:
logger.error(f"🚨 Failed to save search results: {save_results_err}")
print(table)
return top_keywords_df
def generate_wordcloud(keywords):
"""
Generate and display a word cloud from a list of keywords.
Args:
keywords (list): List of keywords.
"""
    # Lazy import so the optional 'wordcloud' dependency is only required when this helper is used
    from wordcloud import WordCloud
    # Convert the list of keywords to a string
    text = ' '.join(keywords)
    # Generate word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    # Display the word cloud using matplotlib (figsize is in inches)
    plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
def save_in_file(table_content):
""" Helper function to save search analysis in a file. """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.warning("SEARCH_SAVE_FILE environment variable is not set; skipping save.")
        return
    try:
# Save the content to the file
with open(file_path, "a+", encoding="utf-8") as file:
file.write(table_content)
file.write("\n" * 3) # Add three newlines at the end
logger.info(f"Search content saved to {file_path}")
except Exception as e:
logger.error(f"Error occurred while writing to the file: {e}")
def do_google_trends_analysis(search_term):
""" Get a google search keywords, get its stats."""
search_term = [f"{search_term}"]
all_the_keywords = []
try:
for asearch_term in search_term:
            #FIXME: Let's work with a single root keyword.
suggestions_df = get_suggestions_for_keyword(asearch_term)
if len(suggestions_df['Keywords']) > 10:
result_df = perform_keyword_clustering(suggestions_df)
# Display top keywords in each cluster
top_keywords = print_and_return_top_keywords(result_df)
all_the_keywords.append(top_keywords['Keywords'].tolist())
else:
all_the_keywords.append(suggestions_df['Keywords'].tolist())
all_the_keywords = ','.join([', '.join(filter(None, map(str, sublist))) for sublist in all_the_keywords])
# Generate a random sleep time between 2 and 3 seconds
time.sleep(random.uniform(2, 3))
# Display additional information
try:
result_df = get_related_topics_and_save_csv(search_term)
logger.info(f"Related topics:: result_df: {result_df}")
# Extract 'Top' topic_title
            if result_df and not result_df['top'].empty:
                top_topic_title = result_df['top']['topic_title'].values.tolist()
                # Join the flat list of topic titles into one comma-separated string
                top_topic_title = ', '.join(filter(None, map(str, top_topic_title)))
except Exception as err:
logger.error(f"Failed to get results from google trends related topics: {err}")
# TBD: Not getting great results OR unable to understand them.
#all_the_keywords += top_topic_title
all_the_keywords = all_the_keywords.split(',')
# Split the list into chunks of 5 keywords
chunk_size = 4
chunks = [all_the_keywords[i:i + chunk_size] for i in range(0, len(all_the_keywords), chunk_size)]
# Create a DataFrame with columns named 'Keyword 1', 'Keyword 2', etc.
        combined_df = pd.DataFrame(chunks, columns=[f'Keyword Col{i + 1}' for i in range(chunk_size)])
# Print the table
table = tabulate(combined_df, headers='keys', tablefmt='fancy_grid')
# Save the combined table to a file
try:
save_in_file(table)
except Exception as save_results_err:
logger.error(f"Failed to save search results: {save_results_err}")
print(table)
#generate_wordcloud(all_the_keywords)
return(all_the_keywords)
except Exception as e:
logger.error(f"Error in Google Trends Analysis: {e}")
def get_trending_searches(country='united_states'):
"""Get trending searches for a specific country."""
try:
pytrends = TrendReq(hl='en-US', tz=360)
trending_searches = pytrends.trending_searches(pn=country)
return trending_searches
except Exception as e:
logger.error(f"Error getting trending searches: {e}")
return pd.DataFrame()
def get_realtime_trends(country='US'):
"""Get realtime trending searches for a specific country."""
try:
pytrends = TrendReq(hl='en-US', tz=360)
realtime_trends = pytrends.realtime_trending_searches(pn=country)
return realtime_trends
except Exception as e:
logger.error(f"Error getting realtime trends: {e}")
return pd.DataFrame()
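# Minimal usage sketch, kept behind a __main__ guard so importing this module has no
# side effects. It assumes network access to Google's suggest/trends endpoints and,
# for save_in_file(), that SEARCH_SAVE_FILE points to a writable file; the seed
# keyword below is purely illustrative.
if __name__ == "__main__":
    demo_keyword = "standing desk"
    related_keywords = do_google_trends_analysis(demo_keyword)
    print(f"Related keywords for '{demo_keyword}': {related_keywords}")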

View File

@@ -0,0 +1,803 @@
################################################################
#
# ## Features
#
# - **Web Research**: Alwrity enables users to conduct web research efficiently.
# By providing keywords or topics of interest, users can initiate searches across multiple platforms simultaneously.
#
# - **Google SERP Search**: The tool integrates with Google Search Engine Results Pages (SERP)
# to retrieve relevant information based on user queries. It offers insights into organic search results,
# People Also Ask, and related searches.
#
# - **Tavily AI Integration**: Alwrity leverages Tavily AI's capabilities to enhance web research.
# It utilizes advanced algorithms to search for information and extract relevant data from various sources.
#
# - **Metaphor AI Semantic Search**: Alwrity employs Metaphor AI's semantic search technology to find related articles and content.
# By analyzing context and meaning, it delivers precise and accurate results.
#
# - **Google Trends Analysis**: The tool provides Google Trends analysis for user-defined keywords.
# It helps users understand the popularity and trends associated with specific topics over time.
#
##############################################################
import os
import json
import time
from pathlib import Path
import sys
from datetime import datetime
import streamlit as st
import pandas as pd
import random
import numpy as np
from lib.alwrity_ui.display_google_serp_results import (
process_research_results,
process_search_results,
display_research_results
)
from lib.alwrity_ui.google_trends_ui import display_google_trends_data, process_trends_data
from .tavily_ai_search import do_tavily_ai_search
from .metaphor_basic_neural_web_search import metaphor_search_articles, streamlit_display_metaphor_results
from .google_serp_search import google_search
from .google_trends_researcher import do_google_trends_analysis
#from .google_gemini_web_researcher import do_gemini_web_research
from loguru import logger
# Configure logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def gpt_web_researcher(search_keywords, search_mode, **kwargs):
"""Keyword based web researcher with progress tracking."""
logger.info(f"Starting web research - Keywords: {search_keywords}, Mode: {search_mode}")
logger.debug(f"Additional parameters: {kwargs}")
try:
# Reset session state variables for this research operation
if 'metaphor_results_displayed' in st.session_state:
del st.session_state.metaphor_results_displayed
# Initialize result container
research_results = None
# Create status containers
status_container = st.empty()
progress_bar = st.progress(0)
def update_progress(message, progress=None, level="info"):
if progress is not None:
progress_bar.progress(progress)
if level == "error":
status_container.error(f"🚫 {message}")
elif level == "warning":
status_container.warning(f"⚠️ {message}")
else:
status_container.info(f"🔄 {message}")
logger.debug(f"Progress update [{level}]: {message}")
if search_mode == "google":
logger.info("Starting Google research pipeline")
try:
# First try Google SERP
update_progress("Initiating SERP search...", progress=10)
                serp_results = do_google_serp_search(search_keywords, status_container, update_progress, **kwargs)
if serp_results and serp_results.get('organic'):
logger.info("SERP search successful")
update_progress("SERP search completed", progress=40)
research_results = serp_results
else:
logger.warning("SERP search returned no results, falling back to Gemini")
update_progress("No SERP results, trying Gemini...", progress=45)
# Keep it commented. Fallback to Gemini
#try:
# gemini_results = do_gemini_web_research(search_keywords)
# if gemini_results:
# logger.info("Gemini research successful")
# update_progress("Gemini research completed", progress=80)
# research_results = {
# 'source': 'gemini',
# 'results': gemini_results
# }
#except Exception as gemini_err:
# logger.error(f"Gemini research failed: {gemini_err}")
# update_progress("Gemini research failed", level="warning")
if research_results:
update_progress("Processing final results...", progress=90)
processed_results = process_research_results(research_results)
if processed_results:
update_progress("Research completed!", progress=100, level="success")
display_research_results(processed_results)
return processed_results
else:
error_msg = "Failed to process research results"
logger.warning(error_msg)
update_progress(error_msg, level="warning")
return None
else:
error_msg = "No results from either SERP or Gemini"
logger.warning(error_msg)
update_progress(error_msg, level="warning")
return None
except Exception as search_err:
error_msg = f"Research pipeline failed: {str(search_err)}"
logger.error(error_msg, exc_info=True)
update_progress(error_msg, level="error")
raise
elif search_mode == "ai":
logger.info("Starting AI research pipeline")
try:
# Do Tavily AI Search
update_progress("Initiating Tavily AI search...", progress=10)
# Extract relevant parameters for Tavily search
include_domains = kwargs.pop('include_domains', None)
search_depth = kwargs.pop('search_depth', 'advanced')
# Pass the parameters to do_tavily_ai_search
                # Unpack the (results, titles, answer) tuple returned by the Tavily wrapper below
                t_results, t_titles, t_answer = do_tavily_ai_search(
search_keywords, # Pass as positional argument
max_results=kwargs.get('num_results', 10),
include_domains=include_domains,
search_depth=search_depth,
**kwargs
)
# Do Metaphor AI Search
update_progress("Initiating Metaphor AI search...", progress=50)
metaphor_results, metaphor_titles = do_metaphor_ai_research(search_keywords)
if metaphor_results is None:
update_progress("Metaphor AI search failed, continuing with Tavily results only...", level="warning")
else:
update_progress("Metaphor AI search completed successfully", progress=75)
# Add debug logging to check the structure of metaphor_results
logger.debug(f"Metaphor results structure: {type(metaphor_results)}")
if isinstance(metaphor_results, dict):
logger.debug(f"Metaphor results keys: {metaphor_results.keys()}")
if 'data' in metaphor_results:
logger.debug(f"Metaphor data keys: {metaphor_results['data'].keys()}")
if 'results' in metaphor_results['data']:
logger.debug(f"Number of results: {len(metaphor_results['data']['results'])}")
# Display Metaphor results only if not already displayed
if 'metaphor_results_displayed' not in st.session_state:
st.session_state.metaphor_results_displayed = True
# Make sure to pass the correct parameters to streamlit_display_metaphor_results
streamlit_display_metaphor_results(metaphor_results, search_keywords)
# Add Google Trends Analysis
update_progress("Initiating Google Trends analysis...", progress=80)
try:
# Add an informative message about Google Trends
with st.expander(" About Google Trends Analysis", expanded=False):
st.markdown("""
**What is Google Trends Analysis?**
Google Trends Analysis provides insights into how often a particular search-term is entered relative to the total search-volume across various regions of the world, and in various languages.
**What data will be shown?**
- **Related Keywords**: Terms that are frequently searched together with your keyword
- **Interest Over Time**: How interest in your keyword has changed over the past 12 months
- **Regional Interest**: Where in the world your keyword is most popular
- **Related Queries**: What people search for before and after searching for your keyword
- **Related Topics**: Topics that are closely related to your keyword
**How to use this data:**
- Identify trending topics in your industry
- Understand seasonal patterns in search behavior
- Discover related keywords for content planning
- Target content to specific regions with high interest
""")
trends_results = do_google_pytrends_analysis(search_keywords)
if trends_results:
update_progress("Google Trends analysis completed successfully", progress=90)
# Store trends results in the research_results
if metaphor_results:
metaphor_results['trends_data'] = trends_results
else:
# If metaphor_results is None, create a new container for results
metaphor_results = {'trends_data': trends_results}
# Display Google Trends data using the new UI module
display_google_trends_data(trends_results, search_keywords)
else:
update_progress("Google Trends analysis returned no results", level="warning")
except Exception as trends_err:
logger.error(f"Google Trends analysis failed: {trends_err}")
update_progress("Google Trends analysis failed", level="warning")
st.error(f"Error in Google Trends analysis: {str(trends_err)}")
# Return the combined results
update_progress("Research completed!", progress=100, level="success")
return metaphor_results or t_results
except Exception as ai_err:
error_msg = f"AI research pipeline failed: {str(ai_err)}"
logger.error(error_msg, exc_info=True)
update_progress(error_msg, level="error")
raise
else:
error_msg = f"Unsupported search mode: {search_mode}"
logger.error(error_msg)
update_progress(error_msg, level="error")
raise ValueError(error_msg)
except Exception as err:
error_msg = f"Failed in gpt_web_researcher: {str(err)}"
logger.error(error_msg, exc_info=True)
if 'update_progress' in locals():
update_progress(error_msg, level="error")
raise
def do_google_serp_search(search_keywords, status_container, update_progress, **kwargs):
"""Perform Google SERP analysis with sidebar progress tracking."""
logger.info("="*50)
logger.info("Starting Google SERP Search")
logger.info("="*50)
try:
# Validate parameters
update_progress("Validating search parameters", progress=0.1)
status_container.info("📝 Validating parameters...")
if not search_keywords or not isinstance(search_keywords, str):
logger.error(f"Invalid search keywords: {search_keywords}")
raise ValueError("Search keywords must be a non-empty string")
# Update search initiation
update_progress(f"Initiating search for: '{search_keywords}'", progress=0.2)
status_container.info("🌐 Querying search API...")
logger.info(f"Search params: {kwargs}")
# Execute search
g_results = google_search(search_keywords)
if g_results:
# Log success
update_progress("Search completed successfully", progress=0.8, level="success")
# Update statistics
stats = f"""Found:
- {len(g_results.get('organic', []))} organic results
- {len(g_results.get('peopleAlsoAsk', []))} related questions
- {len(g_results.get('relatedSearches', []))} related searches"""
update_progress(stats, progress=0.9)
# Process results
update_progress("Processing search results", progress=0.95)
status_container.info("⚡ Processing results...")
processed_results = process_search_results(g_results)
# Extract titles
update_progress("Extracting information", progress=0.98)
g_titles = extract_info(g_results, 'titles')
# Final success
update_progress("Analysis completed successfully", progress=1.0, level="success")
status_container.success("✨ Research completed!")
# Clear main status after delay
time.sleep(1)
status_container.empty()
return {
'results': g_results,
'titles': g_titles,
'summary': processed_results,
'stats': {
'organic_count': len(g_results.get('organic', [])),
'questions_count': len(g_results.get('peopleAlsoAsk', [])),
'related_count': len(g_results.get('relatedSearches', []))
}
}
else:
update_progress("No results found", progress=0.5, level="warning")
status_container.warning("⚠️ No results found")
return None
except Exception as err:
error_msg = f"Search failed: {str(err)}"
update_progress(error_msg, progress=0.5, level="error")
logger.error(error_msg)
logger.debug("Stack trace:", exc_info=True)
raise
finally:
logger.info("="*50)
logger.info("Google SERP Search function completed")
logger.info("="*50)
def do_tavily_ai_search(search_keywords, max_results=10, **kwargs):
""" Common function to do Tavily AI web research."""
try:
logger.info(f"Doing Tavily AI search for: {search_keywords}")
# Prepare Tavily search parameters
tavily_params = {
'max_results': max_results,
            'search_depth': kwargs['search_depth'] if isinstance(kwargs.get('search_depth'), str) else ('advanced' if kwargs.get('search_depth', 3) > 2 else 'basic'),
'time_range': kwargs.get('time_range', 'year'),
'include_domains': kwargs.get('include_domains', [""]) if kwargs.get('include_domains') else [""]
}
# Import the Tavily search function directly
from .tavily_ai_search import do_tavily_ai_search as tavily_search
# Call the actual Tavily search function
t_results = tavily_search(
keywords=search_keywords,
**tavily_params
)
if t_results:
t_titles = tavily_extract_information(t_results, 'titles')
t_answer = tavily_extract_information(t_results, 'answer')
return(t_results, t_titles, t_answer)
else:
logger.warning("No results returned from Tavily AI search")
return None, None, None
except Exception as err:
logger.error(f"Failed to do Tavily AI Search: {err}")
return None, None, None
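# Usage sketch for the wrapper above (assumes a configured Tavily API key and network
# access): it returns a (results, titles, answer) tuple, so callers should unpack all
# three values. The sample query is an assumption for demonstration.
def _example_tavily_wrapper(query="ai content marketing"):
    results, titles, answer = do_tavily_ai_search(query, max_results=5)
    if results:
        print(f"Tavily answer: {answer}")
        print(f"Top titles: {titles[:3]}")
    return results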
def do_metaphor_ai_research(search_keywords):
"""
Perform Metaphor AI research and return results with titles.
Args:
search_keywords (str): Keywords to search for
Returns:
tuple: (response_articles, titles) or (None, None) if search fails
"""
try:
logger.info(f"Start Semantic/Neural web search with Metaphor: {search_keywords}")
response_articles = metaphor_search_articles(search_keywords)
if response_articles and 'data' in response_articles:
m_titles = [result.get('title', '') for result in response_articles['data'].get('results', [])]
return response_articles, m_titles
else:
logger.warning("No valid results from Metaphor search")
return None, None
except Exception as err:
logger.error(f"Failed to do Metaphor search: {err}")
return None, None
def do_google_pytrends_analysis(keywords):
"""
Perform Google Trends analysis for the given keywords.
Args:
keywords (str): The search keywords to analyze
Returns:
dict: A dictionary containing formatted Google Trends data with the following keys:
- related_keywords: List of related keywords
- interest_over_time: DataFrame with date and interest columns
- regional_interest: DataFrame with country_code, country, and interest columns
- related_queries: DataFrame with query and value columns
- related_topics: DataFrame with topic and value columns
"""
logger.info(f"Performing Google Trends analysis for keywords: {keywords}")
# Create a progress container for Streamlit
progress_container = st.empty()
progress_bar = st.progress(0)
def update_progress(message, progress=None, level="info"):
"""Helper function to update progress in Streamlit UI"""
if progress is not None:
progress_bar.progress(progress)
if level == "error":
progress_container.error(f"🚫 {message}")
elif level == "warning":
progress_container.warning(f"⚠️ {message}")
else:
progress_container.info(f"🔄 {message}")
logger.debug(f"Progress update [{level}]: {message}")
try:
# Initialize the formatted data dictionary
formatted_data = {
'related_keywords': [],
'interest_over_time': pd.DataFrame(),
'regional_interest': pd.DataFrame(),
'related_queries': pd.DataFrame(),
'related_topics': pd.DataFrame()
}
# Get raw trends data from google_trends_researcher
update_progress("Fetching Google Trends data...", progress=10)
raw_trends_data = do_google_trends_analysis(keywords)
if not raw_trends_data:
logger.warning("No Google Trends data returned")
update_progress("No Google Trends data returned", level="warning", progress=20)
return formatted_data
# Process related keywords from the raw data
update_progress("Processing related keywords...", progress=30)
if isinstance(raw_trends_data, list):
formatted_data['related_keywords'] = raw_trends_data
elif isinstance(raw_trends_data, dict):
if 'keywords' in raw_trends_data:
formatted_data['related_keywords'] = raw_trends_data['keywords']
if 'interest_over_time' in raw_trends_data:
formatted_data['interest_over_time'] = raw_trends_data['interest_over_time']
if 'regional_interest' in raw_trends_data:
formatted_data['regional_interest'] = raw_trends_data['regional_interest']
if 'related_queries' in raw_trends_data:
formatted_data['related_queries'] = raw_trends_data['related_queries']
if 'related_topics' in raw_trends_data:
formatted_data['related_topics'] = raw_trends_data['related_topics']
# If we have keywords but missing other data, try to fetch them using pytrends directly
if formatted_data['related_keywords'] and (
formatted_data['interest_over_time'].empty or
formatted_data['regional_interest'].empty or
formatted_data['related_queries'].empty or
formatted_data['related_topics'].empty
):
try:
update_progress("Fetching additional data from Google Trends API...", progress=40)
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=360)
# Build payload with the main keyword
update_progress("Building search payload...", progress=45)
pytrends.build_payload([keywords], timeframe='today 12-m', geo='')
# Get interest over time if missing
if formatted_data['interest_over_time'].empty:
try:
update_progress("Fetching interest over time data...", progress=50)
interest_df = pytrends.interest_over_time()
if not interest_df.empty:
formatted_data['interest_over_time'] = interest_df.reset_index()
update_progress(f"Successfully fetched interest over time data with {len(formatted_data['interest_over_time'])} data points", progress=55)
else:
update_progress("No interest over time data available", level="warning", progress=55)
except Exception as e:
logger.error(f"Error fetching interest over time: {e}")
update_progress(f"Error fetching interest over time: {str(e)}", level="warning", progress=55)
# Get regional interest if missing
if formatted_data['regional_interest'].empty:
try:
update_progress("Fetching regional interest data...", progress=60)
regional_df = pytrends.interest_by_region()
if not regional_df.empty:
formatted_data['regional_interest'] = regional_df.reset_index()
update_progress(f"Successfully fetched regional interest data for {len(formatted_data['regional_interest'])} regions", progress=65)
else:
update_progress("No regional interest data available", level="warning", progress=65)
except Exception as e:
logger.error(f"Error fetching regional interest: {e}")
update_progress(f"Error fetching regional interest: {str(e)}", level="warning", progress=65)
# Get related queries if missing
if formatted_data['related_queries'].empty:
try:
update_progress("Fetching related queries data...", progress=70)
# Get related queries data
related_queries = pytrends.related_queries()
# Create empty DataFrame as fallback
formatted_data['related_queries'] = pd.DataFrame(columns=['query', 'value'])
# Simple direct approach to avoid list index errors
if related_queries and isinstance(related_queries, dict):
# Check if our keyword exists in the results
if keywords in related_queries:
keyword_data = related_queries[keywords]
# Process top queries if available
if 'top' in keyword_data and keyword_data['top'] is not None:
try:
update_progress("Processing top related queries...", progress=75)
# Convert to DataFrame if it's not already
if isinstance(keyword_data['top'], pd.DataFrame):
top_df = keyword_data['top']
else:
# Try to convert to DataFrame
top_df = pd.DataFrame(keyword_data['top'])
# Ensure it has the right columns
if not top_df.empty:
# Rename columns if needed
if 'query' in top_df.columns:
# Already has the right column name
pass
elif len(top_df.columns) > 0:
# Use first column as query
top_df = top_df.rename(columns={top_df.columns[0]: 'query'})
# Add to our results
formatted_data['related_queries'] = top_df
update_progress(f"Successfully processed {len(top_df)} top related queries", progress=80)
except Exception as e:
logger.warning(f"Error processing top queries: {e}")
update_progress(f"Error processing top queries: {str(e)}", level="warning", progress=80)
# Process rising queries if available
if 'rising' in keyword_data and keyword_data['rising'] is not None:
try:
update_progress("Processing rising related queries...", progress=85)
# Convert to DataFrame if it's not already
if isinstance(keyword_data['rising'], pd.DataFrame):
rising_df = keyword_data['rising']
else:
# Try to convert to DataFrame
rising_df = pd.DataFrame(keyword_data['rising'])
# Ensure it has the right columns
if not rising_df.empty:
# Rename columns if needed
if 'query' in rising_df.columns:
# Already has the right column name
pass
elif len(rising_df.columns) > 0:
# Use first column as query
rising_df = rising_df.rename(columns={rising_df.columns[0]: 'query'})
# Combine with existing data if we have any
if not formatted_data['related_queries'].empty:
formatted_data['related_queries'] = pd.concat([formatted_data['related_queries'], rising_df])
update_progress(f"Successfully processed {len(rising_df)} rising related queries", progress=90)
else:
formatted_data['related_queries'] = rising_df
update_progress(f"Successfully processed {len(rising_df)} rising related queries", progress=90)
except Exception as e:
logger.warning(f"Error processing rising queries: {e}")
update_progress(f"Error processing rising queries: {str(e)}", level="warning", progress=90)
except Exception as e:
logger.error(f"Error fetching related queries: {e}")
update_progress(f"Error fetching related queries: {str(e)}", level="warning", progress=90)
# Ensure we have an empty DataFrame with the right columns
formatted_data['related_queries'] = pd.DataFrame(columns=['query', 'value'])
# Get related topics if missing
if formatted_data['related_topics'].empty:
try:
update_progress("Fetching related topics data...", progress=95)
# Get related topics data
related_topics = pytrends.related_topics()
# Create empty DataFrame as fallback
formatted_data['related_topics'] = pd.DataFrame(columns=['topic', 'value'])
# Simple direct approach to avoid list index errors
if related_topics and isinstance(related_topics, dict):
# Check if our keyword exists in the results
if keywords in related_topics:
keyword_data = related_topics[keywords]
# Process top topics if available
if 'top' in keyword_data and keyword_data['top'] is not None:
try:
update_progress("Processing top related topics...", progress=97)
# Convert to DataFrame if it's not already
if isinstance(keyword_data['top'], pd.DataFrame):
top_df = keyword_data['top']
else:
# Try to convert to DataFrame
top_df = pd.DataFrame(keyword_data['top'])
# Ensure it has the right columns
if not top_df.empty:
# Rename columns if needed
if 'topic_title' in top_df.columns:
top_df = top_df.rename(columns={'topic_title': 'topic'})
elif len(top_df.columns) > 0 and 'topic' not in top_df.columns:
# Use first column as topic
top_df = top_df.rename(columns={top_df.columns[0]: 'topic'})
# Add to our results
formatted_data['related_topics'] = top_df
update_progress(f"Successfully processed {len(top_df)} top related topics", progress=98)
except Exception as e:
logger.warning(f"Error processing top topics: {e}")
update_progress(f"Error processing top topics: {str(e)}", level="warning", progress=98)
# Process rising topics if available
if 'rising' in keyword_data and keyword_data['rising'] is not None:
try:
update_progress("Processing rising related topics...", progress=99)
# Convert to DataFrame if it's not already
if isinstance(keyword_data['rising'], pd.DataFrame):
rising_df = keyword_data['rising']
else:
# Try to convert to DataFrame
rising_df = pd.DataFrame(keyword_data['rising'])
# Ensure it has the right columns
if not rising_df.empty:
# Rename columns if needed
if 'topic_title' in rising_df.columns:
rising_df = rising_df.rename(columns={'topic_title': 'topic'})
elif len(rising_df.columns) > 0 and 'topic' not in rising_df.columns:
# Use first column as topic
rising_df = rising_df.rename(columns={rising_df.columns[0]: 'topic'})
# Combine with existing data if we have any
if not formatted_data['related_topics'].empty:
formatted_data['related_topics'] = pd.concat([formatted_data['related_topics'], rising_df])
update_progress(f"Successfully processed {len(rising_df)} rising related topics", progress=100)
else:
formatted_data['related_topics'] = rising_df
update_progress(f"Successfully processed {len(rising_df)} rising related topics", progress=100)
except Exception as e:
logger.warning(f"Error processing rising topics: {e}")
update_progress(f"Error processing rising topics: {str(e)}", level="warning", progress=100)
except Exception as e:
logger.error(f"Error fetching related topics: {e}")
update_progress(f"Error fetching related topics: {str(e)}", level="warning", progress=100)
# Ensure we have an empty DataFrame with the right columns
formatted_data['related_topics'] = pd.DataFrame(columns=['topic', 'value'])
except Exception as e:
logger.error(f"Error fetching additional trends data: {e}")
update_progress(f"Error fetching additional trends data: {str(e)}", level="warning", progress=100)
# Ensure all DataFrames have the correct column names for the UI
update_progress("Finalizing data formatting...", progress=100)
if not formatted_data['interest_over_time'].empty:
if 'date' not in formatted_data['interest_over_time'].columns:
formatted_data['interest_over_time'] = formatted_data['interest_over_time'].reset_index()
if 'interest' not in formatted_data['interest_over_time'].columns and keywords in formatted_data['interest_over_time'].columns:
formatted_data['interest_over_time'] = formatted_data['interest_over_time'].rename(columns={keywords: 'interest'})
if not formatted_data['regional_interest'].empty:
if 'country_code' not in formatted_data['regional_interest'].columns and 'geoName' in formatted_data['regional_interest'].columns:
formatted_data['regional_interest'] = formatted_data['regional_interest'].rename(columns={'geoName': 'country_code'})
if 'interest' not in formatted_data['regional_interest'].columns and keywords in formatted_data['regional_interest'].columns:
formatted_data['regional_interest'] = formatted_data['regional_interest'].rename(columns={keywords: 'interest'})
if not formatted_data['related_queries'].empty:
# Handle different column names that might be present in the related queries DataFrame
if 'query' not in formatted_data['related_queries'].columns:
if 'Top query' in formatted_data['related_queries'].columns:
formatted_data['related_queries'] = formatted_data['related_queries'].rename(columns={'Top query': 'query'})
elif 'Rising query' in formatted_data['related_queries'].columns:
formatted_data['related_queries'] = formatted_data['related_queries'].rename(columns={'Rising query': 'query'})
elif 'query' not in formatted_data['related_queries'].columns and len(formatted_data['related_queries'].columns) > 0:
# If we have a DataFrame but no 'query' column, use the first column as 'query'
first_col = formatted_data['related_queries'].columns[0]
formatted_data['related_queries'] = formatted_data['related_queries'].rename(columns={first_col: 'query'})
if 'value' not in formatted_data['related_queries'].columns and len(formatted_data['related_queries'].columns) > 1:
# If we have a second column, use it as 'value'
second_col = formatted_data['related_queries'].columns[1]
formatted_data['related_queries'] = formatted_data['related_queries'].rename(columns={second_col: 'value'})
elif 'value' not in formatted_data['related_queries'].columns:
# If no 'value' column exists, add one with default values
formatted_data['related_queries']['value'] = 0
if not formatted_data['related_topics'].empty:
# Handle different column names that might be present in the related topics DataFrame
if 'topic' not in formatted_data['related_topics'].columns:
if 'topic_title' in formatted_data['related_topics'].columns:
formatted_data['related_topics'] = formatted_data['related_topics'].rename(columns={'topic_title': 'topic'})
elif 'topic' not in formatted_data['related_topics'].columns and len(formatted_data['related_topics'].columns) > 0:
# If we have a DataFrame but no 'topic' column, use the first column as 'topic'
first_col = formatted_data['related_topics'].columns[0]
formatted_data['related_topics'] = formatted_data['related_topics'].rename(columns={first_col: 'topic'})
if 'value' not in formatted_data['related_topics'].columns and len(formatted_data['related_topics'].columns) > 1:
# If we have a second column, use it as 'value'
second_col = formatted_data['related_topics'].columns[1]
formatted_data['related_topics'] = formatted_data['related_topics'].rename(columns={second_col: 'value'})
elif 'value' not in formatted_data['related_topics'].columns:
# If no 'value' column exists, add one with default values
formatted_data['related_topics']['value'] = 0
# Clear the progress container after completion
progress_container.empty()
progress_bar.empty()
return formatted_data
except Exception as e:
logger.error(f"Error in Google Trends analysis: {e}")
update_progress(f"Error in Google Trends analysis: {str(e)}", level="error", progress=100)
# Clear the progress container after error
progress_container.empty()
progress_bar.empty()
return {
'related_keywords': [],
'interest_over_time': pd.DataFrame(),
'regional_interest': pd.DataFrame(),
'related_queries': pd.DataFrame(),
'related_topics': pd.DataFrame()
}
def metaphor_extract_titles_or_text(json_data, return_titles=True):
"""
Extract either titles or text from the given JSON structure.
Args:
json_data (list): List of Result objects in JSON format.
return_titles (bool): If True, return titles. If False, return text.
Returns:
list: List of titles or text.
"""
if return_titles:
return [(result.title) for result in json_data]
else:
return [result.text for result in json_data]
def extract_info(json_data, info_type):
"""
Extract information (titles, peopleAlsoAsk, or relatedSearches) from the given JSON.
Args:
json_data (dict): The JSON data.
info_type (str): The type of information to extract (titles, peopleAlsoAsk, relatedSearches).
Returns:
list or None: A list containing the requested information, or None if the type is invalid.
"""
if info_type == "titles":
return [result.get("title") for result in json_data.get("organic", [])]
elif info_type == "peopleAlsoAsk":
return [item.get("question") for item in json_data.get("peopleAlsoAsk", [])]
elif info_type == "relatedSearches":
return [item.get("query") for item in json_data.get("relatedSearches", [])]
else:
print("Invalid info_type. Please use 'titles', 'peopleAlsoAsk', or 'relatedSearches'.")
return None
def tavily_extract_information(json_data, keyword):
"""
Extract information from the given JSON based on the specified keyword.
Args:
json_data (dict): The JSON data.
keyword (str): The keyword (title, content, answer, follow-query).
Returns:
list or str: The extracted information based on the keyword.
"""
if keyword == 'titles':
return [result['title'] for result in json_data['results']]
elif keyword == 'content':
return [result['content'] for result in json_data['results']]
elif keyword == 'answer':
return json_data['answer']
elif keyword == 'follow-query':
return json_data['follow_up_questions']
else:
return f"Invalid keyword: {keyword}"

View File

@@ -0,0 +1,623 @@
import os
import sys
import pandas as pd
from io import StringIO
from pathlib import Path
from metaphor_python import Metaphor
from datetime import datetime, timedelta
import streamlit as st
from loguru import logger
from tqdm import tqdm
from tabulate import tabulate
from collections import namedtuple
import textwrap
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from dotenv import load_dotenv
load_dotenv(Path('../../.env'))
from exa_py import Exa
from tenacity import (retry, stop_after_attempt, wait_random_exponential,)# for exponential backoff
from .gpt_summarize_web_content import summarize_web_content
from .gpt_competitor_analysis import summarize_competitor_content
from .common_utils import save_in_file, cfg_search_param
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_metaphor_client():
"""
Get the Metaphor client.
Returns:
Metaphor: An instance of the Metaphor client.
"""
METAPHOR_API_KEY = os.environ.get('METAPHOR_API_KEY')
if not METAPHOR_API_KEY:
logger.error("METAPHOR_API_KEY environment variable not set!")
st.error("METAPHOR_API_KEY environment variable not set!")
raise ValueError("METAPHOR_API_KEY environment variable not set!")
return Exa(METAPHOR_API_KEY)
def metaphor_rag_search():
""" Mainly used for researching blog sections. """
metaphor = get_metaphor_client()
query = "blog research" # Example query, this can be parameterized as needed
    search_response = metaphor.search(query)
    if not search_response or not getattr(search_response, 'results', None):
        logger.error("No results found for the query.")
        st.error("No results found for the query.")
        return None
    # Process the results (placeholder: pull out titles from the Result objects)
    processed_results = [result.title for result in search_response.results]
# Display the results
st.write("Search Results:")
st.write(processed_results)
return processed_results
def metaphor_find_similar(similar_url, usecase, num_results=5, start_published_date=None, end_published_date=None,
include_domains=None, exclude_domains=None, include_text=None, exclude_text=None,
summary_query=None, progress_bar=None):
"""Find similar content using Metaphor API."""
try:
# Initialize progress if not provided
if progress_bar is None:
progress_bar = st.progress(0.0)
# Update progress
progress_bar.progress(0.1, text="Initializing search...")
# Get Metaphor client
metaphor = get_metaphor_client()
logger.info(f"Initialized Metaphor client for URL: {similar_url}")
# Prepare search parameters
search_params = {
"highlights": True,
"num_results": num_results,
}
# Add optional parameters if provided
if start_published_date:
search_params["start_published_date"] = start_published_date
if end_published_date:
search_params["end_published_date"] = end_published_date
if include_domains:
search_params["include_domains"] = include_domains
if exclude_domains:
search_params["exclude_domains"] = exclude_domains
if include_text:
search_params["include_text"] = include_text
if exclude_text:
search_params["exclude_text"] = exclude_text
# Add summary query
if summary_query:
search_params["summary"] = summary_query
else:
search_params["summary"] = {"query": f"Find {usecase} similar to the given URL."}
logger.debug(f"Search parameters: {search_params}")
# Update progress
progress_bar.progress(0.2, text="Preparing search parameters...")
# Make API call
logger.info("Calling Metaphor API find_similar_and_contents...")
search_response = metaphor.find_similar_and_contents(
similar_url,
**search_params
)
if search_response and hasattr(search_response, 'results'):
competitors = search_response.results
total_results = len(competitors)
# Update progress
progress_bar.progress(0.3, text=f"Found {total_results} results...")
# Process results
processed_results = []
for i, result in enumerate(competitors):
# Calculate progress as decimal (0.0-1.0)
progress = 0.3 + (0.6 * (i / total_results))
progress_text = f"Processing result {i+1}/{total_results}..."
progress_bar.progress(progress, text=progress_text)
# Process each result
processed_result = {
"Title": result.title,
"URL": result.url,
"Content Summary": result.text if hasattr(result, 'text') else "No content available"
}
processed_results.append(processed_result)
# Update progress
progress_bar.progress(0.9, text="Finalizing results...")
# Create DataFrame
df = pd.DataFrame(processed_results)
# Update progress
progress_bar.progress(1.0, text="Analysis completed!")
return df, search_response
else:
logger.warning("No results found in search response")
progress_bar.progress(1.0, text="No results found")
return pd.DataFrame(), search_response
except Exception as e:
logger.error(f"Error in metaphor_find_similar: {str(e)}", exc_info=True)
if progress_bar:
progress_bar.progress(1.0, text="Error occurred during analysis")
raise
def calculate_date_range(time_range: str) -> tuple:
"""
Calculate start and end dates based on time range selection.
Args:
time_range (str): One of 'past_day', 'past_week', 'past_month', 'past_year', 'anytime'
Returns:
tuple: (start_date, end_date) in ISO format with milliseconds
"""
now = datetime.utcnow()
end_date = now.strftime('%Y-%m-%dT%H:%M:%S.999Z')
if time_range == 'past_day':
start_date = (now - timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%S.000Z')
elif time_range == 'past_week':
start_date = (now - timedelta(weeks=1)).strftime('%Y-%m-%dT%H:%M:%S.000Z')
elif time_range == 'past_month':
start_date = (now - timedelta(days=30)).strftime('%Y-%m-%dT%H:%M:%S.000Z')
elif time_range == 'past_year':
start_date = (now - timedelta(days=365)).strftime('%Y-%m-%dT%H:%M:%S.000Z')
else: # anytime
start_date = None
end_date = None
return start_date, end_date
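# Quick sketch of the helper above: 'past_week' yields a start date one week before the
# current UTC time and both values are ISO-8601 strings with millisecond precision,
# while 'anytime' yields (None, None) so no date filter is sent to the API.
def _example_date_ranges():
    for choice in ("past_day", "past_week", "anytime"):
        start, end = calculate_date_range(choice)
        print(f"{choice}: start={start}, end={end}")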
def metaphor_search_articles(query, search_options: dict = None):
"""
Search for articles using the Metaphor/Exa API.
Args:
query (str): The search query.
search_options (dict): Search configuration options including:
- num_results (int): Number of results to retrieve
- use_autoprompt (bool): Whether to use autoprompt
- include_domains (list): List of domains to include
- time_range (str): One of 'past_day', 'past_week', 'past_month', 'past_year', 'anytime'
- exclude_domains (list): List of domains to exclude
Returns:
dict: Search results and metadata
"""
exa = get_metaphor_client()
try:
# Initialize default search options
if search_options is None:
search_options = {}
# Get config parameters or use defaults
try:
include_domains, _, num_results, _ = cfg_search_param('exa')
except Exception as cfg_err:
logger.warning(f"Failed to load config parameters: {cfg_err}. Using defaults.")
include_domains = None
num_results = 10
# Calculate date range based on time_range option
time_range = search_options.get('time_range', 'anytime')
start_published_date, end_published_date = calculate_date_range(time_range)
# Prepare search parameters
search_params = {
'num_results': search_options.get('num_results', num_results),
'summary': True, # Always get summaries
'include_domains': search_options.get('include_domains', include_domains),
'use_autoprompt': search_options.get('use_autoprompt', True),
}
# Add date parameters only if they are not None
if start_published_date:
search_params['start_published_date'] = start_published_date
if end_published_date:
search_params['end_published_date'] = end_published_date
logger.info(f"Exa web search with params: {search_params} and Query: {query}")
# Execute search
search_response = exa.search_and_contents(
query,
**search_params
)
if not search_response or not hasattr(search_response, 'results'):
logger.warning("No results returned from Exa search")
return None
# Get cost information safely
try:
cost_dollars = {
'total': float(search_response.cost_dollars['total']),
} if hasattr(search_response, 'cost_dollars') else None
except Exception as cost_err:
logger.warning(f"Error processing cost information: {cost_err}")
cost_dollars = None
# Format response to match expected structure
formatted_response = {
"data": {
"requestId": getattr(search_response, 'request_id', None),
"resolvedSearchType": "neural",
"results": [
{
"id": result.url,
"title": result.title,
"url": result.url,
"publishedDate": result.published_date if hasattr(result, 'published_date') else None,
"author": getattr(result, 'author', None),
"score": getattr(result, 'score', 0),
"summary": result.summary if hasattr(result, 'summary') else None,
"text": result.text if hasattr(result, 'text') else None,
"image": getattr(result, 'image', None),
"favicon": getattr(result, 'favicon', None)
}
for result in search_response.results
],
"costDollars": cost_dollars
}
}
# Get AI-generated answer from Metaphor
try:
exa_answer = get_exa_answer(query)
if exa_answer:
formatted_response.update(exa_answer)
except Exception as exa_err:
logger.warning(f"Error getting Exa answer: {exa_err}")
# Get AI-generated answer from Tavily
try:
# Import the function directly from the module
import importlib
tavily_module = importlib.import_module('lib.ai_web_researcher.tavily_ai_search')
if hasattr(tavily_module, 'do_tavily_ai_search'):
tavily_response = tavily_module.do_tavily_ai_search(query)
if tavily_response and 'answer' in tavily_response:
formatted_response.update({
"tavily_answer": tavily_response.get("answer"),
"tavily_citations": tavily_response.get("citations", []),
"tavily_cost_dollars": tavily_response.get("costDollars", {"total": 0})
})
else:
logger.warning("do_tavily_ai_search function not found in tavily_ai_search module")
except Exception as tavily_err:
logger.warning(f"Error getting Tavily answer: {tavily_err}")
# Return the formatted response without displaying it
# The display will be handled by gpt_web_researcher
return formatted_response
except Exception as e:
logger.error(f"Error in Exa searching articles: {e}")
return None
def streamlit_display_metaphor_results(metaphor_response, search_keywords=None):
"""Display Metaphor search results in Streamlit."""
if not metaphor_response:
st.error("No search results found.")
return
# Add debug logging
logger.debug(f"Displaying Metaphor results. Type: {type(metaphor_response)}")
if isinstance(metaphor_response, dict):
logger.debug(f"Metaphor response keys: {metaphor_response.keys()}")
# Initialize session state variables if they don't exist
if 'search_insights' not in st.session_state:
st.session_state.search_insights = None
if 'metaphor_response' not in st.session_state:
st.session_state.metaphor_response = None
if 'insights_generated' not in st.session_state:
st.session_state.insights_generated = False
# Store the current response in session state
st.session_state.metaphor_response = metaphor_response
# Display search results
st.subheader("🔍 Search Results")
# Calculate metrics - handle different data structures
results = []
if isinstance(metaphor_response, dict):
if 'data' in metaphor_response and 'results' in metaphor_response['data']:
results = metaphor_response['data']['results']
elif 'results' in metaphor_response:
results = metaphor_response['results']
total_results = len(results)
avg_relevance = sum(r.get('score', 0) for r in results) / total_results if total_results > 0 else 0
# Display metrics
col1, col2 = st.columns(2)
with col1:
st.metric("Total Results", total_results)
with col2:
st.metric("Average Relevance Score", f"{avg_relevance:.2f}")
# Display AI-generated answers if available
if 'tavily_answer' in metaphor_response or 'metaphor_answer' in metaphor_response:
st.subheader("🤖 AI-Generated Answers")
if 'tavily_answer' in metaphor_response:
st.markdown("**Tavily AI Answer:**")
st.write(metaphor_response['tavily_answer'])
if 'metaphor_answer' in metaphor_response:
st.markdown("**Metaphor AI Answer:**")
st.write(metaphor_response['metaphor_answer'])
# Get Search Insights button
if st.button("Generate Search Insights", key="metaphor_generate_insights_button"):
st.session_state.insights_generated = True
st.rerun()
# Display insights if they exist in session state
if st.session_state.search_insights:
st.subheader("🔍 Search Insights")
st.write(st.session_state.search_insights)
# Display search results in a data editor
st.subheader("📊 Detailed Results")
# Prepare data for display
results_data = []
for result in results:
result_data = {
'Title': result.get('title', ''),
'URL': result.get('url', ''),
'Snippet': result.get('summary', ''),
'Relevance Score': result.get('score', 0),
'Published Date': result.get('publishedDate', '')
}
results_data.append(result_data)
# Create DataFrame
df = pd.DataFrame(results_data)
# Display the DataFrame if it's not empty
if not df.empty:
# Configure columns
st.dataframe(
df,
column_config={
"Title": st.column_config.TextColumn(
"Title",
help="Title of the search result",
width="large",
),
"URL": st.column_config.LinkColumn(
"URL",
help="Link to the search result",
width="medium",
display_text="Visit Article",
),
"Snippet": st.column_config.TextColumn(
"Snippet",
help="Summary of the search result",
width="large",
),
"Relevance Score": st.column_config.NumberColumn(
"Relevance Score",
help="Relevance score of the search result",
format="%.2f",
width="small",
),
"Published Date": st.column_config.DateColumn(
"Published Date",
help="Publication date of the search result",
width="medium",
),
},
hide_index=True,
)
# Add popover for snippets
st.markdown("""
<style>
.snippet-popover {
position: relative;
display: inline-block;
}
.snippet-popover .snippet-content {
visibility: hidden;
width: 300px;
background-color: #f9f9f9;
color: #333;
text-align: left;
border-radius: 6px;
padding: 10px;
position: absolute;
z-index: 1;
bottom: 125%;
left: 50%;
margin-left: -150px;
opacity: 0;
transition: opacity 0.3s;
box-shadow: 0 2px 5px rgba(0,0,0,0.2);
}
.snippet-popover:hover .snippet-content {
visibility: visible;
opacity: 1;
}
</style>
""", unsafe_allow_html=True)
# Display snippets with popover
st.subheader("📝 Snippets")
for i, result in enumerate(results):
snippet = result.get('summary', '')
if snippet:
st.markdown(f"""
<div class="snippet-popover">
<strong>{result.get('title', '')}</strong>
<div class="snippet-content">
{snippet}
</div>
</div>
""", unsafe_allow_html=True)
else:
st.info("No detailed results available.")
# Add a collapsible section for the raw JSON data
with st.expander("Research Results (JSON)", expanded=False):
st.json(metaphor_response)
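# A minimal sketch (an assumption, not the authoritative schema) of the response shape
# the display logic above can consume. Results may also arrive nested under
# {"data": {"results": [...]}}, and 'tavily_answer' / 'metaphor_answer' keys, when
# present, enable the AI-answer section:
#
#   {
#       "results": [
#           {"title": "...", "url": "https://...", "summary": "...",
#            "score": 0.87, "publishedDate": "2025-01-01"}
#       ]
#   }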
def metaphor_news_summarizer(news_keywords):
""" build a LLM-based news summarizer app with the Exa API to keep us up-to-date
with the latest news on a given topic.
"""
exa = get_metaphor_client()
# FIXME: Needs to be user defined.
one_week_ago = (datetime.now() - timedelta(days=7))
date_cutoff = one_week_ago.strftime("%Y-%m-%d")
search_response = exa.search_and_contents(
news_keywords, use_autoprompt=True, start_published_date=date_cutoff
)
urls = [result.url for result in search_response.results]
print("URLs:")
for url in urls:
print(url)
def print_search_result(contents_response):
# Tabulate the data
table_headers = ["URL", "Title", "Summary"]
table_data = [(result.url, result.title, result.text) for result in contents_response]
table = tabulate(table_data,
headers=table_headers,
tablefmt="fancy_grid",
colalign=["left", "left", "left"],
maxcolwidths=[20, 20, 70])
# Convert table_data to a DataFrame and render it in the Streamlit UI
# (pandas and streamlit are already imported at module level in this file)
df = pd.DataFrame(table_data, columns=["URL", "Title", "Summary"])
st.table(df)
print(table)
# Save the combined table to a file
try:
save_in_file(table)
except Exception as save_results_err:
logger.error(f"Failed to save search results: {save_results_err}")
def metaphor_scholar_search(query, include_domains=None, time_range="anytime"):
"""
Search for papers using the Metaphor API.
Args:
query (str): The search query.
include_domains (list): List of domains to include.
time_range (str): Time range for published articles ("day", "week", "month", "year", "anytime").
Returns:
MetaphorResponse: The response from the Metaphor API.
"""
client = get_metaphor_client()
try:
if time_range == "day":
start_published_date = (datetime.utcnow() - timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%SZ')
elif time_range == "week":
start_published_date = (datetime.utcnow() - timedelta(weeks=1)).strftime('%Y-%m-%dT%H:%M:%SZ')
elif time_range == "month":
start_published_date = (datetime.utcnow() - timedelta(weeks=4)).strftime('%Y-%m-%dT%H:%M:%SZ')
elif time_range == "year":
start_published_date = (datetime.utcnow() - timedelta(days=365)).strftime('%Y-%m-%dT%H:%M:%SZ')
else:
start_published_date = None
response = client.search(query, include_domains=include_domains, start_published_date=start_published_date, use_autoprompt=True)
return response
except Exception as e:
logger.error(f"Error in searching papers: {e}")
def get_exa_answer(query: str, system_prompt: str = None) -> dict:
"""
Get an AI-generated answer for a query using Exa's answer endpoint.
Args:
query (str): The search query to get an answer for
system_prompt (str, optional): Custom system prompt for the LLM. If None, uses default prompt.
Returns:
dict: Response containing answer, citations, and cost information
{
"answer": str,
"citations": list[dict],
"costDollars": dict
}
"""
exa = get_metaphor_client()
try:
# Use default system prompt if none provided
if system_prompt is None:
system_prompt = (
"I am doing research to write factual content. "
"Help me find answers for content generation task. "
"Provide detailed, well-structured answers with clear citations."
)
logger.info(f"Getting Exa answer for query: {query}")
logger.debug(f"Using system prompt: {system_prompt}")
# Make the API call to get an answer. Note: the prepared system_prompt above is
# currently only logged and not forwarded to exa.answer().
result = exa.answer(
query,
model="exa",
text=True # Include full text in citations
)
if not result or not result.get('answer'):
logger.warning("No answer received from Exa")
return None
# Format response to match expected structure
response = {
"answer": result.get('answer'),
"citations": result.get('citations', []),
"costDollars": result.get('costDollars', {"total": 0})
}
return response
except Exception as e:
logger.error(f"Error getting Exa answer: {e}")
return None
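# Example usage (a sketch; whether the Exa SDK returns a dict here is an assumption
# made by the code above -- adjust the key access if your exa_py version returns an object):
#
#   result = get_exa_answer("What are the main approaches to retrieval augmented generation?")
#   if result:
#       print(result["answer"])
#       for citation in result["citations"]:
#           print(citation)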

View File

@@ -0,0 +1,218 @@
"""
This Python script uses the Tavily AI service to perform advanced searches based on specified keywords and options.
It retrieves Tavily AI search results, pretty-prints them using Rich and Tabulate, and provides additional
information such as the answer to the search query and follow-up questions.
Features:
- Utilizes the Tavily AI service for advanced searches.
- Retrieves API keys from the environment variables loaded from a .env file.
- Configures logging with Loguru for informative messages.
- Implements a retry mechanism using Tenacity to handle transient failures during Tavily searches.
- Displays search results, including titles, snippets, and links, in a visually appealing table using Tabulate and Rich.
Usage:
- Ensure the necessary API keys are set in the .env file.
- Run the script to perform a Tavily AI search with specified keywords and options.
- The search results, including titles, snippets, and links, are displayed in a formatted table.
- Additional information, such as the answer to the search query and follow-up questions, is presented in separate tables.
Modifications:
- To modify the script, update the environment variables in the .env file with the required API keys.
- Adjust the search parameters, such as keywords and search depth, in the `do_tavily_ai_search` function as needed.
- Customize logging configurations and table formatting according to preferences.
To-Do (TBD):
- Consider adding further enhancements or customization based on specific use cases.
"""
import os
from pathlib import Path
import sys
from dotenv import load_dotenv
from loguru import logger
from tavily import TavilyClient
from rich import print
from tabulate import tabulate
# Load environment variables from .env file
load_dotenv(Path('../../.env'))
import streamlit as st
# Configure logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from .common_utils import save_in_file, cfg_search_param
from tenacity import retry, stop_after_attempt, wait_random_exponential
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def do_tavily_ai_search(keywords, max_results=5, include_domains=None, search_depth="advanced", **kwargs):
"""
Get Tavily AI search results based on specified keywords and options.
"""
# Run Tavily search
logger.info(f"Running Tavily search on: {keywords}")
# Retrieve API keys
api_key = os.getenv('TAVILY_API_KEY')
if not api_key:
raise ValueError("API keys for Tavily is Not set.")
# Initialize Tavily client
try:
client = TavilyClient(api_key=api_key)
except Exception as err:
logger.error(f"Failed to create Tavily client. Check TAVILY_API_KEY: {err}")
raise
try:
# Create search parameters exactly matching Tavily's API format
tavily_search_result = client.search(
query=keywords,
search_depth="advanced",
time_range="year",
include_answer="advanced",
include_domains=[""] if not include_domains else include_domains,
max_results=max_results
)
if tavily_search_result:
print_result_table(tavily_search_result)
streamlit_display_results(tavily_search_result)
return tavily_search_result
return None
except Exception as err:
logger.error(f"Failed to do Tavily Research: {err}")
raise
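# The display helpers below assume (hedged) a Tavily response of roughly this shape;
# only the keys they actually read are shown:
#
#   {
#       "query": "...",
#       "answer": "...",
#       "follow_up_questions": ["..."],
#       "results": [{"title": "...", "content": "...", "url": "https://..."}]
#   }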
def streamlit_display_results(output_data):
"""Display Tavily AI search results in Streamlit UI with enhanced visualization."""
# Display the 'answer' in Streamlit with enhanced styling
answer = output_data.get("answer", "No answer available")
st.markdown("### 🤖 AI-Generated Answer")
st.markdown(f"""
<div style="background-color: #f0f2f6; padding: 20px; border-radius: 10px; border-left: 5px solid #4CAF50;">
{answer}
</div>
""", unsafe_allow_html=True)
# Display follow-up questions if available
follow_up_questions = output_data.get("follow_up_questions", [])
if follow_up_questions:
st.markdown("### ❓ Follow-up Questions")
for i, question in enumerate(follow_up_questions, 1):
st.markdown(f"**{i}.** {question}")
# Prepare data for display with st.data_editor
st.markdown("### 📊 Search Results")
# Create a DataFrame for the results
import pandas as pd
results_data = []
for item in output_data.get("results", []):
title = item.get("title", "")
snippet = item.get("content", "")
link = item.get("url", "")
results_data.append({
"Title": title,
"Content": snippet,
"Link": link
})
if results_data:
df = pd.DataFrame(results_data)
# Display the data editor
st.data_editor(
df,
column_config={
"Title": st.column_config.TextColumn(
"Title",
help="Article title",
width="medium",
),
"Content": st.column_config.TextColumn(
"Content",
help="Click the button below to view full content",
width="large",
),
"Link": st.column_config.LinkColumn(
"Link",
help="Click to visit the website",
width="small",
display_text="Visit Site"
),
},
hide_index=True,
use_container_width=True,
)
# Add popovers for full content display
for item in output_data.get("results", []):
with st.popover(f"View content: {item.get('title', '')[:50]}..."):
st.markdown(item.get("content", ""))
else:
st.info("No results found for your search query.")
def print_result_table(output_data):
""" Pretty print the tavily AI search result. """
# Prepare data for tabulate
table_data = []
for item in output_data.get("results"):
title = item.get("title", "")
snippet = item.get("content", "")
link = item.get("url", "")
table_data.append([title, snippet, link])
# Define table headers
table_headers = ["Title", "Snippet", "Link"]
# Display the table using tabulate
table = tabulate(table_data,
headers=table_headers,
tablefmt="fancy_grid",
colalign=["left", "left", "left"],
maxcolwidths=[30, 60, 30])
# Print the table
print(table)
# Save the combined table to a file
try:
save_in_file(table)
except Exception as save_results_err:
logger.error(f"Failed to save search results: {save_results_err}")
# Display the 'answer' in a table
table_headers = [f"The answer to search query: {output_data.get('query')}"]
table_data = [[output_data.get("answer")]]
table = tabulate(table_data,
headers=table_headers,
tablefmt="fancy_grid",
maxcolwidths=[80])
print(table)
# Save the combined table to a file
try:
save_in_file(table)
except Exception as save_results_err:
logger.error(f"Failed to save search results: {save_results_err}")
# Display the 'follow_up_questions' in a table
if output_data.get("follow_up_questions"):
table_headers = [f"Search Engine follow up questions for query: {output_data.get('query')}"]
table_data = [[output_data.get("follow_up_questions")]]
table = tabulate(table_data,
headers=table_headers,
tablefmt="fancy_grid",
maxcolwidths=[80])
print(table)
try:
save_in_file(table)
except Exception as save_results_err:
logger.error(f"Failed to save search results: {save_results_err}")