"""
|
|
Tavily API Service for ALwrity
|
|
|
|
This service provides web search and research capabilities using the Tavily API,
|
|
which offers AI-powered search with real-time information retrieval.
|
|
|
|
Key Features:
|
|
- Web search with AI-powered results
|
|
- Content extraction and summarization
|
|
- Real-time information retrieval
|
|
- Topic-based search (general, news, finance)
|
|
- Advanced search depth options
|
|
- Cost-effective API usage with caching
|
|
|
|
Dependencies:
|
|
- aiohttp (for async HTTP requests)
|
|
- os (for environment variables)
|
|
- logging (for debugging)
|
|
|
|
Author: ALwrity Team
|
|
Version: 1.0
|
|
Last Updated: January 2025
|
|
"""

import asyncio
import os
import json
import aiohttp
from typing import Dict, List, Optional, Any, Union
from datetime import datetime, timedelta
from loguru import logger
from urllib.parse import urlparse


class TavilyService:
    """
    Service for web search and research using the Tavily API.

    This service provides AI-powered search capabilities to find relevant
    content and information for research purposes.
    """

    def __init__(self):
        """Initialize the Tavily Service with API credentials."""
        self.api_key = os.getenv("TAVILY_API_KEY")
        self.base_url = "https://api.tavily.com"
        self.enabled = False

        # Don't assume key is available at import time in production.
        # Keys may be injected per-request via middleware, so defer init.
        self._try_initialize()

    def _try_initialize(self) -> None:
        """Attempt to (re)initialize the Tavily service from current environment."""
        if self.enabled and self.api_key:
            return
        try:
            self.api_key = os.getenv("TAVILY_API_KEY")
            if not self.api_key:
                # Leave disabled; caller may try again after middleware injection
                logger.warning("TAVILY_API_KEY not configured; Tavily service will be disabled")
                self.enabled = False
                return
            self.enabled = True
            logger.info("Tavily Service initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Tavily service: {e}")
            self.enabled = False

    async def search(
        self,
        query: str,
        topic: str = "general",
        search_depth: str = "basic",
        max_results: int = 10,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        include_answer: Union[bool, str] = False,
        include_raw_content: Union[bool, str] = False,
        include_images: bool = False,
        include_image_descriptions: bool = False,
        include_favicon: bool = False,
        time_range: Optional[str] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        country: Optional[str] = None,
        chunks_per_source: int = 3,
        auto_parameters: bool = False
    ) -> Dict[str, Any]:
        """
        Execute a search query using the Tavily API.

        Args:
            query: The search query to execute
            topic: Category of search (general, news, finance)
            search_depth: Depth of search (basic, advanced) - basic costs 1 credit, advanced costs 2
            max_results: Maximum number of results to return (0-20)
            include_domains: List of domains to specifically include
            exclude_domains: List of domains to specifically exclude
            include_answer: Include LLM-generated answer (basic/advanced/true/false)
            include_raw_content: Include raw page content (markdown/text/true/false)
            include_images: Include image search results
            include_image_descriptions: Include image descriptions
            include_favicon: Include favicon URLs
            time_range: Time range filter (day, week, month, year, d, w, m, y)
            start_date: Start date filter (YYYY-MM-DD)
            end_date: End date filter (YYYY-MM-DD)
            country: Country filter (boost results from a specific country)
            chunks_per_source: Maximum chunks per source (1-3, only for advanced search)
            auto_parameters: Auto-configure parameters based on the query

        Returns:
            Dictionary containing search results
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Tavily Service is not enabled - API key missing")

            logger.info(f"Starting Tavily search for: {query}")

            # Build request payload
            payload = {
                "api_key": self.api_key,
                "query": query,
                "topic": topic,
                "search_depth": search_depth,
                "max_results": min(max_results, 20),  # Tavily limit
                "include_favicon": include_favicon
            }

            # Add optional parameters
            if include_domains:
                payload["include_domains"] = include_domains[:300]  # Tavily limit

            if exclude_domains:
                payload["exclude_domains"] = exclude_domains[:150]  # Tavily limit

            if include_answer:
                payload["include_answer"] = include_answer

            if include_raw_content:
                payload["include_raw_content"] = include_raw_content

            if include_images:
                payload["include_images"] = include_images
                if include_image_descriptions:
                    payload["include_image_descriptions"] = include_image_descriptions

            if time_range:
                payload["time_range"] = time_range

            if start_date:
                payload["start_date"] = start_date

            if end_date:
                payload["end_date"] = end_date

            if country and topic == "general":
                payload["country"] = country

            if search_depth == "advanced" and 1 <= chunks_per_source <= 3:
                payload["chunks_per_source"] = chunks_per_source

            if auto_parameters:
                payload["auto_parameters"] = True

            # Make API request
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    f"{self.base_url}/search",
                    json=payload,
                    headers={"Content-Type": "application/json"},
                    timeout=aiohttp.ClientTimeout(total=60)
                ) as response:
                    if response.status == 200:
                        result = await response.json()
                        logger.info(f"Tavily search completed successfully. Found {len(result.get('results', []))} results.")

                        # Process and structure results
                        processed_results = self._process_search_results(result, query)

                        return {
                            "success": True,
                            "query": result.get("query", query),
                            "answer": result.get("answer"),  # If include_answer was requested
                            "results": processed_results,
                            "images": result.get("images", []),
                            "response_time": result.get("response_time"),
                            "request_id": result.get("request_id"),
                            "auto_parameters": result.get("auto_parameters"),
                            "total_results": len(processed_results),
                            "timestamp": datetime.utcnow().isoformat()
                        }
                    else:
                        error_text = await response.text()
                        logger.error(f"Tavily API error: {response.status} - {error_text}")
                        raise RuntimeError(f"Tavily API error: {response.status} - {error_text}")

        except asyncio.TimeoutError:
            # aiohttp surfaces request timeouts as asyncio.TimeoutError;
            # aiohttp.ClientTimeout is only a configuration object, not an exception.
            logger.error("Tavily API request timed out")
            return {
                "success": False,
                "error": "Request timed out",
                "details": "The search request took too long to complete"
            }
        except Exception as e:
            logger.error(f"Error in Tavily search: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during search"
            }
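
    # Illustrative call (sketch, assuming `service = TavilyService()` and an async context):
    # the parameter names mirror the signature above, and the keys noted are the ones
    # assembled in the success branch.
    #
    #     result = await service.search(
    #         query="generative AI adoption",
    #         topic="news",
    #         search_depth="advanced",
    #         max_results=5,
    #         include_answer="basic",
    #         time_range="month",
    #     )
    #     # On success, result["success"] is True and the dict also carries "query",
    #     # "answer", "results", "images", "response_time", "request_id",
    #     # "auto_parameters", "total_results", and "timestamp".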

    def _process_search_results(self, api_response: Dict[str, Any], query: str) -> List[Dict[str, Any]]:
        """
        Process and structure the Tavily API response into a standardized format.

        Args:
            api_response: Raw response from the Tavily API
            query: Original search query

        Returns:
            List of processed search results
        """
        results = []
        raw_results = api_response.get("results", [])

        for result in raw_results:
            try:
                # Extract domain from URL
                url = result.get("url", "")
                domain = urlparse(url).netloc if url else ""

                # Relevance score (Tavily provides a score field)
                relevance_score = result.get("score", 0.5)

                processed_result = {
                    "url": url,
                    "domain": domain,
                    "title": result.get("title", ""),
                    "content": result.get("content", ""),
                    "raw_content": result.get("raw_content"),  # If include_raw_content was requested
                    "score": relevance_score,
                    "relevance_score": relevance_score,  # Alias for compatibility
                    "favicon": result.get("favicon"),
                    "published_date": result.get("published_date"),
                }

                results.append(processed_result)

            except Exception as e:
                logger.warning(f"Error processing Tavily result: {str(e)}")
                continue

        # Sort by relevance score (highest first)
        results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)

        return results

    async def search_industry_trends(
        self,
        topic: str,
        industry: str,
        max_results: int = 10,
        search_depth: str = "basic"
    ) -> Dict[str, Any]:
        """
        Search for current industry trends and insights.

        Args:
            topic: The specific topic to research
            industry: The industry context for the search
            max_results: Maximum number of search results to return
            search_depth: Depth of search (basic or advanced)

        Returns:
            Dictionary containing search results with industry context
        """
        # Build industry-specific query
        search_query = f"{topic} {industry} trends insights"

        # Use news topic for current trends
        return await self.search(
            query=search_query,
            topic="news" if search_depth == "basic" else "general",
            search_depth=search_depth,
            max_results=max_results,
            include_answer="basic",
            include_favicon=True,
            time_range="month"  # Last month for current trends
        )

    async def discover_competitors(
        self,
        user_url: str,
        num_results: int = 10,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        industry_context: Optional[str] = None,
        website_analysis_data: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Discover competitors for a given website using Tavily search.

        Args:
            user_url: The website URL to find competitors for
            num_results: Number of competitor results to return
            include_domains: List of domains to include in the search
            exclude_domains: List of domains to exclude from the search
            industry_context: Industry context for better competitor discovery
            website_analysis_data: Optional website analysis output used to refine the query

        Returns:
            Dictionary containing competitor analysis results
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Tavily Service is not enabled - API key missing")

            logger.info(f"Starting competitor discovery for: {user_url}")

            # Extract user domain for exclusion (copy the list so the caller's argument isn't mutated)
            user_domain = urlparse(user_url).netloc
            exclude_domains_list = list(exclude_domains) if exclude_domains else []
            exclude_domains_list.append(user_domain)

            # Build search query
            query_parts = ["similar websites", "competitors"]
            if industry_context:
                query_parts.append(f"in {industry_context}")

            # Extract insights from website analysis if available
            if website_analysis_data:
                analysis = website_analysis_data.get('analysis', {})
                if 'target_audience' in analysis:
                    audience = analysis['target_audience']
                    if isinstance(audience, dict) and 'primary_audience' in audience:
                        query_parts.append(audience['primary_audience'])

            search_query = " ".join(query_parts)

            # Perform search
            search_result = await self.search(
                query=search_query,
                topic="general",
                search_depth="advanced",  # Use advanced for better competitor discovery
                max_results=num_results,
                include_domains=include_domains,
                exclude_domains=exclude_domains_list,
                include_favicon=True,
                chunks_per_source=3
            )

            if not search_result.get("success"):
                return search_result

            # Process results into competitor format
            competitors = []
            for result in search_result.get("results", []):
                competitor_data = {
                    "url": result.get("url"),
                    "domain": result.get("domain"),
                    "title": result.get("title"),
                    "summary": result.get("content", ""),
                    "relevance_score": result.get("relevance_score", 0.5),
                    "favicon": result.get("favicon"),
                    "published_date": result.get("published_date"),
                    "highlights": self._extract_highlights(result.get("content", "")),
                    "competitive_insights": self._extract_competitive_insights(result),
                    "content_insights": self._analyze_content_quality(result)
                }
                competitors.append(competitor_data)

            logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")

            return {
                "success": True,
                "user_url": user_url,
                "competitors": competitors,
                "total_competitors": len(competitors),
                "analysis_timestamp": datetime.utcnow().isoformat(),
                "industry_context": industry_context,
                "request_id": search_result.get("request_id")
            }

        except Exception as e:
            logger.error(f"Error in competitor discovery: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during competitor discovery"
            }

    def _extract_highlights(self, content: str, num_sentences: int = 3) -> List[str]:
        """Extract key highlights from content."""
        if not content:
            return []

        # Simple sentence extraction (can be enhanced with NLP)
        sentences = [s.strip() for s in content.split('.') if s.strip()]
        return sentences[:num_sentences]

    def _extract_competitive_insights(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Extract competitive insights from a search result (placeholder implementation)."""
        content = result.get("content", "")
        title = result.get("title", "")

        return {
            "business_model": "unknown",
            "target_audience": "unknown",
            "key_differentiators": []
        }

    def _analyze_content_quality(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze content quality metrics (placeholder implementation)."""
        content = result.get("content", "")

        return {
            "content_focus": "general",
            "content_quality": "medium",
            "publishing_frequency": "unknown"
        }
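

# Minimal usage sketch (illustrative only, not part of the service API).
# Assumes TAVILY_API_KEY is set in the environment; the query and URL below
# are placeholder values chosen for demonstration.
if __name__ == "__main__":
    async def _demo() -> None:
        service = TavilyService()

        # Basic web search
        search_result = await service.search("AI writing assistants", max_results=5)
        print("search success:", search_result.get("success"),
              "results:", search_result.get("total_results", 0))

        # Competitor discovery for an example site
        competitors = await service.discover_competitors(
            "https://example.com", num_results=5, industry_context="content marketing"
        )
        print("competitor discovery success:", competitors.get("success"),
              "found:", competitors.get("total_competitors", 0))

    asyncio.run(_demo())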