""" Tavily API Service for ALwrity This service provides web search and research capabilities using the Tavily API, which offers AI-powered search with real-time information retrieval. Key Features: - Web search with AI-powered results - Content extraction and summarization - Real-time information retrieval - Topic-based search (general, news, finance) - Advanced search depth options - Cost-effective API usage with caching Dependencies: - aiohttp (for async HTTP requests) - os (for environment variables) - logging (for debugging) Author: ALwrity Team Version: 1.0 Last Updated: January 2025 """ import os import json import aiohttp from typing import Dict, List, Optional, Any, Union from datetime import datetime, timedelta from loguru import logger from urllib.parse import urlparse class TavilyService: """ Service for web search and research using the Tavily API. This service provides AI-powered search capabilities to find relevant content and information for research purposes. """ def __init__(self): """Initialize the Tavily Service with API credentials.""" self.api_key = os.getenv("TAVILY_API_KEY") self.base_url = "https://api.tavily.com" self.enabled = False # Don't assume key is available at import time in production. # Keys may be injected per-request via middleware, so defer init. self._try_initialize() def _try_initialize(self) -> None: """Attempt to (re)initialize the Tavily service from current environment.""" if self.enabled and self.api_key: return try: self.api_key = os.getenv("TAVILY_API_KEY") if not self.api_key: # Leave disabled; caller may try again after middleware injection logger.warning("TAVILY_API_KEY not configured; Tavily service will be disabled") self.enabled = False return self.enabled = True logger.info("Tavily Service initialized successfully") except Exception as e: logger.error(f"Failed to initialize Tavily service: {e}") self.enabled = False async def search( self, query: str, topic: str = "general", search_depth: str = "basic", max_results: int = 10, include_domains: Optional[List[str]] = None, exclude_domains: Optional[List[str]] = None, include_answer: Union[bool, str] = False, include_raw_content: Union[bool, str] = False, include_images: bool = False, include_image_descriptions: bool = False, include_favicon: bool = False, time_range: Optional[str] = None, start_date: Optional[str] = None, end_date: Optional[str] = None, country: Optional[str] = None, chunks_per_source: int = 3, auto_parameters: bool = False ) -> Dict[str, Any]: """ Execute a search query using Tavily API. Args: query: The search query to execute topic: Category of search (general, news, finance) search_depth: Depth of search (basic, advanced) - basic costs 1 credit, advanced costs 2 max_results: Maximum number of results to return (0-20) include_domains: List of domains to specifically include exclude_domains: List of domains to specifically exclude include_answer: Include LLM-generated answer (basic/advanced/true/false) include_raw_content: Include raw HTML content (markdown/text/true/false) include_images: Include image search results include_image_descriptions: Include image descriptions include_favicon: Include favicon URLs time_range: Time range filter (day, week, month, year, d, w, m, y) start_date: Start date filter (YYYY-MM-DD) end_date: End date filter (YYYY-MM-DD) country: Country filter (boost results from specific country) chunks_per_source: Maximum chunks per source (1-3, only for advanced search) auto_parameters: Auto-configure parameters based on query Returns: Dictionary containing search results """ try: # Ensure we pick up any per-request injected key self._try_initialize() if not self.enabled: raise ValueError("Tavily Service is not enabled - API key missing") logger.info(f"Starting Tavily search for: {query}") # Build request payload payload = { "api_key": self.api_key, "query": query, "topic": topic, "search_depth": search_depth, "max_results": min(max_results, 20), # Tavily limit "include_favicon": include_favicon } # Add optional parameters if include_domains: payload["include_domains"] = include_domains[:300] # Tavily limit if exclude_domains: payload["exclude_domains"] = exclude_domains[:150] # Tavily limit if include_answer: payload["include_answer"] = include_answer if include_raw_content: payload["include_raw_content"] = include_raw_content if include_images: payload["include_images"] = include_images if include_image_descriptions: payload["include_image_descriptions"] = include_image_descriptions if time_range: payload["time_range"] = time_range if start_date: payload["start_date"] = start_date if end_date: payload["end_date"] = end_date if country and topic == "general": payload["country"] = country if search_depth == "advanced" and 1 <= chunks_per_source <= 3: payload["chunks_per_source"] = chunks_per_source if auto_parameters: payload["auto_parameters"] = True # Make API request async with aiohttp.ClientSession() as session: async with session.post( f"{self.base_url}/search", json=payload, headers={"Content-Type": "application/json"}, timeout=aiohttp.ClientTimeout(total=60) ) as response: if response.status == 200: result = await response.json() logger.info(f"Tavily search completed successfully. Found {len(result.get('results', []))} results.") # Process and structure results processed_results = self._process_search_results(result, query) return { "success": True, "query": result.get("query", query), "answer": result.get("answer"), # If include_answer was requested "results": processed_results, "images": result.get("images", []), "response_time": result.get("response_time"), "request_id": result.get("request_id"), "auto_parameters": result.get("auto_parameters"), "total_results": len(processed_results), "timestamp": datetime.utcnow().isoformat() } else: error_text = await response.text() logger.error(f"Tavily API error: {response.status} - {error_text}") raise RuntimeError(f"Tavily API error: {response.status} - {error_text}") except aiohttp.ClientTimeout: logger.error("Tavily API request timed out") return { "success": False, "error": "Request timed out", "details": "The search request took too long to complete" } except Exception as e: logger.error(f"Error in Tavily search: {str(e)}") return { "success": False, "error": str(e), "details": "An unexpected error occurred during search" } def _process_search_results(self, api_response: Dict[str, Any], query: str) -> List[Dict[str, Any]]: """ Process and structure Tavily API response into standardized format. Args: api_response: Raw response from Tavily API query: Original search query Returns: List of processed search results """ results = [] raw_results = api_response.get("results", []) for result in raw_results: try: # Extract domain from URL url = result.get("url", "") domain = urlparse(url).netloc if url else "" # Calculate relevance score (Tavily provides score field) relevance_score = result.get("score", 0.5) processed_result = { "url": url, "domain": domain, "title": result.get("title", ""), "content": result.get("content", ""), "raw_content": result.get("raw_content"), # If include_raw_content was requested "score": relevance_score, "relevance_score": relevance_score, # Alias for compatibility "favicon": result.get("favicon"), "published_date": result.get("published_date"), } results.append(processed_result) except Exception as e: logger.warning(f"Error processing Tavily result: {str(e)}") continue # Sort by relevance score (highest first) results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True) return results async def search_industry_trends( self, topic: str, industry: str, max_results: int = 10, search_depth: str = "basic" ) -> Dict[str, Any]: """ Search for current industry trends and insights. Args: topic: The specific topic to research industry: The industry context for the search max_results: Maximum number of search results to return search_depth: Depth of search (basic or advanced) Returns: Dictionary containing search results with industry context """ # Build industry-specific query search_query = f"{topic} {industry} trends insights" # Use news topic for current trends return await self.search( query=search_query, topic="news" if search_depth == "basic" else "general", search_depth=search_depth, max_results=max_results, include_answer="basic", include_favicon=True, time_range="month" # Last month for current trends ) async def discover_competitors( self, user_url: str, num_results: int = 10, include_domains: Optional[List[str]] = None, exclude_domains: Optional[List[str]] = None, industry_context: Optional[str] = None, website_analysis_data: Optional[Dict[str, Any]] = None ) -> Dict[str, Any]: """ Discover competitors for a given website using Tavily search. Args: user_url: The website URL to find competitors for num_results: Number of competitor results to return include_domains: List of domains to include in search exclude_domains: List of domains to exclude from search industry_context: Industry context for better competitor discovery Returns: Dictionary containing competitor analysis results """ try: # Ensure we pick up any per-request injected key self._try_initialize() if not self.enabled: raise ValueError("Tavily Service is not enabled - API key missing") logger.info(f"Starting competitor discovery for: {user_url}") # Extract user domain for exclusion user_domain = urlparse(user_url).netloc exclude_domains_list = exclude_domains or [] exclude_domains_list.append(user_domain) # Build search query query_parts = ["similar websites", "competitors"] if industry_context: query_parts.append(f"in {industry_context}") # Extract insights from website analysis if available if website_analysis_data: analysis = website_analysis_data.get('analysis', {}) if 'target_audience' in analysis: audience = analysis['target_audience'] if isinstance(audience, dict) and 'primary_audience' in audience: query_parts.append(audience['primary_audience']) search_query = " ".join(query_parts) # Perform search search_result = await self.search( query=search_query, topic="general", search_depth="advanced", # Use advanced for better competitor discovery max_results=num_results, include_domains=include_domains, exclude_domains=exclude_domains_list, include_favicon=True, chunks_per_source=3 ) if not search_result.get("success"): return search_result # Process results into competitor format competitors = [] for result in search_result.get("results", []): competitor_data = { "url": result.get("url"), "domain": result.get("domain"), "title": result.get("title"), "summary": result.get("content", ""), "relevance_score": result.get("relevance_score", 0.5), "favicon": result.get("favicon"), "published_date": result.get("published_date"), "highlights": self._extract_highlights(result.get("content", "")), "competitive_insights": self._extract_competitive_insights(result), "content_insights": self._analyze_content_quality(result) } competitors.append(competitor_data) logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}") return { "success": True, "user_url": user_url, "competitors": competitors, "total_competitors": len(competitors), "analysis_timestamp": datetime.utcnow().isoformat(), "industry_context": industry_context, "request_id": search_result.get("request_id") } except Exception as e: logger.error(f"Error in competitor discovery: {str(e)}") return { "success": False, "error": str(e), "details": "An unexpected error occurred during competitor discovery" } def _extract_highlights(self, content: str, num_sentences: int = 3) -> List[str]: """Extract key highlights from content.""" if not content: return [] # Simple sentence extraction (can be enhanced with NLP) sentences = [s.strip() for s in content.split('.') if s.strip()] return sentences[:num_sentences] def _extract_competitive_insights(self, result: Dict[str, Any]) -> Dict[str, Any]: """Extract competitive insights from search result.""" content = result.get("content", "") title = result.get("title", "") return { "business_model": "unknown", "target_audience": "unknown", "key_differentiators": [] } def _analyze_content_quality(self, result: Dict[str, Any]) -> Dict[str, Any]: """Analyze content quality metrics.""" content = result.get("content", "") return { "content_focus": "general", "content_quality": "medium", "publishing_frequency": "unknown" }