"""
|
|
Tavily API Service for ALwrity
|
|
|
|
This service provides web search and research capabilities using the Tavily API,
|
|
which offers AI-powered search with real-time information retrieval.
|
|
|
|
Key Features:
|
|
- Web search with AI-powered results
|
|
- Content extraction and summarization
|
|
- Real-time information retrieval
|
|
- Topic-based search (general, news, finance)
|
|
- Advanced search depth options
|
|
- Cost-effective API usage with caching
|
|
|
|
Dependencies:
|
|
- aiohttp (for async HTTP requests)
|
|
- os (for environment variables)
|
|
- logging (for debugging)
|
|
|
|
Author: ALwrity Team
|
|
Version: 1.0
|
|
Last Updated: January 2025
|
|
"""

import asyncio
import os
import json
import aiohttp
from typing import Dict, List, Optional, Any, Union
from datetime import datetime, timedelta
from loguru import logger
from urllib.parse import urlparse


class TavilyService:
    """
    Service for web search and research using the Tavily API.

    This service provides AI-powered search capabilities to find relevant
    content and information for research purposes.
    """

    def __init__(self):
        """Initialize the Tavily Service with API credentials."""
        self.api_key = os.getenv("TAVILY_API_KEY")
        self.base_url = "https://api.tavily.com"
        self.enabled = False

        # Don't assume key is available at import time in production.
        # Keys may be injected per-request via middleware, so defer init.
        self._try_initialize()

    def _try_initialize(self) -> None:
        """Attempt to (re)initialize the Tavily service from current environment."""
        if self.enabled and self.api_key:
            return
        try:
            self.api_key = os.getenv("TAVILY_API_KEY")
            if not self.api_key:
                # Leave disabled; caller may try again after middleware injection
                logger.warning("TAVILY_API_KEY not configured; Tavily service will be disabled")
                self.enabled = False
                return
            self.enabled = True
            logger.info("Tavily Service initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Tavily service: {e}")
            self.enabled = False

    async def search(
        self,
        query: str,
        topic: str = "general",
        search_depth: str = "basic",
        max_results: int = 10,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        include_answer: Union[bool, str] = False,
        include_raw_content: Union[bool, str] = False,
        include_images: bool = False,
        include_image_descriptions: bool = False,
        include_favicon: bool = False,
        time_range: Optional[str] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        country: Optional[str] = None,
        chunks_per_source: int = 3,
        auto_parameters: bool = False
    ) -> Dict[str, Any]:
        """
        Execute a search query using the Tavily API.

        Args:
            query: The search query to execute
            topic: Category of search (general, news, finance)
            search_depth: Depth of search (basic, advanced) - basic costs 1 credit, advanced costs 2
            max_results: Maximum number of results to return (0-20)
            include_domains: List of domains to specifically include
            exclude_domains: List of domains to specifically exclude
            include_answer: Include LLM-generated answer (basic/advanced/true/false)
            include_raw_content: Include raw page content (markdown/text/true/false)
            include_images: Include image search results
            include_image_descriptions: Include image descriptions
            include_favicon: Include favicon URLs
            time_range: Time range filter (day, week, month, year, d, w, m, y)
            start_date: Start date filter (YYYY-MM-DD)
            end_date: End date filter (YYYY-MM-DD)
            country: Country filter (boost results from a specific country)
            chunks_per_source: Maximum chunks per source (1-3, only for advanced search)
            auto_parameters: Auto-configure parameters based on the query

        Returns:
            Dictionary containing search results
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Tavily Service is not enabled - API key missing")

            logger.info(f"Starting Tavily search for: {query}")

            # Build request payload
            payload = {
                "api_key": self.api_key,
                "query": query,
                "topic": topic,
                "search_depth": search_depth,
                "max_results": min(max_results, 20),  # Tavily limit
                "include_favicon": include_favicon
            }

            # Add optional parameters
            if include_domains:
                payload["include_domains"] = include_domains[:300]  # Tavily limit

            if exclude_domains:
                payload["exclude_domains"] = exclude_domains[:150]  # Tavily limit

            if include_answer:
                payload["include_answer"] = include_answer

            if include_raw_content:
                payload["include_raw_content"] = include_raw_content

            if include_images:
                payload["include_images"] = include_images
                if include_image_descriptions:
                    payload["include_image_descriptions"] = include_image_descriptions

            if time_range:
                payload["time_range"] = time_range

            if start_date:
                payload["start_date"] = start_date

            if end_date:
                payload["end_date"] = end_date

            if country and topic == "general":
                payload["country"] = country

            if search_depth == "advanced" and 1 <= chunks_per_source <= 3:
                payload["chunks_per_source"] = chunks_per_source

            if auto_parameters:
                payload["auto_parameters"] = True

            # Make API request
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    f"{self.base_url}/search",
                    json=payload,
                    headers={"Content-Type": "application/json"},
                    timeout=aiohttp.ClientTimeout(total=60)
                ) as response:
                    if response.status == 200:
                        result = await response.json()
                        logger.info(f"Tavily search completed successfully. Found {len(result.get('results', []))} results.")

                        # Process and structure results
                        processed_results = self._process_search_results(result, query)

                        return {
                            "success": True,
                            "query": result.get("query", query),
                            "answer": result.get("answer"),  # If include_answer was requested
                            "results": processed_results,
                            "images": result.get("images", []),
                            "response_time": result.get("response_time"),
                            "request_id": result.get("request_id"),
                            "auto_parameters": result.get("auto_parameters"),
                            "total_results": len(processed_results),
                            "timestamp": datetime.utcnow().isoformat()
                        }
                    else:
                        error_text = await response.text()
                        logger.error(f"Tavily API error: {response.status} - {error_text}")
                        raise RuntimeError(f"Tavily API error: {response.status} - {error_text}")

        except asyncio.TimeoutError:
            # aiohttp surfaces request timeouts as asyncio.TimeoutError;
            # aiohttp.ClientTimeout is only a configuration object, not an exception.
            logger.error("Tavily API request timed out")
            return {
                "success": False,
                "error": "Request timed out",
                "details": "The search request took too long to complete"
            }
        except Exception as e:
            logger.error(f"Error in Tavily search: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during search"
            }
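
    # Illustrative call (sketch, assuming `service = TavilyService()` and an async context):
    # the parameter names mirror the signature above, and the keys noted are the ones
    # assembled in the success branch.
    #
    #     result = await service.search(
    #         query="generative AI adoption",
    #         topic="news",
    #         search_depth="advanced",
    #         max_results=5,
    #         include_answer="basic",
    #         time_range="month",
    #     )
    #     # On success, result["success"] is True and the dict also carries "query",
    #     # "answer", "results", "images", "response_time", "request_id",
    #     # "auto_parameters", "total_results", and "timestamp".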

    def _process_search_results(self, api_response: Dict[str, Any], query: str) -> List[Dict[str, Any]]:
        """
        Process and structure the Tavily API response into a standardized format.

        Args:
            api_response: Raw response from the Tavily API
            query: Original search query

        Returns:
            List of processed search results
        """
        results = []
        raw_results = api_response.get("results", [])

        for result in raw_results:
            try:
                # Extract domain from URL
                url = result.get("url", "")
                domain = urlparse(url).netloc if url else ""

                # Relevance score (Tavily provides a score field)
                relevance_score = result.get("score", 0.5)

                processed_result = {
                    "url": url,
                    "domain": domain,
                    "title": result.get("title", ""),
                    "content": result.get("content", ""),
                    "raw_content": result.get("raw_content"),  # If include_raw_content was requested
                    "score": relevance_score,
                    "relevance_score": relevance_score,  # Alias for compatibility
                    "favicon": result.get("favicon"),
                    "published_date": result.get("published_date"),
                }

                results.append(processed_result)

            except Exception as e:
                logger.warning(f"Error processing Tavily result: {str(e)}")
                continue

        # Sort by relevance score (highest first)
        results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)

        return results

    async def search_industry_trends(
        self,
        topic: str,
        industry: str,
        max_results: int = 10,
        search_depth: str = "basic"
    ) -> Dict[str, Any]:
        """
        Search for current industry trends and insights.

        Args:
            topic: The specific topic to research
            industry: The industry context for the search
            max_results: Maximum number of search results to return
            search_depth: Depth of search (basic or advanced)

        Returns:
            Dictionary containing search results with industry context
        """
        # Build industry-specific query
        search_query = f"{topic} {industry} trends insights"

        # Use news topic for current trends
        return await self.search(
            query=search_query,
            topic="news" if search_depth == "basic" else "general",
            search_depth=search_depth,
            max_results=max_results,
            include_answer="basic",
            include_favicon=True,
            time_range="month"  # Last month for current trends
        )

    async def discover_competitors(
        self,
        user_url: str,
        num_results: int = 10,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        industry_context: Optional[str] = None,
        website_analysis_data: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Discover competitors for a given website using Tavily search.

        Args:
            user_url: The website URL to find competitors for
            num_results: Number of competitor results to return
            include_domains: List of domains to include in the search
            exclude_domains: List of domains to exclude from the search
            industry_context: Industry context for better competitor discovery
            website_analysis_data: Optional website analysis output used to refine the query

        Returns:
            Dictionary containing competitor analysis results
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Tavily Service is not enabled - API key missing")

            logger.info(f"Starting competitor discovery for: {user_url}")

            # Extract user domain for exclusion (copy the list so the caller's argument isn't mutated)
            user_domain = urlparse(user_url).netloc
            exclude_domains_list = list(exclude_domains) if exclude_domains else []
            exclude_domains_list.append(user_domain)

            # Build search query
            query_parts = ["similar websites", "competitors"]
            if industry_context:
                query_parts.append(f"in {industry_context}")

            # Extract insights from website analysis if available
            if website_analysis_data:
                analysis = website_analysis_data.get('analysis', {})
                if 'target_audience' in analysis:
                    audience = analysis['target_audience']
                    if isinstance(audience, dict) and 'primary_audience' in audience:
                        query_parts.append(audience['primary_audience'])

            search_query = " ".join(query_parts)

            # Perform search
            search_result = await self.search(
                query=search_query,
                topic="general",
                search_depth="advanced",  # Use advanced for better competitor discovery
                max_results=num_results,
                include_domains=include_domains,
                exclude_domains=exclude_domains_list,
                include_favicon=True,
                chunks_per_source=3
            )

            if not search_result.get("success"):
                return search_result

            # Process results into competitor format
            competitors = []
            for result in search_result.get("results", []):
                competitor_data = {
                    "url": result.get("url"),
                    "domain": result.get("domain"),
                    "title": result.get("title"),
                    "summary": result.get("content", ""),
                    "relevance_score": result.get("relevance_score", 0.5),
                    "favicon": result.get("favicon"),
                    "published_date": result.get("published_date"),
                    "highlights": self._extract_highlights(result.get("content", "")),
                    "competitive_insights": self._extract_competitive_insights(result),
                    "content_insights": self._analyze_content_quality(result)
                }
                competitors.append(competitor_data)

            logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")

            return {
                "success": True,
                "user_url": user_url,
                "competitors": competitors,
                "total_competitors": len(competitors),
                "analysis_timestamp": datetime.utcnow().isoformat(),
                "industry_context": industry_context,
                "request_id": search_result.get("request_id")
            }

        except Exception as e:
            logger.error(f"Error in competitor discovery: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during competitor discovery"
            }

    def _extract_highlights(self, content: str, num_sentences: int = 3) -> List[str]:
        """Extract key highlights from content."""
        if not content:
            return []

        # Simple sentence extraction (can be enhanced with NLP)
        sentences = [s.strip() for s in content.split('.') if s.strip()]
        return sentences[:num_sentences]

    def _extract_competitive_insights(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Extract competitive insights from a search result (placeholder implementation)."""
        content = result.get("content", "")
        title = result.get("title", "")

        return {
            "business_model": "unknown",
            "target_audience": "unknown",
            "key_differentiators": []
        }

    def _analyze_content_quality(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze content quality metrics (placeholder implementation)."""
        content = result.get("content", "")

        return {
            "content_focus": "general",
            "content_quality": "medium",
            "publishing_frequency": "unknown"
        }
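

# Minimal usage sketch (illustrative only, not part of the service API).
# Assumes TAVILY_API_KEY is set in the environment; the query and URL below
# are placeholder values chosen for demonstration.
if __name__ == "__main__":
    async def _demo() -> None:
        service = TavilyService()

        # Basic web search
        search_result = await service.search("AI writing assistants", max_results=5)
        print("search success:", search_result.get("success"),
              "results:", search_result.get("total_results", 0))

        # Competitor discovery for an example site
        competitors = await service.discover_competitors(
            "https://example.com", num_results=5, industry_context="content marketing"
        )
        print("competitor discovery success:", competitors.get("success"),
              "found:", competitors.get("total_competitors", 0))

    asyncio.run(_demo())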