Base code

2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions
--- a/backend/services/research/tavily_service.py
+++ b/backend/services/research/tavily_service.py
@@ -0,0 +1,425 @@
+"""
+Tavily API Service for ALwrity
+
+This service provides web search and research capabilities using the Tavily API,
+which offers AI-powered search with real-time information retrieval.
+
+Key Features:
+- Web search with AI-powered results
+- Content extraction and summarization
+- Real-time information retrieval
+- Topic-based search (general, news, finance)
+- Advanced search depth options
+- Cost-effective API usage with caching
+
+Dependencies:
+- aiohttp (for async HTTP requests)
+- os (for environment variables)
+- loguru (for logging)
+
+Author: ALwrity Team
+Version: 1.0
+Last Updated: January 2025
+"""
+
+import os
+import json
+import aiohttp
+from typing import Dict, List, Optional, Any, Union
+from datetime import datetime, timedelta
+from loguru import logger
+from urllib.parse import urlparse
+
+
+class TavilyService:
+    """
+    Service for web search and research using the Tavily API.
+    
+    This service provides AI-powered search capabilities to find relevant
+    content and information for research purposes.
+    """
+    
+    def __init__(self):
+        """
+        Set up endpoint configuration and attempt an initial key lookup.
+
+        The instance starts out disabled; ``_try_initialize`` switches
+        ``enabled`` on only when ``TAVILY_API_KEY`` is present in the
+        environment at construction time.
+        """
+        self.enabled = False
+        self.base_url = "https://api.tavily.com"
+        self.api_key = os.getenv("TAVILY_API_KEY")
+
+        # A missing key is not an error at this point: it may only be
+        # injected later (per-request, via middleware), and the request
+        # methods call _try_initialize() again before doing any work.
+        self._try_initialize()
+
+    def _try_initialize(self) -> None:
+        """
+        Read ``TAVILY_API_KEY`` from the environment and update ``enabled``.
+
+        Does nothing when the service is already enabled with a non-empty
+        key. Otherwise the key is re-read: a non-empty value enables the
+        service, an empty/missing value leaves it disabled. Any exception
+        raised along the way is logged and leaves the service disabled.
+
+        NOTE(review): because of the early exit, a key that changes in the
+        environment after a successful initialization is never picked up --
+        confirm this is intended for per-request key injection.
+        """
+        already_ready = bool(self.enabled and self.api_key)
+        if not already_ready:
+            try:
+                env_key = os.getenv("TAVILY_API_KEY")
+                self.api_key = env_key
+                if env_key:
+                    self.enabled = True
+                    logger.info("Tavily Service initialized successfully")
+                else:
+                    # Stay disabled; a later call may succeed once the key
+                    # has been injected.
+                    logger.warning("TAVILY_API_KEY not configured; Tavily service will be disabled")
+                    self.enabled = False
+            except Exception as e:
+                logger.error(f"Failed to initialize Tavily service: {e}")
+                self.enabled = False
+    
+    async def search(
+        self,
+        query: str,
+        topic: str = "general",
+        search_depth: str = "basic",
+        max_results: int = 10,
+        include_domains: Optional[List[str]] = None,
+        exclude_domains: Optional[List[str]] = None,
+        include_answer: Union[bool, str] = False,
+        include_raw_content: Union[bool, str] = False,
+        include_images: bool = False,
+        include_image_descriptions: bool = False,
+        include_favicon: bool = False,
+        time_range: Optional[str] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        country: Optional[str] = None,
+        chunks_per_source: int = 3,
+        auto_parameters: bool = False
+    ) -> Dict[str, Any]:
+        """
+        Execute a search query using Tavily API.
+
+        Failures (missing API key, non-200 response, timeout, transport or
+        decoding errors) are logged and reported through the returned
+        dictionary rather than raised to the caller.
+
+        Args:
+            query: The search query to execute
+            topic: Category of search (general, news, finance)
+            search_depth: Depth of search (basic, advanced) - basic costs 1 credit, advanced costs 2
+            max_results: Maximum number of results to return (0-20); values above 20 are capped
+            include_domains: List of domains to specifically include (first 300 are sent)
+            exclude_domains: List of domains to specifically exclude (first 150 are sent)
+            include_answer: Include LLM-generated answer (basic/advanced/true/false)
+            include_raw_content: Include raw HTML content (markdown/text/true/false)
+            include_images: Include image search results
+            include_image_descriptions: Include image descriptions (only sent together with include_images)
+            include_favicon: Include favicon URLs
+            time_range: Time range filter (day, week, month, year, d, w, m, y)
+            start_date: Start date filter (YYYY-MM-DD)
+            end_date: End date filter (YYYY-MM-DD)
+            country: Country filter (boost results from specific country); only sent when topic is "general"
+            chunks_per_source: Maximum chunks per source (1-3, only for advanced search)
+            auto_parameters: Auto-configure parameters based on query
+
+        Returns:
+            On success, a dictionary with ``success=True`` and the keys
+            ``query``, ``answer``, ``results``, ``images``, ``response_time``,
+            ``request_id``, ``auto_parameters``, ``total_results`` and
+            ``timestamp``. On failure, a dictionary with ``success=False``,
+            ``error`` and ``details``.
+        """
+        # Local import: needed for the timeout handler below without relying
+        # on the module-level import block.
+        import asyncio
+
+        try:
+            # Ensure we pick up any per-request injected key
+            self._try_initialize()
+            if not self.enabled:
+                raise ValueError("Tavily Service is not enabled - API key missing")
+
+            logger.info(f"Starting Tavily search for: {query}")
+
+            # Build request payload
+            payload = {
+                "api_key": self.api_key,
+                "query": query,
+                "topic": topic,
+                "search_depth": search_depth,
+                "max_results": min(max_results, 20),  # Tavily limit
+                "include_favicon": include_favicon
+            }
+
+            # Add optional parameters (falsy values are simply omitted)
+            if include_domains:
+                payload["include_domains"] = include_domains[:300]  # Tavily limit
+
+            if exclude_domains:
+                payload["exclude_domains"] = exclude_domains[:150]  # Tavily limit
+
+            if include_answer:
+                payload["include_answer"] = include_answer
+
+            if include_raw_content:
+                payload["include_raw_content"] = include_raw_content
+
+            if include_images:
+                payload["include_images"] = include_images
+                if include_image_descriptions:
+                    payload["include_image_descriptions"] = include_image_descriptions
+
+            if time_range:
+                payload["time_range"] = time_range
+
+            if start_date:
+                payload["start_date"] = start_date
+
+            if end_date:
+                payload["end_date"] = end_date
+
+            if country and topic == "general":
+                payload["country"] = country
+
+            if search_depth == "advanced" and 1 <= chunks_per_source <= 3:
+                payload["chunks_per_source"] = chunks_per_source
+
+            if auto_parameters:
+                payload["auto_parameters"] = True
+
+            # Make API request
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"{self.base_url}/search",
+                    json=payload,
+                    headers={"Content-Type": "application/json"},
+                    timeout=aiohttp.ClientTimeout(total=60)
+                ) as response:
+                    if response.status == 200:
+                        result = await response.json()
+                        logger.info(f"Tavily search completed successfully. Found {len(result.get('results', []))} results.")
+
+                        # Process and structure results
+                        processed_results = self._process_search_results(result, query)
+
+                        return {
+                            "success": True,
+                            "query": result.get("query", query),
+                            "answer": result.get("answer"),  # If include_answer was requested
+                            "results": processed_results,
+                            "images": result.get("images", []),
+                            "response_time": result.get("response_time"),
+                            "request_id": result.get("request_id"),
+                            "auto_parameters": result.get("auto_parameters"),
+                            "total_results": len(processed_results),
+                            "timestamp": datetime.utcnow().isoformat()
+                        }
+                    else:
+                        error_text = await response.text()
+                        logger.error(f"Tavily API error: {response.status} - {error_text}")
+                        # Raised on purpose: converted to an error dictionary
+                        # by the generic handler below.
+                        raise RuntimeError(f"Tavily API error: {response.status} - {error_text}")
+
+        except asyncio.TimeoutError:
+            # aiohttp.ClientTimeout is a configuration object, not an
+            # exception class; naming it in an ``except`` clause raises
+            # TypeError as soon as any exception reaches the clause. Timeouts
+            # surface as asyncio.TimeoutError (or a subclass of it).
+            logger.error("Tavily API request timed out")
+            return {
+                "success": False,
+                "error": "Request timed out",
+                "details": "The search request took too long to complete"
+            }
+        except Exception as e:
+            logger.error(f"Error in Tavily search: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "details": "An unexpected error occurred during search"
+            }
+    
+    def _process_search_results(self, api_response: Dict[str, Any], query: str) -> List[Dict[str, Any]]:
+        """
+        Process and structure Tavily API response into standardized format.
+
+        Each item of ``api_response["results"]`` is reduced to a fixed set of
+        keys (``url``, ``domain``, ``title``, ``content``, ``raw_content``,
+        ``score``, ``relevance_score``, ``favicon``, ``published_date``).
+        Items that cannot be processed are logged and skipped. A missing or
+        ``None`` score falls back to 0.5 so the final sort never has to
+        compare ``None`` with a number.
+
+        Args:
+            api_response: Raw response from Tavily API
+            query: Original search query (currently unused; kept for
+                interface compatibility)
+
+        Returns:
+            List of processed search results, highest score first
+        """
+        results = []
+        # ``or []`` also covers an explicit ``"results": null`` in the payload
+        raw_results = api_response.get("results") or []
+
+        for result in raw_results:
+            try:
+                # Extract domain from URL
+                url = result.get("url", "")
+                domain = urlparse(url).netloc if url else ""
+
+                # Tavily provides a ``score`` field; treat absent and null alike
+                relevance_score = result.get("score")
+                if relevance_score is None:
+                    relevance_score = 0.5
+
+                processed_result = {
+                    "url": url,
+                    "domain": domain,
+                    "title": result.get("title", ""),
+                    "content": result.get("content", ""),
+                    "raw_content": result.get("raw_content"),  # If include_raw_content was requested
+                    "score": relevance_score,
+                    "relevance_score": relevance_score,  # Alias for compatibility
+                    "favicon": result.get("favicon"),
+                    "published_date": result.get("published_date"),
+                }
+
+                results.append(processed_result)
+
+            except Exception as e:
+                # Best effort: one malformed item must not discard the rest
+                logger.warning(f"Error processing Tavily result: {str(e)}")
+                continue
+
+        # Sort by relevance score (highest first); every entry has the key
+        results.sort(key=lambda item: item["relevance_score"], reverse=True)
+
+        return results
+    
+    async def search_industry_trends(
+        self,
+        topic: str,
+        industry: str,
+        max_results: int = 10,
+        search_depth: str = "basic"
+    ) -> Dict[str, Any]:
+        """
+        Look up recent trends for a topic within an industry.
+
+        Builds the query "<topic> <industry> trends insights" and delegates
+        to ``search`` restricted to the last month, requesting a basic
+        LLM answer and favicons.
+
+        Args:
+            topic: The specific topic to research
+            industry: The industry context for the search
+            max_results: Maximum number of search results to return
+            search_depth: Depth of search (basic or advanced)
+
+        Returns:
+            The dictionary returned by ``search`` for the composed query
+        """
+        # The Tavily "news" category is used only for basic-depth lookups;
+        # any other depth searches the "general" category.
+        if search_depth == "basic":
+            tavily_topic = "news"
+        else:
+            tavily_topic = "general"
+
+        request_options = {
+            "query": "{} {} trends insights".format(topic, industry),
+            "topic": tavily_topic,
+            "search_depth": search_depth,
+            "max_results": max_results,
+            "include_answer": "basic",
+            "include_favicon": True,
+            "time_range": "month",  # Last month for current trends
+        }
+        return await self.search(**request_options)
+    
+    async def discover_competitors(
+        self,
+        user_url: str,
+        num_results: int = 10,
+        include_domains: Optional[List[str]] = None,
+        exclude_domains: Optional[List[str]] = None,
+        industry_context: Optional[str] = None,
+        website_analysis_data: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """
+        Discover competitors for a given website using Tavily search.
+
+        Runs an advanced-depth ``search`` for "similar websites competitors"
+        (optionally refined by industry and audience), excluding the user's
+        own domain, and reshapes every hit into a competitor record.
+
+        Args:
+            user_url: The website URL to find competitors for
+            num_results: Number of competitor results to return
+            include_domains: List of domains to include in search
+            exclude_domains: List of domains to exclude from search (the
+                caller's list is copied, never modified)
+            industry_context: Industry context for better competitor discovery
+            website_analysis_data: Optional prior analysis; when
+                ``["analysis"]["target_audience"]["primary_audience"]`` is a
+                non-empty string it is appended to the search query
+
+        Returns:
+            On success, a dictionary with ``success=True``, ``user_url``,
+            ``competitors``, ``total_competitors``, ``analysis_timestamp``,
+            ``industry_context`` and ``request_id``. If the underlying search
+            fails, its failure dictionary is returned unchanged; any other
+            error yields ``success=False`` with ``error`` and ``details``.
+        """
+        try:
+            # Ensure we pick up any per-request injected key
+            self._try_initialize()
+            if not self.enabled:
+                raise ValueError("Tavily Service is not enabled - API key missing")
+
+            logger.info(f"Starting competitor discovery for: {user_url}")
+
+            # Extract user domain for exclusion
+            user_domain = urlparse(user_url).netloc
+            if not user_domain and user_url:
+                # Scheme-less input such as "example.com/page" parses with an
+                # empty netloc; re-parse it as a network-path reference.
+                user_domain = urlparse(f"//{user_url}").netloc
+
+            # Copy so that the caller's list is not mutated by the append
+            exclude_domains_list = list(exclude_domains) if exclude_domains else []
+            if user_domain and user_domain not in exclude_domains_list:
+                exclude_domains_list.append(user_domain)
+
+            # Build search query
+            query_parts = ["similar websites", "competitors"]
+            if industry_context:
+                query_parts.append(f"in {industry_context}")
+
+            # Extract insights from website analysis if available
+            if website_analysis_data:
+                analysis = website_analysis_data.get('analysis') or {}
+                audience = analysis.get('target_audience') if isinstance(analysis, dict) else None
+                if isinstance(audience, dict):
+                    primary_audience = audience.get('primary_audience')
+                    # Only plain text can be joined into the query string
+                    if isinstance(primary_audience, str) and primary_audience:
+                        query_parts.append(primary_audience)
+
+            search_query = " ".join(query_parts)
+
+            # Perform search
+            search_result = await self.search(
+                query=search_query,
+                topic="general",
+                search_depth="advanced",  # Use advanced for better competitor discovery
+                max_results=num_results,
+                include_domains=include_domains,
+                exclude_domains=exclude_domains_list,
+                include_favicon=True,
+                chunks_per_source=3
+            )
+
+            if not search_result.get("success"):
+                # Propagate the search failure dictionary as-is
+                return search_result
+
+            # Process results into competitor format
+            competitors = []
+            for result in search_result.get("results", []):
+                competitor_data = {
+                    "url": result.get("url"),
+                    "domain": result.get("domain"),
+                    "title": result.get("title"),
+                    "summary": result.get("content", ""),
+                    "relevance_score": result.get("relevance_score", 0.5),
+                    "favicon": result.get("favicon"),
+                    "published_date": result.get("published_date"),
+                    "highlights": self._extract_highlights(result.get("content", "")),
+                    "competitive_insights": self._extract_competitive_insights(result),
+                    "content_insights": self._analyze_content_quality(result)
+                }
+                competitors.append(competitor_data)
+
+            logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")
+
+            return {
+                "success": True,
+                "user_url": user_url,
+                "competitors": competitors,
+                "total_competitors": len(competitors),
+                "analysis_timestamp": datetime.utcnow().isoformat(),
+                "industry_context": industry_context,
+                "request_id": search_result.get("request_id")
+            }
+
+        except Exception as e:
+            logger.error(f"Error in competitor discovery: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "details": "An unexpected error occurred during competitor discovery"
+            }
+    
+    def _extract_highlights(self, content: str, num_sentences: int = 3) -> List[str]:
+        """
+        Return the leading sentence-like fragments of ``content``.
+
+        The text is split on '.' characters, each piece is stripped of
+        surrounding whitespace, empty pieces are dropped and the first
+        ``num_sentences`` survivors are returned. Falsy content yields an
+        empty list.
+        """
+        if content:
+            # Naive period-based splitting; no NLP sentence detection here
+            trimmed_pieces = (piece.strip() for piece in content.split('.'))
+            return list(filter(None, trimmed_pieces))[:num_sentences]
+        return []
+    
+    def _extract_competitive_insights(self, result: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Return competitive-insight fields for a search result.
+
+        Placeholder implementation: ``result`` is not inspected and the same
+        default values are returned for every input.
+
+        Args:
+            result: Processed search result (currently unused)
+
+        Returns:
+            Dictionary with ``business_model``, ``target_audience`` and
+            ``key_differentiators`` set to their defaults
+        """
+        return {
+            "business_model": "unknown",
+            "target_audience": "unknown",
+            "key_differentiators": []
+        }
+    
+    def _analyze_content_quality(self, result: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Return content-quality fields for a search result.
+
+        Placeholder implementation: ``result`` is not inspected and the same
+        default values are returned for every input.
+
+        Args:
+            result: Processed search result (currently unused)
+
+        Returns:
+            Dictionary with ``content_focus``, ``content_quality`` and
+            ``publishing_frequency`` set to their defaults
+        """
+        return {
+            "content_focus": "general",
+            "content_quality": "medium",
+            "publishing_frequency": "unknown"
+        }
+