Base code
This commit is contained in:
425
backend/services/research/tavily_service.py
Normal file
425
backend/services/research/tavily_service.py
Normal file
@@ -0,0 +1,425 @@
|
||||
"""
|
||||
Tavily API Service for ALwrity
|
||||
|
||||
This service provides web search and research capabilities using the Tavily API,
|
||||
which offers AI-powered search with real-time information retrieval.
|
||||
|
||||
Key Features:
|
||||
- Web search with AI-powered results
|
||||
- Content extraction and summarization
|
||||
- Real-time information retrieval
|
||||
- Topic-based search (general, news, finance)
|
||||
- Advanced search depth options
|
||||
- Cost-effective API usage with caching
|
||||
|
||||
Dependencies:
|
||||
- aiohttp (for async HTTP requests)
|
||||
- os (for environment variables)
|
||||
- loguru (for logging and debugging)
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
Last Updated: January 2025
|
||||
"""
|
||||
|
||||
import asyncio
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Union
from urllib.parse import urlparse

import aiohttp
from loguru import logger
|
||||
|
||||
|
||||
class TavilyService:
    """
    Service for web search and research using the Tavily API.

    This service provides AI-powered search capabilities to find relevant
    content and information for research purposes.

    The public coroutines (`search`, `search_industry_trends`,
    `discover_competitors`) do not raise on failure: they return a dictionary
    with ``"success": False`` plus ``"error"`` and ``"details"`` keys.
    """

    # Client-side caps applied to the request before it is sent.
    _MAX_RESULTS_LIMIT = 20
    _MAX_INCLUDE_DOMAINS = 300
    _MAX_EXCLUDE_DOMAINS = 150
    _REQUEST_TIMEOUT_SECONDS = 60

    def __init__(self):
        """Initialize the Tavily Service with API credentials."""
        self.api_key = os.getenv("TAVILY_API_KEY")
        self.base_url = "https://api.tavily.com"
        self.enabled = False

        # Don't assume key is available at import time in production.
        # Keys may be injected per-request via middleware, so defer init.
        self._try_initialize()

    def _try_initialize(self) -> None:
        """Attempt to (re)initialize the Tavily service from current environment.

        Does nothing when the service is already enabled with a key. Otherwise
        re-reads ``TAVILY_API_KEY`` and sets ``self.enabled`` accordingly.
        """
        if self.enabled and self.api_key:
            return
        try:
            self.api_key = os.getenv("TAVILY_API_KEY")
            if not self.api_key:
                # Leave disabled; caller may try again after middleware injection
                logger.warning("TAVILY_API_KEY not configured; Tavily service will be disabled")
                self.enabled = False
                return
            self.enabled = True
            logger.info("Tavily Service initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Tavily service: {e}")
            self.enabled = False

    def _build_search_payload(
        self,
        *,
        query: str,
        topic: str,
        search_depth: str,
        max_results: int,
        include_domains: Optional[List[str]],
        exclude_domains: Optional[List[str]],
        include_answer: Union[bool, str],
        include_raw_content: Union[bool, str],
        include_images: bool,
        include_image_descriptions: bool,
        include_favicon: bool,
        time_range: Optional[str],
        start_date: Optional[str],
        end_date: Optional[str],
        country: Optional[str],
        chunks_per_source: int,
        auto_parameters: bool,
    ) -> Dict[str, Any]:
        """Build the JSON body for a ``/search`` request from `search` arguments.

        Optional parameters are only added to the payload when they are set,
        and list/number arguments are clamped to the client-side limits.
        """
        payload: Dict[str, Any] = {
            "api_key": self.api_key,
            "query": query,
            "topic": topic,
            "search_depth": search_depth,
            # Clamp to the documented 0-20 range (previously only the upper bound).
            "max_results": max(0, min(max_results, self._MAX_RESULTS_LIMIT)),
            "include_favicon": include_favicon,
        }

        if include_domains:
            payload["include_domains"] = list(include_domains)[: self._MAX_INCLUDE_DOMAINS]
        if exclude_domains:
            payload["exclude_domains"] = list(exclude_domains)[: self._MAX_EXCLUDE_DOMAINS]
        if include_answer:
            payload["include_answer"] = include_answer
        if include_raw_content:
            payload["include_raw_content"] = include_raw_content

        if include_images:
            payload["include_images"] = include_images
            # Image descriptions are only sent when images are requested.
            if include_image_descriptions:
                payload["include_image_descriptions"] = include_image_descriptions

        if time_range:
            payload["time_range"] = time_range
        if start_date:
            payload["start_date"] = start_date
        if end_date:
            payload["end_date"] = end_date

        # The country filter is only sent for the "general" topic.
        if country and topic == "general":
            payload["country"] = country

        # chunks_per_source is only sent for advanced searches and values 1-3.
        if search_depth == "advanced" and 1 <= chunks_per_source <= 3:
            payload["chunks_per_source"] = chunks_per_source

        if auto_parameters:
            payload["auto_parameters"] = True

        return payload

    async def search(
        self,
        query: str,
        topic: str = "general",
        search_depth: str = "basic",
        max_results: int = 10,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        include_answer: Union[bool, str] = False,
        include_raw_content: Union[bool, str] = False,
        include_images: bool = False,
        include_image_descriptions: bool = False,
        include_favicon: bool = False,
        time_range: Optional[str] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        country: Optional[str] = None,
        chunks_per_source: int = 3,
        auto_parameters: bool = False
    ) -> Dict[str, Any]:
        """
        Execute a search query using Tavily API.

        Args:
            query: The search query to execute
            topic: Category of search (general, news, finance)
            search_depth: Depth of search (basic, advanced) - basic costs 1 credit, advanced costs 2
            max_results: Maximum number of results to return (clamped to 0-20)
            include_domains: List of domains to specifically include
            exclude_domains: List of domains to specifically exclude
            include_answer: Include LLM-generated answer (basic/advanced/true/false)
            include_raw_content: Include raw HTML content (markdown/text/true/false)
            include_images: Include image search results
            include_image_descriptions: Include image descriptions (only sent with include_images)
            include_favicon: Include favicon URLs
            time_range: Time range filter (day, week, month, year, d, w, m, y)
            start_date: Start date filter (YYYY-MM-DD)
            end_date: End date filter (YYYY-MM-DD)
            country: Country filter (only sent when topic is "general")
            chunks_per_source: Maximum chunks per source (1-3, only for advanced search)
            auto_parameters: Auto-configure parameters based on query

        Returns:
            Dictionary containing search results. On success it has
            ``"success": True`` and the processed ``"results"``; on any failure
            (missing key, timeout, non-200 response, unexpected error) it has
            ``"success": False`` with ``"error"`` and ``"details"``.
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Tavily Service is not enabled - API key missing")

            logger.info(f"Starting Tavily search for: {query}")

            payload = self._build_search_payload(
                query=query,
                topic=topic,
                search_depth=search_depth,
                max_results=max_results,
                include_domains=include_domains,
                exclude_domains=exclude_domains,
                include_answer=include_answer,
                include_raw_content=include_raw_content,
                include_images=include_images,
                include_image_descriptions=include_image_descriptions,
                include_favicon=include_favicon,
                time_range=time_range,
                start_date=start_date,
                end_date=end_date,
                country=country,
                chunks_per_source=chunks_per_source,
                auto_parameters=auto_parameters,
            )

            async with aiohttp.ClientSession() as session:
                async with session.post(
                    f"{self.base_url}/search",
                    json=payload,
                    headers={"Content-Type": "application/json"},
                    timeout=aiohttp.ClientTimeout(total=self._REQUEST_TIMEOUT_SECONDS)
                ) as response:
                    if response.status != 200:
                        error_text = await response.text()
                        logger.error(f"Tavily API error: {response.status} - {error_text}")
                        raise RuntimeError(f"Tavily API error: {response.status} - {error_text}")

                    result = await response.json()

            raw_count = len(result.get("results") or [])
            logger.info(f"Tavily search completed successfully. Found {raw_count} results.")

            # Process and structure results
            processed_results = self._process_search_results(result, query)

            return {
                "success": True,
                "query": result.get("query", query),
                "answer": result.get("answer"),  # If include_answer was requested
                "results": processed_results,
                "images": result.get("images", []),
                "response_time": result.get("response_time"),
                "request_id": result.get("request_id"),
                "auto_parameters": result.get("auto_parameters"),
                "total_results": len(processed_results),
                "timestamp": datetime.utcnow().isoformat()
            }

        # aiohttp.ClientTimeout is a configuration object, not an exception;
        # naming it in an except clause raises TypeError as soon as any error
        # reaches the handler. Catch asyncio.TimeoutError instead.
        except asyncio.TimeoutError:
            logger.error("Tavily API request timed out")
            return {
                "success": False,
                "error": "Request timed out",
                "details": "The search request took too long to complete"
            }
        except Exception as e:
            logger.error(f"Error in Tavily search: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during search"
            }

    def _process_search_results(self, api_response: Dict[str, Any], query: str) -> List[Dict[str, Any]]:
        """
        Process and structure Tavily API response into standardized format.

        Args:
            api_response: Raw response from Tavily API
            query: Original search query (currently unused)

        Returns:
            List of processed search results, sorted by relevance score
            (highest first). Entries that fail to process are skipped with a
            warning.
        """
        results = []
        # Treat a missing or null "results" field as an empty list.
        raw_results = api_response.get("results") or []

        for result in raw_results:
            try:
                # Extract domain from URL
                url = result.get("url", "")
                domain = urlparse(url).netloc if url else ""

                # Tavily provides a score field; fall back to 0.5 when it is
                # absent or null so the sort below never compares None.
                relevance_score = result.get("score")
                if relevance_score is None:
                    relevance_score = 0.5

                results.append({
                    "url": url,
                    "domain": domain,
                    "title": result.get("title", ""),
                    "content": result.get("content", ""),
                    "raw_content": result.get("raw_content"),  # If include_raw_content was requested
                    "score": relevance_score,
                    "relevance_score": relevance_score,  # Alias for compatibility
                    "favicon": result.get("favicon"),
                    "published_date": result.get("published_date"),
                })
            except Exception as e:
                # Best effort: one malformed entry must not drop the whole response.
                logger.warning(f"Error processing Tavily result: {str(e)}")
                continue

        # Sort by relevance score (highest first)
        results.sort(key=lambda item: item["relevance_score"], reverse=True)

        return results

    async def search_industry_trends(
        self,
        topic: str,
        industry: str,
        max_results: int = 10,
        search_depth: str = "basic"
    ) -> Dict[str, Any]:
        """
        Search for current industry trends and insights.

        Args:
            topic: The specific topic to research
            industry: The industry context for the search
            max_results: Maximum number of search results to return
            search_depth: Depth of search (basic or advanced)

        Returns:
            Dictionary containing search results with industry context
            (same shape as the return value of `search`).
        """
        # Build industry-specific query
        search_query = f"{topic} {industry} trends insights"

        # Basic-depth searches use the "news" topic; any other depth uses "general".
        return await self.search(
            query=search_query,
            topic="news" if search_depth == "basic" else "general",
            search_depth=search_depth,
            max_results=max_results,
            include_answer="basic",
            include_favicon=True,
            time_range="month"  # Last month for current trends
        )

    @staticmethod
    def _build_exclude_domains(user_url: str, exclude_domains: Optional[List[str]]) -> List[str]:
        """Return a new exclusion list containing `exclude_domains` plus the user's own domain.

        The caller's list is never modified. Scheme-less URLs such as
        ``example.com/page`` are handled, and an empty or duplicate domain is
        not appended.
        """
        excluded = list(exclude_domains) if exclude_domains else []
        if not user_url:
            return excluded

        user_domain = urlparse(user_url).netloc
        if not user_domain:
            # Without a scheme urlparse puts the host in `path`; a leading
            # "//" makes it parse as the network location.
            user_domain = urlparse(f"//{user_url}").netloc

        if user_domain and user_domain not in excluded:
            excluded.append(user_domain)
        return excluded

    @staticmethod
    def _build_competitor_query(
        industry_context: Optional[str],
        website_analysis_data: Optional[Dict[str, Any]],
    ) -> str:
        """Compose the competitor-discovery search query from the optional context."""
        query_parts = ["similar websites", "competitors"]
        if industry_context:
            query_parts.append(f"in {industry_context}")

        # Extract insights from website analysis if available
        if website_analysis_data:
            analysis = website_analysis_data.get('analysis') or {}
            audience = analysis.get('target_audience') if isinstance(analysis, dict) else None
            if isinstance(audience, dict):
                primary = audience.get('primary_audience')
                # Only a non-blank string can be joined into the query.
                if isinstance(primary, str) and primary.strip():
                    query_parts.append(primary)

        return " ".join(query_parts)

    async def discover_competitors(
        self,
        user_url: str,
        num_results: int = 10,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        industry_context: Optional[str] = None,
        website_analysis_data: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Discover competitors for a given website using Tavily search.

        Args:
            user_url: The website URL to find competitors for
            num_results: Number of competitor results to return
            include_domains: List of domains to include in search
            exclude_domains: List of domains to exclude from search (not modified;
                the user's own domain is added to a copy)
            industry_context: Industry context for better competitor discovery
            website_analysis_data: Optional analysis dictionary; when it contains
                ``analysis.target_audience.primary_audience`` that value is added
                to the search query

        Returns:
            Dictionary containing competitor analysis results. If the underlying
            search fails, the failure dictionary from `search` is returned as-is;
            unexpected errors yield ``"success": False`` with ``"error"`` and
            ``"details"``.
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Tavily Service is not enabled - API key missing")

            logger.info(f"Starting competitor discovery for: {user_url}")

            # Exclude the user's own domain without mutating the caller's list.
            exclude_domains_list = self._build_exclude_domains(user_url, exclude_domains)
            search_query = self._build_competitor_query(industry_context, website_analysis_data)

            # Perform search
            search_result = await self.search(
                query=search_query,
                topic="general",
                search_depth="advanced",  # Use advanced for better competitor discovery
                max_results=num_results,
                include_domains=include_domains,
                exclude_domains=exclude_domains_list,
                include_favicon=True,
                chunks_per_source=3
            )

            if not search_result.get("success"):
                return search_result

            # Process results into competitor format
            competitors = [
                {
                    "url": result.get("url"),
                    "domain": result.get("domain"),
                    "title": result.get("title"),
                    "summary": result.get("content", ""),
                    "relevance_score": result.get("relevance_score", 0.5),
                    "favicon": result.get("favicon"),
                    "published_date": result.get("published_date"),
                    "highlights": self._extract_highlights(result.get("content", "")),
                    "competitive_insights": self._extract_competitive_insights(result),
                    "content_insights": self._analyze_content_quality(result)
                }
                for result in search_result.get("results", [])
            ]

            logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")

            return {
                "success": True,
                "user_url": user_url,
                "competitors": competitors,
                "total_competitors": len(competitors),
                "analysis_timestamp": datetime.utcnow().isoformat(),
                "industry_context": industry_context,
                "request_id": search_result.get("request_id")
            }

        except Exception as e:
            logger.error(f"Error in competitor discovery: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during competitor discovery"
            }

    def _extract_highlights(self, content: str, num_sentences: int = 3) -> List[str]:
        """Extract key highlights from content.

        Splits `content` on ``.`` and returns the first `num_sentences`
        non-empty, stripped fragments. Returns an empty list for empty content.
        """
        if not content:
            return []

        # Simple sentence extraction (can be enhanced with NLP)
        sentences = [s.strip() for s in content.split('.') if s.strip()]
        return sentences[:num_sentences]

    def _extract_competitive_insights(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Extract competitive insights from search result.

        Placeholder: `result` is not inspected yet and fixed default values
        are returned.
        """
        return {
            "business_model": "unknown",
            "target_audience": "unknown",
            "key_differentiators": []
        }

    def _analyze_content_quality(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze content quality metrics.

        Placeholder: `result` is not inspected yet and fixed default values
        are returned.
        """
        return {
            "content_focus": "general",
            "content_quality": "medium",
            "publishing_frequency": "unknown"
        }
|
||||
|
||||
Reference in New Issue
Block a user