""" Exa API Service for ALwrity This service provides competitor discovery and analysis using the Exa API, which uses neural search to find semantically similar websites and content. Key Features: - Competitor discovery using neural search - Content analysis and summarization - Competitive intelligence gathering - Cost-effective API usage with caching - Integration with onboarding Step 3 Dependencies: - aiohttp (for async HTTP requests) - os (for environment variables) - logging (for debugging) Author: ALwrity Team Version: 1.0 Last Updated: January 2025 """ import os import json import asyncio from typing import Dict, List, Optional, Any, Union from datetime import datetime, timedelta from loguru import logger from urllib.parse import urlparse from exa_py import Exa class ExaService: """ Service for competitor discovery and analysis using the Exa API. This service provides neural search capabilities to find semantically similar websites and analyze their content for competitive intelligence. """ def __init__(self): """Initialize the Exa Service with API credentials.""" self.api_key = os.getenv("EXA_API_KEY") self.exa = None self.enabled = False # Don't assume key is available at import time in production. # Keys may be injected per-request via middleware, so defer init. self._try_initialize() def _try_initialize(self) -> None: """Attempt to (re)initialize the Exa SDK from current environment.""" if self.enabled and self.exa: return try: self.api_key = os.getenv("EXA_API_KEY") if not self.api_key: # Leave disabled; caller may try again after middleware injection logger.warning("EXA_API_KEY not configured; Exa service will be disabled") self.enabled = False self.exa = None return self.exa = Exa(api_key=self.api_key) self.enabled = True logger.info("Exa Service initialized successfully") except Exception as e: logger.error(f"Failed to initialize Exa service: {e}") self.enabled = False self.exa = None async def discover_competitors( self, user_url: str, num_results: int = 10, include_domains: Optional[List[str]] = None, exclude_domains: Optional[List[str]] = None, industry_context: Optional[str] = None, website_analysis_data: Optional[Dict[str, Any]] = None ) -> Dict[str, Any]: """ Discover competitors for a given website using Exa's neural search. Args: user_url: The website URL to find competitors for num_results: Number of competitor results to return (max 100) include_domains: List of domains to include in search exclude_domains: List of domains to exclude from search industry_context: Industry context for better competitor discovery Returns: Dictionary containing competitor analysis results """ try: # Ensure we pick up any per-request injected key self._try_initialize() if not self.enabled: raise ValueError("Exa Service is not enabled - API key missing") logger.info(f"Starting competitor discovery for: {user_url}") # Extract user domain for exclusion user_domain = urlparse(user_url).netloc exclude_domains_list = exclude_domains or [] exclude_domains_list.append(user_domain) logger.info(f"Excluding domains: {exclude_domains_list}") # Extract insights from website analysis for better targeting include_text_queries = [] summary_query = f"Business model, target audience, content strategy{f' in {industry_context}' if industry_context else ''}" if website_analysis_data: analysis = website_analysis_data.get('analysis', {}) # Extract key business terms from the analysis if 'target_audience' in analysis: audience = analysis['target_audience'] if isinstance(audience, dict) and 'primary_audience' in audience: primary_audience = audience['primary_audience'] if len(primary_audience.split()) <= 5: # Exa limit include_text_queries.append(primary_audience) # Use industry context from analysis if 'industry' in analysis and analysis['industry']: industry = analysis['industry'] if len(industry.split()) <= 5: include_text_queries.append(industry) # Enhance summary query with analysis insights if 'content_type' in analysis: content_type = analysis['content_type'] summary_query += f", {content_type} content strategy" logger.info(f"Enhanced targeting with analysis data: {include_text_queries}") # Use the Exa SDK to find similar links with content and context search_result = self.exa.find_similar_and_contents( url=user_url, num_results=min(num_results, 10), # Exa API limit include_domains=include_domains, exclude_domains=exclude_domains_list, include_text=include_text_queries if include_text_queries else None, text=True, highlights={ "numSentences": 2, "highlightsPerUrl": 3, "query": "Unique value proposition, competitive advantages, market position" }, summary={ "query": summary_query } ) # TODO: Add context generation once SDK supports it # For now, we'll generate a basic context from the results context_result = None # Log the raw Exa API response summary (avoiding verbose markdown content) logger.info(f"📊 Exa API response for {user_url}:") logger.info(f" ├─ Request ID: {getattr(search_result, 'request_id', 'N/A')}") logger.info(f" ├─ Results count: {len(getattr(search_result, 'results', []))}") logger.info(f" └─ Cost: ${getattr(getattr(search_result, 'cost_dollars', None), 'total', 0)}") # Note: Full raw response contains verbose markdown content - logging only summary # To see full response, set EXA_DEBUG=true in environment # Extract results from search results = getattr(search_result, 'results', []) # Log summary of results logger.info(f" - Found {len(results)} competitors") # Process and structure the results competitors = self._process_competitor_results(search_result, user_url) logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}") return { "success": True, "user_url": user_url, "competitors": competitors, "total_competitors": len(competitors), "analysis_timestamp": datetime.utcnow().isoformat(), "industry_context": industry_context, "api_cost": getattr(getattr(search_result, 'cost_dollars', None), 'total', 0) if hasattr(search_result, 'cost_dollars') and getattr(search_result, 'cost_dollars', None) else 0, "request_id": getattr(search_result, 'request_id', None) if hasattr(search_result, 'request_id') else None } except asyncio.TimeoutError: logger.error("Exa API request timed out") return { "success": False, "error": "Request timed out", "details": "The competitor discovery request took too long to complete" } except Exception as e: logger.error(f"Error in competitor discovery: {str(e)}") return { "success": False, "error": str(e), "details": "An unexpected error occurred during competitor discovery" } def _process_competitor_results(self, search_result, user_url: str) -> List[Dict[str, Any]]: """ Process and structure the Exa SDK response into competitor data. Args: search_result: Response from Exa SDK user_url: Original user URL for reference Returns: List of processed competitor data """ competitors = [] user_domain = urlparse(user_url).netloc # Extract results from the SDK response results = getattr(search_result, 'results', []) for result in results: try: # Extract basic information from the result object competitor_url = getattr(result, 'url', '') competitor_domain = urlparse(competitor_url).netloc # Skip if it's the same domain as the user if competitor_domain == user_domain: continue # Extract content insights summary = getattr(result, 'summary', '') highlights = getattr(result, 'highlights', []) highlight_scores = getattr(result, 'highlight_scores', []) # Calculate competitive relevance score relevance_score = self._calculate_relevance_score(result, user_url) competitor_data = { "url": competitor_url, "domain": competitor_domain, "title": getattr(result, 'title', ''), "published_date": getattr(result, 'published_date', None), "author": getattr(result, 'author', None), "favicon": getattr(result, 'favicon', None), "image": getattr(result, 'image', None), "summary": summary, "highlights": highlights, "highlight_scores": highlight_scores, "relevance_score": relevance_score, "competitive_insights": self._extract_competitive_insights(summary, highlights), "content_analysis": self._analyze_content_quality(result) } competitors.append(competitor_data) except Exception as e: logger.warning(f"Error processing competitor result: {str(e)}") continue # Sort by relevance score (highest first) competitors.sort(key=lambda x: x["relevance_score"], reverse=True) return competitors def _calculate_relevance_score(self, result, user_url: str) -> float: """ Calculate a relevance score for competitor ranking. Args: result: Competitor result from Exa SDK user_url: Original user URL Returns: Relevance score between 0 and 1 """ score = 0.0 # Base score from highlight scores highlight_scores = getattr(result, 'highlight_scores', []) if highlight_scores: score += sum(highlight_scores) / len(highlight_scores) * 0.4 # Score from summary quality summary = getattr(result, 'summary', '') if summary and len(summary) > 100: score += 0.3 # Score from title relevance title = getattr(result, 'title', '').lower() if any(keyword in title for keyword in ["business", "company", "service", "solution", "platform"]): score += 0.2 # Score from URL structure similarity competitor_url = getattr(result, 'url', '') if self._url_structure_similarity(user_url, competitor_url) > 0.5: score += 0.1 return min(score, 1.0) def _url_structure_similarity(self, url1: str, url2: str) -> float: """ Calculate URL structure similarity. Args: url1: First URL url2: Second URL Returns: Similarity score between 0 and 1 """ try: parsed1 = urlparse(url1) parsed2 = urlparse(url2) # Compare path structure path1_parts = [part for part in parsed1.path.split('/') if part] path2_parts = [part for part in parsed2.path.split('/') if part] if not path1_parts or not path2_parts: return 0.0 # Calculate similarity based on path length and structure max_parts = max(len(path1_parts), len(path2_parts)) common_parts = sum(1 for p1, p2 in zip(path1_parts, path2_parts) if p1 == p2) return common_parts / max_parts except Exception: return 0.0 def _extract_competitive_insights(self, summary: str, highlights: List[str]) -> Dict[str, Any]: """ Extract competitive insights from summary and highlights. Args: summary: Content summary highlights: Content highlights Returns: Dictionary of competitive insights """ insights = { "business_model": "", "target_audience": "", "value_proposition": "", "competitive_advantages": [], "content_strategy": "" } # Combine summary and highlights for analysis content = f"{summary} {' '.join(highlights)}".lower() # Extract business model indicators business_models = ["saas", "platform", "service", "product", "consulting", "agency", "marketplace"] for model in business_models: if model in content: insights["business_model"] = model.title() break # Extract target audience indicators audiences = ["enterprise", "small business", "startups", "developers", "marketers", "consumers"] for audience in audiences: if audience in content: insights["target_audience"] = audience.title() break # Extract value proposition from highlights if highlights: insights["value_proposition"] = highlights[0][:100] + "..." if len(highlights[0]) > 100 else highlights[0] return insights def _analyze_content_quality(self, result) -> Dict[str, Any]: """ Analyze the content quality of a competitor. Args: result: Competitor result from Exa SDK Returns: Dictionary of content quality metrics """ quality_metrics = { "content_depth": "medium", "technical_sophistication": "medium", "content_freshness": "unknown", "engagement_potential": "medium" } # Analyze content depth from summary length summary = getattr(result, 'summary', '') if len(summary) > 300: quality_metrics["content_depth"] = "high" elif len(summary) < 100: quality_metrics["content_depth"] = "low" # Analyze technical sophistication technical_keywords = ["api", "integration", "automation", "analytics", "data", "platform"] highlights = getattr(result, 'highlights', []) content_text = f"{summary} {' '.join(highlights)}".lower() technical_count = sum(1 for keyword in technical_keywords if keyword in content_text) if technical_count >= 3: quality_metrics["technical_sophistication"] = "high" elif technical_count == 0: quality_metrics["technical_sophistication"] = "low" return quality_metrics async def discover_social_media_accounts(self, user_url: str) -> Dict[str, Any]: """ Discover social media accounts for a given website using Exa's answer API. Args: user_url: The website URL to find social media accounts for Returns: Dictionary containing social media discovery results """ try: # Ensure we pick up any per-request injected key self._try_initialize() if not self.enabled: raise ValueError("Exa Service is not enabled - API key missing") logger.info(f"Starting social media discovery for: {user_url}") # Extract domain from URL for better targeting domain = urlparse(user_url).netloc.replace('www.', '') # Use Exa's answer API to find social media accounts result = self.exa.answer( f"Find all social media accounts of the url: {domain}. Return a JSON object with facebook, twitter, instagram, linkedin, youtube, and tiktok fields containing the URLs or empty strings if not found.", model="exa-pro", text=True ) # Log the raw Exa API response for debugging logger.info(f"Raw Exa social media response for {user_url}:") logger.info(f" - Request ID: {getattr(result, 'request_id', 'N/A')}") logger.info(f" └─ Cost: ${getattr(getattr(result, 'cost_dollars', None), 'total', 0)}") # Note: Full raw response contains verbose content - logging only summary # To see full response, set EXA_DEBUG=true in environment # Extract social media data answer_text = getattr(result, 'answer', '') citations = getattr(result, 'citations', []) # Convert AnswerResult objects to dictionaries for JSON serialization citations_dicts = [] for citation in citations: if hasattr(citation, '__dict__'): # Convert object to dictionary citation_dict = { 'id': getattr(citation, 'id', ''), 'title': getattr(citation, 'title', ''), 'url': getattr(citation, 'url', ''), 'text': getattr(citation, 'text', ''), 'snippet': getattr(citation, 'snippet', ''), 'published_date': getattr(citation, 'published_date', None), 'author': getattr(citation, 'author', None), 'image': getattr(citation, 'image', None), 'favicon': getattr(citation, 'favicon', None) } citations_dicts.append(citation_dict) else: # If it's already a dict, use as is citations_dicts.append(citation) logger.info(f" - Raw answer text: {answer_text}") logger.info(f" - Citations count: {len(citations_dicts)}") # Parse the response from the answer (could be JSON or markdown format) try: import json import re if answer_text.strip().startswith('{'): # Direct JSON format answer_data = json.loads(answer_text.strip()) else: # Parse markdown format with URLs answer_data = { "facebook": "", "twitter": "", "instagram": "", "linkedin": "", "youtube": "", "tiktok": "" } # Extract URLs using regex patterns facebook_match = re.search(r'Facebook.*?\[([^\]]+)\]', answer_text) if facebook_match: answer_data["facebook"] = facebook_match.group(1) twitter_match = re.search(r'Twitter.*?\[([^\]]+)\]', answer_text) if twitter_match: answer_data["twitter"] = twitter_match.group(1) instagram_match = re.search(r'Instagram.*?\[([^\]]+)\]', answer_text) if instagram_match: answer_data["instagram"] = instagram_match.group(1) linkedin_match = re.search(r'LinkedIn.*?\[([^\]]+)\]', answer_text) if linkedin_match: answer_data["linkedin"] = linkedin_match.group(1) youtube_match = re.search(r'YouTube.*?\[([^\]]+)\]', answer_text) if youtube_match: answer_data["youtube"] = youtube_match.group(1) tiktok_match = re.search(r'TikTok.*?\[([^\]]+)\]', answer_text) if tiktok_match: answer_data["tiktok"] = tiktok_match.group(1) except (json.JSONDecodeError, AttributeError, KeyError): # If parsing fails, create empty structure answer_data = { "facebook": "", "twitter": "", "instagram": "", "linkedin": "", "youtube": "", "tiktok": "" } logger.info(f" - Parsed social media accounts:") for platform, url in answer_data.items(): if url: logger.info(f" {platform}: {url}") return { "success": True, "user_url": user_url, "social_media_accounts": answer_data, "citations": citations_dicts, "analysis_timestamp": datetime.utcnow().isoformat(), "api_cost": getattr(getattr(result, 'cost_dollars', None), 'total', 0) if hasattr(result, 'cost_dollars') and getattr(result, 'cost_dollars', None) else 0, "request_id": getattr(result, 'request_id', None) if hasattr(result, 'request_id') else None } except Exception as e: logger.error(f"Error in social media discovery: {str(e)}") return { "success": False, "error": str(e), "details": "An unexpected error occurred during social media discovery" } def _generate_basic_context(self, results: List[Any], user_url: str) -> str: """ Generate a basic context string from competitor results for LLM consumption. Args: results: List of competitor results from Exa API user_url: Original user URL for reference Returns: Formatted context string """ context_parts = [ f"Competitive Analysis for: {user_url}", f"Found {len(results)} similar websites/competitors:", "" ] for i, result in enumerate(results[:5], 1): # Limit to top 5 for context url = getattr(result, 'url', 'Unknown URL') title = getattr(result, 'title', 'Unknown Title') summary = getattr(result, 'summary', 'No summary available') context_parts.extend([ f"{i}. {title}", f" URL: {url}", f" Summary: {summary[:200]}{'...' if len(summary) > 200 else ''}", "" ]) context_parts.append("Key insights:") context_parts.append("- These competitors offer similar services or content") context_parts.append("- Analyze their content strategy and positioning") context_parts.append("- Identify opportunities for differentiation") return "\n".join(context_parts) async def analyze_competitor_content( self, competitor_url: str, analysis_depth: str = "standard" ) -> Dict[str, Any]: """ Perform deeper analysis of a specific competitor. Args: competitor_url: URL of the competitor to analyze analysis_depth: Depth of analysis ("quick", "standard", "deep") Returns: Dictionary containing detailed competitor analysis """ try: logger.info(f"Starting detailed analysis for competitor: {competitor_url}") # Get similar content from this competitor similar_results = await self.discover_competitors( competitor_url, num_results=10, include_domains=[urlparse(competitor_url).netloc] ) if not similar_results["success"]: return similar_results # Analyze content patterns content_patterns = self._analyze_content_patterns(similar_results["competitors"]) # Generate competitive insights competitive_insights = self._generate_competitive_insights( competitor_url, similar_results["competitors"], content_patterns ) return { "success": True, "competitor_url": competitor_url, "content_patterns": content_patterns, "competitive_insights": competitive_insights, "analysis_timestamp": datetime.utcnow().isoformat(), "analysis_depth": analysis_depth } except Exception as e: logger.error(f"Error in competitor content analysis: {str(e)}") return { "success": False, "error": str(e), "details": "An unexpected error occurred during competitor analysis" } def _analyze_content_patterns(self, competitors: List[Dict[str, Any]]) -> Dict[str, Any]: """ Analyze content patterns across competitors. Args: competitors: List of competitor data Returns: Dictionary of content patterns """ patterns = { "common_themes": [], "content_types": [], "publishing_patterns": {}, "target_keywords": [], "content_strategies": [] } # Analyze common themes all_summaries = [comp.get("summary", "") for comp in competitors] # This would be enhanced with NLP analysis in a full implementation # Analyze content types from URLs content_types = set() for comp in competitors: url = comp.get("url", "") if "/blog/" in url: content_types.add("blog") elif "/product/" in url or "/service/" in url: content_types.add("product") elif "/about/" in url: content_types.add("about") elif "/contact/" in url: content_types.add("contact") patterns["content_types"] = list(content_types) return patterns def _generate_competitive_insights( self, competitor_url: str, competitors: List[Dict[str, Any]], content_patterns: Dict[str, Any] ) -> Dict[str, Any]: """ Generate competitive insights from analysis data. Args: competitor_url: URL of the competitor competitors: List of competitor data content_patterns: Content pattern analysis Returns: Dictionary of competitive insights """ insights = { "competitive_strengths": [], "content_opportunities": [], "market_positioning": "unknown", "strategic_recommendations": [] } # Analyze competitive strengths for comp in competitors: if comp.get("relevance_score", 0) > 0.7: insights["competitive_strengths"].append({ "strength": comp.get("summary", "")[:100], "relevance": comp.get("relevance_score", 0) }) # Generate content opportunities if content_patterns.get("content_types"): insights["content_opportunities"] = [ f"Develop {content_type} content" for content_type in content_patterns["content_types"] ] return insights def health_check(self) -> Dict[str, Any]: """ Check the health of the Exa service. Returns: Dictionary containing service health status """ try: # Ensure latest env before health check self._try_initialize() if not self.enabled: return { "status": "disabled", "message": "Exa API key not configured", "timestamp": datetime.utcnow().isoformat() } # Test with a simple request using the SDK directly test_result = self.exa.find_similar( url="https://example.com", num_results=1 ) # If we get here without an exception, the API is working return { "status": "healthy", "message": "Exa API is operational", "timestamp": datetime.utcnow().isoformat(), "test_successful": True } except Exception as e: return { "status": "error", "message": f"Health check failed: {str(e)}", "timestamp": datetime.utcnow().isoformat() } def get_cost_estimate(self, num_results: int, include_content: bool = True) -> Dict[str, Any]: """ Get cost estimate for Exa API usage. Args: num_results: Number of results requested include_content: Whether to include content analysis Returns: Dictionary containing cost estimate """ # Exa API pricing (as of documentation) if num_results <= 25: search_cost = 0.005 elif num_results <= 100: search_cost = 0.025 else: search_cost = 1.0 content_cost = 0.0 if include_content: # Estimate content analysis cost content_cost = num_results * 0.001 # Rough estimate total_cost = search_cost + content_cost return { "search_cost": search_cost, "content_cost": content_cost, "total_estimated_cost": total_cost, "num_results": num_results, "include_content": include_content }