Base code

backend/services/research/exa_service.py (new file, 794 additions)

@@ -0,0 +1,794 @@
"""
|
||||
Exa API Service for ALwrity
|
||||
|
||||
This service provides competitor discovery and analysis using the Exa API,
|
||||
which uses neural search to find semantically similar websites and content.
|
||||
|
||||
Key Features:
|
||||
- Competitor discovery using neural search
|
||||
- Content analysis and summarization
|
||||
- Competitive intelligence gathering
|
||||
- Cost-effective API usage with caching
|
||||
- Integration with onboarding Step 3
|
||||
|
||||
Dependencies:
|
||||
- aiohttp (for async HTTP requests)
|
||||
- os (for environment variables)
|
||||
- logging (for debugging)
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
Last Updated: January 2025
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
from typing import Dict, List, Optional, Any, Union
|
||||
from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
from urllib.parse import urlparse
|
||||
from exa_py import Exa
|
||||
|
||||
class ExaService:
    """
    Service for competitor discovery and analysis using the Exa API.

    This service provides neural search capabilities to find semantically similar
    websites and analyze their content for competitive intelligence.
    """

    def __init__(self):
        """Initialize the Exa Service with API credentials."""
        self.api_key = os.getenv("EXA_API_KEY")
        self.exa = None
        self.enabled = False

        # Don't assume the key is available at import time in production.
        # Keys may be injected per-request via middleware, so defer init.
        self._try_initialize()

    def _try_initialize(self) -> None:
        """Attempt to (re)initialize the Exa SDK from the current environment."""
        if self.enabled and self.exa:
            return
        try:
            self.api_key = os.getenv("EXA_API_KEY")
            if not self.api_key:
                # Leave disabled; caller may try again after middleware injection
                logger.warning("EXA_API_KEY not configured; Exa service will be disabled")
                self.enabled = False
                self.exa = None
                return
            self.exa = Exa(api_key=self.api_key)
            self.enabled = True
            logger.info("Exa Service initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Exa service: {e}")
            self.enabled = False
            self.exa = None

    async def discover_competitors(
        self,
        user_url: str,
        num_results: int = 10,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        industry_context: Optional[str] = None,
        website_analysis_data: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Discover competitors for a given website using Exa's neural search.

        Args:
            user_url: The website URL to find competitors for
            num_results: Number of competitor results to return (capped at 10 per request)
            include_domains: List of domains to include in search
            exclude_domains: List of domains to exclude from search
            industry_context: Industry context for better competitor discovery
            website_analysis_data: Optional website analysis output used to sharpen targeting

        Returns:
            Dictionary containing competitor analysis results
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Exa Service is not enabled - API key missing")

            logger.info(f"Starting competitor discovery for: {user_url}")

            # Extract the user's domain for exclusion; copy the list so we
            # don't mutate the caller's exclude_domains argument
            user_domain = urlparse(user_url).netloc
            exclude_domains_list = list(exclude_domains) if exclude_domains else []
            exclude_domains_list.append(user_domain)

            logger.info(f"Excluding domains: {exclude_domains_list}")

            # Extract insights from website analysis for better targeting
            include_text_queries = []
            summary_query = f"Business model, target audience, content strategy{f' in {industry_context}' if industry_context else ''}"

            if website_analysis_data:
                analysis = website_analysis_data.get('analysis', {})

                # Extract key business terms from the analysis
                if 'target_audience' in analysis:
                    audience = analysis['target_audience']
                    if isinstance(audience, dict) and 'primary_audience' in audience:
                        primary_audience = audience['primary_audience']
                        if isinstance(primary_audience, str) and len(primary_audience.split()) <= 5:  # Exa limit
                            include_text_queries.append(primary_audience)

                # Use industry context from the analysis
                if 'industry' in analysis and analysis['industry']:
                    industry = analysis['industry']
                    if isinstance(industry, str) and len(industry.split()) <= 5:
                        include_text_queries.append(industry)

                # Enhance the summary query with analysis insights
                if 'content_type' in analysis:
                    content_type = analysis['content_type']
                    summary_query += f", {content_type} content strategy"

                logger.info(f"Enhanced targeting with analysis data: {include_text_queries}")

            # Use the Exa SDK to find similar links with content and context
            search_result = self.exa.find_similar_and_contents(
                url=user_url,
                num_results=min(num_results, 10),  # Exa API limit
                include_domains=include_domains,
                exclude_domains=exclude_domains_list,
                include_text=include_text_queries if include_text_queries else None,
                text=True,
                highlights={
                    "numSentences": 2,
                    "highlightsPerUrl": 3,
                    "query": "Unique value proposition, competitive advantages, market position"
                },
                summary={
                    "query": summary_query
                }
            )

            # TODO: Add context generation once the SDK supports it.
            # For now, _generate_basic_context can build one from the results.
            context_result = None

            # Log a summary of the raw Exa API response (avoiding verbose markdown content)
            cost = getattr(search_result, 'cost_dollars', None)
            logger.info(f"📊 Exa API response for {user_url}:")
            logger.info(f" ├─ Request ID: {getattr(search_result, 'request_id', 'N/A')}")
            logger.info(f" ├─ Results count: {len(getattr(search_result, 'results', []))}")
            logger.info(f" └─ Cost: ${getattr(cost, 'total', 0) if cost else 0}")

            # Note: the full raw response contains verbose markdown content, so only
            # a summary is logged. To see the full response, set EXA_DEBUG=true.

            # Extract results from the search
            results = getattr(search_result, 'results', [])
            logger.info(f" - Found {len(results)} competitors")

            # Process and structure the results
            competitors = self._process_competitor_results(search_result, user_url)

            logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")

            return {
                "success": True,
                "user_url": user_url,
                "competitors": competitors,
                "total_competitors": len(competitors),
                "analysis_timestamp": datetime.utcnow().isoformat(),
                "industry_context": industry_context,
                "api_cost": getattr(cost, 'total', 0) if cost else 0,
                "request_id": getattr(search_result, 'request_id', None)
            }

        except asyncio.TimeoutError:
            logger.error("Exa API request timed out")
            return {
                "success": False,
                "error": "Request timed out",
                "details": "The competitor discovery request took too long to complete"
            }

        except Exception as e:
            logger.error(f"Error in competitor discovery: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during competitor discovery"
            }

    def _process_competitor_results(self, search_result, user_url: str) -> List[Dict[str, Any]]:
        """
        Process and structure the Exa SDK response into competitor data.

        Args:
            search_result: Response from the Exa SDK
            user_url: Original user URL for reference

        Returns:
            List of processed competitor data
        """
        competitors = []
        user_domain = urlparse(user_url).netloc

        # Extract results from the SDK response
        results = getattr(search_result, 'results', [])

        for result in results:
            try:
                # Extract basic information from the result object
                competitor_url = getattr(result, 'url', '')
                competitor_domain = urlparse(competitor_url).netloc

                # Skip if it's the same domain as the user
                if competitor_domain == user_domain:
                    continue

                # Extract content insights
                summary = getattr(result, 'summary', '')
                highlights = getattr(result, 'highlights', [])
                highlight_scores = getattr(result, 'highlight_scores', [])

                # Calculate competitive relevance score
                relevance_score = self._calculate_relevance_score(result, user_url)

                competitor_data = {
                    "url": competitor_url,
                    "domain": competitor_domain,
                    "title": getattr(result, 'title', ''),
                    "published_date": getattr(result, 'published_date', None),
                    "author": getattr(result, 'author', None),
                    "favicon": getattr(result, 'favicon', None),
                    "image": getattr(result, 'image', None),
                    "summary": summary,
                    "highlights": highlights,
                    "highlight_scores": highlight_scores,
                    "relevance_score": relevance_score,
                    "competitive_insights": self._extract_competitive_insights(summary, highlights),
                    "content_analysis": self._analyze_content_quality(result)
                }

                competitors.append(competitor_data)

            except Exception as e:
                logger.warning(f"Error processing competitor result: {str(e)}")
                continue

        # Sort by relevance score (highest first)
        competitors.sort(key=lambda x: x["relevance_score"], reverse=True)

        return competitors

    def _calculate_relevance_score(self, result, user_url: str) -> float:
        """
        Calculate a relevance score for competitor ranking.

        The component weights (0.4 + 0.3 + 0.2 + 0.1) sum to 1.0, so the
        score naturally falls in the [0, 1] range.

        Args:
            result: Competitor result from the Exa SDK
            user_url: Original user URL

        Returns:
            Relevance score between 0 and 1
        """
        score = 0.0

        # Base score from highlight scores
        highlight_scores = getattr(result, 'highlight_scores', [])
        if highlight_scores:
            score += sum(highlight_scores) / len(highlight_scores) * 0.4

        # Score from summary quality
        summary = getattr(result, 'summary', '')
        if summary and len(summary) > 100:
            score += 0.3

        # Score from title relevance
        title = getattr(result, 'title', '').lower()
        if any(keyword in title for keyword in ["business", "company", "service", "solution", "platform"]):
            score += 0.2

        # Score from URL structure similarity
        competitor_url = getattr(result, 'url', '')
        if self._url_structure_similarity(user_url, competitor_url) > 0.5:
            score += 0.1

        return min(score, 1.0)

    def _url_structure_similarity(self, url1: str, url2: str) -> float:
        """
        Calculate URL structure similarity.

        Args:
            url1: First URL
            url2: Second URL

        Returns:
            Similarity score between 0 and 1
        """
        try:
            parsed1 = urlparse(url1)
            parsed2 = urlparse(url2)

            # Compare path structure
            path1_parts = [part for part in parsed1.path.split('/') if part]
            path2_parts = [part for part in parsed2.path.split('/') if part]

            if not path1_parts or not path2_parts:
                return 0.0

            # Calculate similarity based on path length and structure
            max_parts = max(len(path1_parts), len(path2_parts))
            common_parts = sum(1 for p1, p2 in zip(path1_parts, path2_parts) if p1 == p2)

            return common_parts / max_parts

        except Exception:
            return 0.0

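    # Worked example (hypothetical URLs): comparing "https://a.com/blog/post"
    # with "https://b.com/blog/other" yields path segments ["blog", "post"] and
    # ["blog", "other"]; 1 of max(2, 2) segments match, so the score is 0.5.
    # Note that _calculate_relevance_score adds its 0.1 bonus only when this
    # score strictly exceeds 0.5.
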
    def _extract_competitive_insights(self, summary: str, highlights: List[str]) -> Dict[str, Any]:
        """
        Extract competitive insights from summary and highlights.

        Args:
            summary: Content summary
            highlights: Content highlights

        Returns:
            Dictionary of competitive insights
        """
        insights = {
            "business_model": "",
            "target_audience": "",
            "value_proposition": "",
            "competitive_advantages": [],
            "content_strategy": ""
        }

        # Combine summary and highlights for analysis
        content = f"{summary} {' '.join(highlights)}".lower()

        # Extract business model indicators
        business_models = ["saas", "platform", "service", "product", "consulting", "agency", "marketplace"]
        for model in business_models:
            if model in content:
                insights["business_model"] = model.title()
                break

        # Extract target audience indicators
        audiences = ["enterprise", "small business", "startups", "developers", "marketers", "consumers"]
        for audience in audiences:
            if audience in content:
                insights["target_audience"] = audience.title()
                break

        # Extract the value proposition from the first highlight, truncated to 100 chars
        if highlights:
            insights["value_proposition"] = (highlights[0][:100] + "...") if len(highlights[0]) > 100 else highlights[0]

        return insights

    def _analyze_content_quality(self, result) -> Dict[str, Any]:
        """
        Analyze the content quality of a competitor.

        Args:
            result: Competitor result from the Exa SDK

        Returns:
            Dictionary of content quality metrics
        """
        quality_metrics = {
            "content_depth": "medium",
            "technical_sophistication": "medium",
            "content_freshness": "unknown",
            "engagement_potential": "medium"
        }

        # Analyze content depth from summary length
        summary = getattr(result, 'summary', '')
        if len(summary) > 300:
            quality_metrics["content_depth"] = "high"
        elif len(summary) < 100:
            quality_metrics["content_depth"] = "low"

        # Analyze technical sophistication
        technical_keywords = ["api", "integration", "automation", "analytics", "data", "platform"]
        highlights = getattr(result, 'highlights', [])
        content_text = f"{summary} {' '.join(highlights)}".lower()

        technical_count = sum(1 for keyword in technical_keywords if keyword in content_text)
        if technical_count >= 3:
            quality_metrics["technical_sophistication"] = "high"
        elif technical_count == 0:
            quality_metrics["technical_sophistication"] = "low"

        return quality_metrics

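    # Worked example (hypothetical content): a summary and highlights that
    # together mention "api", "analytics", and "platform" hit 3 of the 6
    # technical keywords, so technical_sophistication is rated "high";
    # zero keyword hits rates "low", and anything in between stays "medium".
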
    async def discover_social_media_accounts(self, user_url: str) -> Dict[str, Any]:
        """
        Discover social media accounts for a given website using Exa's answer API.

        Args:
            user_url: The website URL to find social media accounts for

        Returns:
            Dictionary containing social media discovery results
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Exa Service is not enabled - API key missing")

            logger.info(f"Starting social media discovery for: {user_url}")

            # Extract the domain from the URL for better targeting
            domain = urlparse(user_url).netloc.replace('www.', '')

            # Use Exa's answer API to find social media accounts
            result = self.exa.answer(
                f"Find all social media accounts of the url: {domain}. Return a JSON object with facebook, twitter, instagram, linkedin, youtube, and tiktok fields containing the URLs or empty strings if not found.",
                model="exa-pro",
                text=True
            )

            # Log a summary of the raw Exa API response for debugging
            cost = getattr(result, 'cost_dollars', None)
            logger.info(f"Raw Exa social media response for {user_url}:")
            logger.info(f" - Request ID: {getattr(result, 'request_id', 'N/A')}")
            logger.info(f" └─ Cost: ${getattr(cost, 'total', 0) if cost else 0}")
            # Note: the full raw response contains verbose content, so only a
            # summary is logged. To see the full response, set EXA_DEBUG=true.

            # Extract social media data
            answer_text = getattr(result, 'answer', '')
            citations = getattr(result, 'citations', [])

            # Convert AnswerResult objects to dictionaries for JSON serialization
            citations_dicts = []
            for citation in citations:
                if hasattr(citation, '__dict__'):
                    citations_dicts.append({
                        'id': getattr(citation, 'id', ''),
                        'title': getattr(citation, 'title', ''),
                        'url': getattr(citation, 'url', ''),
                        'text': getattr(citation, 'text', ''),
                        'snippet': getattr(citation, 'snippet', ''),
                        'published_date': getattr(citation, 'published_date', None),
                        'author': getattr(citation, 'author', None),
                        'image': getattr(citation, 'image', None),
                        'favicon': getattr(citation, 'favicon', None)
                    })
                else:
                    # If it's already a dict, use it as-is
                    citations_dicts.append(citation)

            logger.info(f" - Raw answer text: {answer_text}")
            logger.info(f" - Citations count: {len(citations_dicts)}")

            # Parse the answer, which may be JSON or markdown with linked URLs
            platform_labels = {
                "facebook": "Facebook",
                "twitter": "Twitter",
                "instagram": "Instagram",
                "linkedin": "LinkedIn",
                "youtube": "YouTube",
                "tiktok": "TikTok"
            }
            answer_data = {platform: "" for platform in platform_labels}
            try:
                if answer_text.strip().startswith('{'):
                    # Direct JSON format
                    answer_data = json.loads(answer_text.strip())
                else:
                    # Markdown format: extract each platform's URL with a regex
                    for platform, label in platform_labels.items():
                        match = re.search(rf'{label}.*?\[([^\]]+)\]', answer_text)
                        if match:
                            answer_data[platform] = match.group(1)
            except (json.JSONDecodeError, AttributeError, KeyError):
                # If parsing fails, fall back to the empty structure
                answer_data = {platform: "" for platform in platform_labels}

            logger.info(" - Parsed social media accounts:")
            for platform, url in answer_data.items():
                if url:
                    logger.info(f"   {platform}: {url}")

            return {
                "success": True,
                "user_url": user_url,
                "social_media_accounts": answer_data,
                "citations": citations_dicts,
                "analysis_timestamp": datetime.utcnow().isoformat(),
                "api_cost": getattr(cost, 'total', 0) if cost else 0,
                "request_id": getattr(result, 'request_id', None)
            }

        except Exception as e:
            logger.error(f"Error in social media discovery: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during social media discovery"
            }

    def _generate_basic_context(self, results: List[Any], user_url: str) -> str:
        """
        Generate a basic context string from competitor results for LLM consumption.

        Not yet wired into discover_competitors (see the TODO there).

        Args:
            results: List of competitor results from the Exa API
            user_url: Original user URL for reference

        Returns:
            Formatted context string
        """
        context_parts = [
            f"Competitive Analysis for: {user_url}",
            f"Found {len(results)} similar websites/competitors:",
            ""
        ]

        for i, result in enumerate(results[:5], 1):  # Limit to top 5 for context
            url = getattr(result, 'url', 'Unknown URL')
            title = getattr(result, 'title', 'Unknown Title')
            summary = getattr(result, 'summary', 'No summary available')

            context_parts.extend([
                f"{i}. {title}",
                f"   URL: {url}",
                f"   Summary: {summary[:200]}{'...' if len(summary) > 200 else ''}",
                ""
            ])

        context_parts.append("Key insights:")
        context_parts.append("- These competitors offer similar services or content")
        context_parts.append("- Analyze their content strategy and positioning")
        context_parts.append("- Identify opportunities for differentiation")

        return "\n".join(context_parts)

    async def analyze_competitor_content(
        self,
        competitor_url: str,
        analysis_depth: str = "standard"
    ) -> Dict[str, Any]:
        """
        Perform deeper analysis of a specific competitor.

        Args:
            competitor_url: URL of the competitor to analyze
            analysis_depth: Depth of analysis ("quick", "standard", "deep");
                currently recorded in the result but does not change the analysis

        Returns:
            Dictionary containing detailed competitor analysis
        """
        try:
            logger.info(f"Starting detailed analysis for competitor: {competitor_url}")

            # Get similar content from this competitor
            similar_results = await self.discover_competitors(
                competitor_url,
                num_results=10,
                include_domains=[urlparse(competitor_url).netloc]
            )

            if not similar_results["success"]:
                return similar_results

            # Analyze content patterns
            content_patterns = self._analyze_content_patterns(similar_results["competitors"])

            # Generate competitive insights
            competitive_insights = self._generate_competitive_insights(
                competitor_url,
                similar_results["competitors"],
                content_patterns
            )

            return {
                "success": True,
                "competitor_url": competitor_url,
                "content_patterns": content_patterns,
                "competitive_insights": competitive_insights,
                "analysis_timestamp": datetime.utcnow().isoformat(),
                "analysis_depth": analysis_depth
            }

        except Exception as e:
            logger.error(f"Error in competitor content analysis: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during competitor analysis"
            }

    def _analyze_content_patterns(self, competitors: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze content patterns across competitors.

        Args:
            competitors: List of competitor data

        Returns:
            Dictionary of content patterns
        """
        patterns = {
            "common_themes": [],
            "content_types": [],
            "publishing_patterns": {},
            "target_keywords": [],
            "content_strategies": []
        }

        # Collect summaries for theme analysis; this would be enhanced
        # with NLP analysis in a full implementation
        all_summaries = [comp.get("summary", "") for comp in competitors]

        # Infer content types from URL paths
        content_types = set()
        for comp in competitors:
            url = comp.get("url", "")
            if "/blog/" in url:
                content_types.add("blog")
            elif "/product/" in url or "/service/" in url:
                content_types.add("product")
            elif "/about/" in url:
                content_types.add("about")
            elif "/contact/" in url:
                content_types.add("contact")

        patterns["content_types"] = list(content_types)

        return patterns

    def _generate_competitive_insights(
        self,
        competitor_url: str,
        competitors: List[Dict[str, Any]],
        content_patterns: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate competitive insights from analysis data.

        Args:
            competitor_url: URL of the competitor
            competitors: List of competitor data
            content_patterns: Content pattern analysis

        Returns:
            Dictionary of competitive insights
        """
        insights = {
            "competitive_strengths": [],
            "content_opportunities": [],
            "market_positioning": "unknown",
            "strategic_recommendations": []
        }

        # Analyze competitive strengths
        for comp in competitors:
            if comp.get("relevance_score", 0) > 0.7:
                insights["competitive_strengths"].append({
                    "strength": comp.get("summary", "")[:100],
                    "relevance": comp.get("relevance_score", 0)
                })

        # Generate content opportunities
        if content_patterns.get("content_types"):
            insights["content_opportunities"] = [
                f"Develop {content_type} content"
                for content_type in content_patterns["content_types"]
            ]

        return insights

    def health_check(self) -> Dict[str, Any]:
        """
        Check the health of the Exa service.

        Returns:
            Dictionary containing service health status
        """
        try:
            # Ensure the latest environment before the health check
            self._try_initialize()
            if not self.enabled:
                return {
                    "status": "disabled",
                    "message": "Exa API key not configured",
                    "timestamp": datetime.utcnow().isoformat()
                }

            # Test with a simple request using the SDK directly
            self.exa.find_similar(
                url="https://example.com",
                num_results=1
            )

            # If we get here without an exception, the API is working
            return {
                "status": "healthy",
                "message": "Exa API is operational",
                "timestamp": datetime.utcnow().isoformat(),
                "test_successful": True
            }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Health check failed: {str(e)}",
                "timestamp": datetime.utcnow().isoformat()
            }

    def get_cost_estimate(self, num_results: int, include_content: bool = True) -> Dict[str, Any]:
        """
        Get a cost estimate for Exa API usage.

        Args:
            num_results: Number of results requested
            include_content: Whether to include content analysis

        Returns:
            Dictionary containing the cost estimate
        """
        # Exa API pricing (as of documentation)
        if num_results <= 25:
            search_cost = 0.005
        elif num_results <= 100:
            search_cost = 0.025
        else:
            search_cost = 1.0

        content_cost = 0.0
        if include_content:
            # Estimate content analysis cost
            content_cost = num_results * 0.001  # Rough estimate

        total_cost = search_cost + content_cost

        return {
            "search_cost": search_cost,
            "content_cost": content_cost,
            "total_estimated_cost": total_cost,
            "num_results": num_results,
            "include_content": include_content
        }

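
# --- Usage sketch (illustrative, not part of the service) ---
# A minimal smoke test of this module's own methods, assuming EXA_API_KEY is
# set in the environment and the exa_py package is installed. The URLs below
# are placeholders; if the key is missing, each call returns a failure dict
# rather than raising.
if __name__ == "__main__":
    async def _demo() -> None:
        service = ExaService()

        # Service status and a worked cost estimate:
        # 10 results -> 0.005 search + 10 * 0.001 content = 0.015 total (estimate)
        print(service.health_check())
        print(service.get_cost_estimate(num_results=10))

        # Competitor discovery: results are sorted by relevance_score
        competitors = await service.discover_competitors("https://example.com", num_results=5)
        if competitors["success"]:
            for comp in competitors["competitors"][:3]:
                print(comp["domain"], comp["relevance_score"])

        # Social media discovery via the answer API
        socials = await service.discover_social_media_accounts("https://example.com")
        print(socials.get("social_media_accounts"))

    asyncio.run(_demo())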