"""
|
|
Exa API Service for ALwrity
|
|
|
|
This service provides competitor discovery and analysis using the Exa API,
|
|
which uses neural search to find semantically similar websites and content.
|
|
|
|
Key Features:
|
|
- Competitor discovery using neural search
|
|
- Content analysis and summarization
|
|
- Competitive intelligence gathering
|
|
- Cost-effective API usage with caching
|
|
- Integration with onboarding Step 3
|
|
|
|
Dependencies:
|
|
- aiohttp (for async HTTP requests)
|
|
- os (for environment variables)
|
|
- logging (for debugging)
|
|
|
|
Author: ALwrity Team
|
|
Version: 1.0
|
|
Last Updated: January 2025
|
|
"""
|
|
|
|
import os
import json
import re
import asyncio
from typing import Dict, List, Optional, Any
from datetime import datetime
from loguru import logger
from urllib.parse import urlparse
from exa_py import Exa

class ExaService:
    """
    Service for competitor discovery and analysis using the Exa API.

    This service provides neural search capabilities to find semantically similar
    websites and analyze their content for competitive intelligence.
    """

    def __init__(self):
        """Initialize the Exa Service with API credentials."""
        self.api_key = os.getenv("EXA_API_KEY")
        self.exa = None
        self.enabled = False

        # Don't assume the key is available at import time in production.
        # Keys may be injected per-request via middleware, so defer init.
        self._try_initialize()

    def _try_initialize(self) -> None:
        """Attempt to (re)initialize the Exa SDK from the current environment."""
        if self.enabled and self.exa:
            return
        try:
            self.api_key = os.getenv("EXA_API_KEY")
            if not self.api_key:
                # Leave disabled; the caller may retry after middleware injection.
                logger.warning("EXA_API_KEY not configured; Exa service will be disabled")
                self.enabled = False
                self.exa = None
                return
            self.exa = Exa(api_key=self.api_key)
            self.enabled = True
            logger.info("Exa Service initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Exa service: {e}")
            self.enabled = False
            self.exa = None

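    # Illustrative sketch (not part of the original service): how deferred
    # initialization is meant to be used when the API key arrives per-request,
    # e.g. via middleware that populates os.environ. The key below is a
    # placeholder, not a real credential.
    #
    #     os.environ["EXA_API_KEY"] = "exa-placeholder-key"  # set by middleware
    #     service = ExaService()       # may construct disabled if no key yet
    #     service._try_initialize()    # picks up the injected key; no-op once enabled
    #     assert service.enabled
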
    async def discover_competitors(
        self,
        user_url: str,
        num_results: int = 10,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        industry_context: Optional[str] = None,
        website_analysis_data: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Discover competitors for a given website using Exa's neural search.

        Args:
            user_url: The website URL to find competitors for
            num_results: Number of competitor results to return (capped at 10 by the Exa API)
            include_domains: List of domains to include in the search
            exclude_domains: List of domains to exclude from the search
            industry_context: Industry context for better competitor discovery
            website_analysis_data: Optional website analysis output used to sharpen targeting

        Returns:
            Dictionary containing competitor analysis results
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Exa Service is not enabled - API key missing")

            logger.info(f"Starting competitor discovery for: {user_url}")

            # Exclude the user's own domain (copy the list to avoid mutating the caller's argument)
            user_domain = urlparse(user_url).netloc
            exclude_domains_list = list(exclude_domains or [])
            exclude_domains_list.append(user_domain)

            logger.info(f"Excluding domains: {exclude_domains_list}")

            # Extract insights from website analysis for better targeting
            include_text_queries = []
            summary_query = f"Business model, target audience, content strategy{f' in {industry_context}' if industry_context else ''}"

            if website_analysis_data:
                analysis = website_analysis_data.get('analysis', {})

                # Extract key business terms from the analysis
                if 'target_audience' in analysis:
                    audience = analysis['target_audience']
                    if isinstance(audience, dict) and 'primary_audience' in audience:
                        primary_audience = audience['primary_audience']
                        if len(primary_audience.split()) <= 5:  # Exa limit on query length
                            include_text_queries.append(primary_audience)

                # Use industry context from the analysis
                if 'industry' in analysis and analysis['industry']:
                    industry = analysis['industry']
                    if len(industry.split()) <= 5:
                        include_text_queries.append(industry)

                # Enhance the summary query with analysis insights
                if 'content_type' in analysis:
                    content_type = analysis['content_type']
                    summary_query += f", {content_type} content strategy"

                logger.info(f"Enhanced targeting with analysis data: {include_text_queries}")

            # Use the Exa SDK to find similar links with content and context
            search_result = self.exa.find_similar_and_contents(
                url=user_url,
                num_results=min(num_results, 10),  # Exa API limit
                include_domains=include_domains,
                exclude_domains=exclude_domains_list,
                include_text=include_text_queries if include_text_queries else None,
                text=True,
                highlights={
                    "numSentences": 2,
                    "highlightsPerUrl": 3,
                    "query": "Unique value proposition, competitive advantages, market position"
                },
                summary={
                    "query": summary_query
                }
            )

            # TODO: Add context generation once the SDK supports it.
            # For now we generate a basic context from the results.
            context_result = None

            # Log a summary of the raw Exa API response (the full payload contains
            # verbose markdown content; set EXA_DEBUG=true in the environment to inspect it)
            logger.info(f"📊 Exa API response for {user_url}:")
            logger.info(f" ├─ Request ID: {getattr(search_result, 'request_id', 'N/A')}")
            logger.info(f" ├─ Results count: {len(getattr(search_result, 'results', []))}")
            logger.info(f" └─ Cost: ${getattr(getattr(search_result, 'cost_dollars', None), 'total', 0)}")

            # Process and structure the results
            competitors = self._process_competitor_results(search_result, user_url)

            logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")

            return {
                "success": True,
                "user_url": user_url,
                "competitors": competitors,
                "total_competitors": len(competitors),
                "analysis_timestamp": datetime.utcnow().isoformat(),
                "industry_context": industry_context,
                # getattr chains already default to 0/None when attributes are absent
                "api_cost": getattr(getattr(search_result, 'cost_dollars', None), 'total', 0),
                "request_id": getattr(search_result, 'request_id', None)
            }

        except asyncio.TimeoutError:
            logger.error("Exa API request timed out")
            return {
                "success": False,
                "error": "Request timed out",
                "details": "The competitor discovery request took too long to complete"
            }

        except Exception as e:
            logger.error(f"Error in competitor discovery: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during competitor discovery"
            }

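    # --- Illustrative sketch (not part of the original service) ------------
    # Minimal async usage of discover_competitors, assuming EXA_API_KEY is set.
    # "https://example.com" and the industry string are placeholders.
    #
    #     async def main():
    #         service = ExaService()
    #         report = await service.discover_competitors(
    #             "https://example.com",
    #             num_results=5,
    #             industry_context="marketing SaaS",
    #         )
    #         if report["success"]:
    #             for comp in report["competitors"]:
    #                 print(comp["domain"], comp["relevance_score"])
    #
    #     asyncio.run(main())
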
    def _process_competitor_results(self, search_result, user_url: str) -> List[Dict[str, Any]]:
        """
        Process and structure the Exa SDK response into competitor data.

        Args:
            search_result: Response from the Exa SDK
            user_url: Original user URL for reference

        Returns:
            List of processed competitor data, sorted by relevance
        """
        competitors = []
        user_domain = urlparse(user_url).netloc

        # Extract results from the SDK response
        results = getattr(search_result, 'results', [])

        for result in results:
            try:
                # Extract basic information from the result object
                competitor_url = getattr(result, 'url', '')
                competitor_domain = urlparse(competitor_url).netloc

                # Skip results from the user's own domain
                if competitor_domain == user_domain:
                    continue

                # Extract content insights (guard against None values from the SDK)
                summary = getattr(result, 'summary', '') or ''
                highlights = getattr(result, 'highlights', []) or []
                highlight_scores = getattr(result, 'highlight_scores', []) or []

                # Calculate competitive relevance score
                relevance_score = self._calculate_relevance_score(result, user_url)

                competitor_data = {
                    "url": competitor_url,
                    "domain": competitor_domain,
                    "title": getattr(result, 'title', '') or '',
                    "published_date": getattr(result, 'published_date', None),
                    "author": getattr(result, 'author', None),
                    "favicon": getattr(result, 'favicon', None),
                    "image": getattr(result, 'image', None),
                    "summary": summary,
                    "highlights": highlights,
                    "highlight_scores": highlight_scores,
                    "relevance_score": relevance_score,
                    "competitive_insights": self._extract_competitive_insights(summary, highlights),
                    "content_analysis": self._analyze_content_quality(result)
                }

                competitors.append(competitor_data)

            except Exception as e:
                logger.warning(f"Error processing competitor result: {str(e)}")
                continue

        # Sort by relevance score (highest first)
        competitors.sort(key=lambda x: x["relevance_score"], reverse=True)

        return competitors

    def _calculate_relevance_score(self, result, user_url: str) -> float:
        """
        Calculate a relevance score for competitor ranking.

        Args:
            result: Competitor result from the Exa SDK
            user_url: Original user URL

        Returns:
            Relevance score between 0 and 1
        """
        score = 0.0

        # Base score from highlight scores (up to 0.4)
        highlight_scores = getattr(result, 'highlight_scores', []) or []
        if highlight_scores:
            score += sum(highlight_scores) / len(highlight_scores) * 0.4

        # Score from summary quality (0.3)
        summary = getattr(result, 'summary', '') or ''
        if summary and len(summary) > 100:
            score += 0.3

        # Score from title relevance (0.2); guard against a None title
        title = (getattr(result, 'title', '') or '').lower()
        if any(keyword in title for keyword in ["business", "company", "service", "solution", "platform"]):
            score += 0.2

        # Score from URL structure similarity (0.1)
        competitor_url = getattr(result, 'url', '')
        if self._url_structure_similarity(user_url, competitor_url) > 0.5:
            score += 0.1

        return min(score, 1.0)

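    # --- Illustrative sketch (not part of the original service) ------------
    # A worked example of the relevance scoring above, using SimpleNamespace
    # as a stand-in for the Exa SDK result object. All values are made up.
    @staticmethod
    def _demo_relevance_score() -> float:
        from types import SimpleNamespace

        sample = SimpleNamespace(
            highlight_scores=[0.8, 0.6],           # mean 0.7 -> 0.7 * 0.4 = 0.28
            summary="s" * 150,                     # > 100 chars -> +0.3
            title="Acme Analytics Platform",       # contains "platform" -> +0.2
            url="https://acme.example/blog/post",  # path matches /blog/post -> +0.1
        )
        score = ExaService()._calculate_relevance_score(
            sample, "https://example.com/blog/post"
        )
        # Expected: 0.28 + 0.3 + 0.2 + 0.1 = 0.88
        return score
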
    def _url_structure_similarity(self, url1: str, url2: str) -> float:
        """
        Calculate URL structure similarity.

        Args:
            url1: First URL
            url2: Second URL

        Returns:
            Similarity score between 0 and 1
        """
        try:
            parsed1 = urlparse(url1)
            parsed2 = urlparse(url2)

            # Compare path structure
            path1_parts = [part for part in parsed1.path.split('/') if part]
            path2_parts = [part for part in parsed2.path.split('/') if part]

            if not path1_parts or not path2_parts:
                return 0.0

            # Similarity = segments matching at the same position / longest path length
            max_parts = max(len(path1_parts), len(path2_parts))
            common_parts = sum(1 for p1, p2 in zip(path1_parts, path2_parts) if p1 == p2)

            return common_parts / max_parts

        except Exception:
            return 0.0

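    # Illustrative example (not in the original): path-based similarity.
    #   _url_structure_similarity("https://a.com/blog/seo-tips",
    #                             "https://b.com/blog/email-tips")
    #   -> paths ["blog", "seo-tips"] vs ["blog", "email-tips"]
    #   -> 1 matching segment / 2 max segments = 0.5
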
    def _extract_competitive_insights(self, summary: str, highlights: List[str]) -> Dict[str, Any]:
        """
        Extract competitive insights from a summary and highlights.

        Args:
            summary: Content summary
            highlights: Content highlights

        Returns:
            Dictionary of competitive insights
        """
        insights = {
            "business_model": "",
            "target_audience": "",
            "value_proposition": "",
            "competitive_advantages": [],
            "content_strategy": ""
        }

        # Combine summary and highlights for keyword matching
        content = f"{summary} {' '.join(highlights)}".lower()

        # Extract business model indicators (first match wins)
        business_models = ["saas", "platform", "service", "product", "consulting", "agency", "marketplace"]
        for model in business_models:
            if model in content:
                insights["business_model"] = model.title()
                break

        # Extract target audience indicators (first match wins)
        audiences = ["enterprise", "small business", "startups", "developers", "marketers", "consumers"]
        for audience in audiences:
            if audience in content:
                insights["target_audience"] = audience.title()
                break

        # Use the first highlight as the value proposition (truncated to 100 chars)
        if highlights:
            insights["value_proposition"] = highlights[0][:100] + "..." if len(highlights[0]) > 100 else highlights[0]

        return insights

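    # --- Illustrative sketch (not part of the original service) ------------
    # The keyword extraction above, run against made-up summary/highlight text.
    @staticmethod
    def _demo_competitive_insights() -> Dict[str, Any]:
        insights = ExaService()._extract_competitive_insights(
            "A SaaS platform for small business marketers.",
            ["Automated campaign analytics for small business teams"],
        )
        # Expected: business_model "Saas" (first keyword hit), target_audience
        # "Small Business", value_proposition = the first highlight verbatim.
        return insights
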
    def _analyze_content_quality(self, result) -> Dict[str, Any]:
        """
        Analyze the content quality of a competitor.

        Args:
            result: Competitor result from the Exa SDK

        Returns:
            Dictionary of content quality metrics
        """
        quality_metrics = {
            "content_depth": "medium",
            "technical_sophistication": "medium",
            "content_freshness": "unknown",
            "engagement_potential": "medium"
        }

        # Gauge content depth from summary length (guard against None)
        summary = getattr(result, 'summary', '') or ''
        if len(summary) > 300:
            quality_metrics["content_depth"] = "high"
        elif len(summary) < 100:
            quality_metrics["content_depth"] = "low"

        # Gauge technical sophistication from keyword occurrences
        technical_keywords = ["api", "integration", "automation", "analytics", "data", "platform"]
        highlights = getattr(result, 'highlights', []) or []
        content_text = f"{summary} {' '.join(highlights)}".lower()

        technical_count = sum(1 for keyword in technical_keywords if keyword in content_text)
        if technical_count >= 3:
            quality_metrics["technical_sophistication"] = "high"
        elif technical_count == 0:
            quality_metrics["technical_sophistication"] = "low"

        return quality_metrics

    async def discover_social_media_accounts(self, user_url: str) -> Dict[str, Any]:
        """
        Discover social media accounts for a given website using Exa's answer API.

        Args:
            user_url: The website URL to find social media accounts for

        Returns:
            Dictionary containing social media discovery results
        """
        try:
            # Ensure we pick up any per-request injected key
            self._try_initialize()
            if not self.enabled:
                raise ValueError("Exa Service is not enabled - API key missing")

            logger.info(f"Starting social media discovery for: {user_url}")

            # Extract the domain from the URL for better targeting
            domain = urlparse(user_url).netloc.replace('www.', '')

            # Use Exa's answer API to find social media accounts
            result = self.exa.answer(
                f"Find all social media accounts of the url: {domain}. Return a JSON object with facebook, twitter, instagram, linkedin, youtube, and tiktok fields containing the URLs or empty strings if not found.",
                model="exa-pro",
                text=True
            )

            # Log a summary of the raw Exa API response (the full payload is
            # verbose; set EXA_DEBUG=true in the environment to inspect it)
            logger.info(f"Exa social media response for {user_url}:")
            logger.info(f" ├─ Request ID: {getattr(result, 'request_id', 'N/A')}")
            logger.info(f" └─ Cost: ${getattr(getattr(result, 'cost_dollars', None), 'total', 0)}")

            # Extract social media data
            answer_text = getattr(result, 'answer', '')
            citations = getattr(result, 'citations', [])

            # Convert AnswerResult objects to dictionaries for JSON serialization
            citations_dicts = []
            for citation in citations:
                if hasattr(citation, '__dict__'):
                    citations_dicts.append({
                        'id': getattr(citation, 'id', ''),
                        'title': getattr(citation, 'title', ''),
                        'url': getattr(citation, 'url', ''),
                        'text': getattr(citation, 'text', ''),
                        'snippet': getattr(citation, 'snippet', ''),
                        'published_date': getattr(citation, 'published_date', None),
                        'author': getattr(citation, 'author', None),
                        'image': getattr(citation, 'image', None),
                        'favicon': getattr(citation, 'favicon', None)
                    })
                else:
                    # Already a plain dict; use as-is
                    citations_dicts.append(citation)

            logger.info(f" - Raw answer text: {answer_text}")
            logger.info(f" - Citations count: {len(citations_dicts)}")

            # Parse the answer, which may arrive as JSON or as markdown with
            # bracketed links (json and re are imported at module level)
            default_accounts = {
                "facebook": "",
                "twitter": "",
                "instagram": "",
                "linkedin": "",
                "youtube": "",
                "tiktok": ""
            }
            answer_data = dict(default_accounts)
            try:
                if answer_text.strip().startswith('{'):
                    # Direct JSON format
                    answer_data = json.loads(answer_text.strip())
                else:
                    # Markdown format: extract the bracketed URL for each platform
                    platform_labels = [
                        ("facebook", "Facebook"),
                        ("twitter", "Twitter"),
                        ("instagram", "Instagram"),
                        ("linkedin", "LinkedIn"),
                        ("youtube", "YouTube"),
                        ("tiktok", "TikTok"),
                    ]
                    for platform, label in platform_labels:
                        match = re.search(rf'{label}.*?\[([^\]]+)\]', answer_text)
                        if match:
                            answer_data[platform] = match.group(1)

            except (json.JSONDecodeError, AttributeError, KeyError):
                # If parsing fails, fall back to the empty structure
                answer_data = dict(default_accounts)

            logger.info(" - Parsed social media accounts:")
            for platform, url in answer_data.items():
                if url:
                    logger.info(f"   {platform}: {url}")

            return {
                "success": True,
                "user_url": user_url,
                "social_media_accounts": answer_data,
                "citations": citations_dicts,
                "analysis_timestamp": datetime.utcnow().isoformat(),
                "api_cost": getattr(getattr(result, 'cost_dollars', None), 'total', 0),
                "request_id": getattr(result, 'request_id', None)
            }

        except Exception as e:
            logger.error(f"Error in social media discovery: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during social media discovery"
            }

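    # --- Illustrative sketch (not part of the original service) ------------
    # The markdown-parsing path above, run against a made-up answer string.
    @staticmethod
    def _demo_parse_social_answer() -> Dict[str, str]:
        sample_answer = (
            "- Facebook: [https://facebook.com/acme]\n"
            "- LinkedIn: [https://linkedin.com/company/acme]\n"
        )
        accounts: Dict[str, str] = {}
        for platform, label in [("facebook", "Facebook"), ("linkedin", "LinkedIn")]:
            match = re.search(rf'{label}.*?\[([^\]]+)\]', sample_answer)
            if match:
                accounts[platform] = match.group(1)
        # -> {'facebook': 'https://facebook.com/acme',
        #     'linkedin': 'https://linkedin.com/company/acme'}
        return accounts
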
    def _generate_basic_context(self, results: List[Any], user_url: str) -> str:
        """
        Generate a basic context string from competitor results for LLM consumption.

        Args:
            results: List of competitor results from the Exa API
            user_url: Original user URL for reference

        Returns:
            Formatted context string
        """
        context_parts = [
            f"Competitive Analysis for: {user_url}",
            f"Found {len(results)} similar websites/competitors:",
            ""
        ]

        for i, result in enumerate(results[:5], 1):  # Limit to the top 5 for context
            url = getattr(result, 'url', 'Unknown URL')
            title = getattr(result, 'title', 'Unknown Title')
            summary = getattr(result, 'summary', '') or 'No summary available'

            context_parts.extend([
                f"{i}. {title}",
                f"   URL: {url}",
                f"   Summary: {summary[:200]}{'...' if len(summary) > 200 else ''}",
                ""
            ])

        context_parts.append("Key insights:")
        context_parts.append("- These competitors offer similar services or content")
        context_parts.append("- Analyze their content strategy and positioning")
        context_parts.append("- Identify opportunities for differentiation")

        return "\n".join(context_parts)

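    # --- Illustrative sketch (not part of the original service) ------------
    # Building an LLM-ready context string from a stand-in result object.
    @staticmethod
    def _demo_basic_context() -> str:
        from types import SimpleNamespace

        results = [
            SimpleNamespace(
                url="https://acme.example",
                title="Acme Analytics",
                summary="Marketing analytics platform for small teams.",
            )
        ]
        return ExaService()._generate_basic_context(results, "https://example.com")
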
    async def analyze_competitor_content(
        self,
        competitor_url: str,
        analysis_depth: str = "standard"
    ) -> Dict[str, Any]:
        """
        Perform a deeper analysis of a specific competitor.

        Args:
            competitor_url: URL of the competitor to analyze
            analysis_depth: Depth of analysis ("quick", "standard", "deep")

        Returns:
            Dictionary containing detailed competitor analysis
        """
        try:
            logger.info(f"Starting detailed analysis for competitor: {competitor_url}")

            # Get similar content from this competitor's own domain
            similar_results = await self.discover_competitors(
                competitor_url,
                num_results=10,
                include_domains=[urlparse(competitor_url).netloc]
            )

            if not similar_results["success"]:
                return similar_results

            # Analyze content patterns
            content_patterns = self._analyze_content_patterns(similar_results["competitors"])

            # Generate competitive insights
            competitive_insights = self._generate_competitive_insights(
                competitor_url,
                similar_results["competitors"],
                content_patterns
            )

            return {
                "success": True,
                "competitor_url": competitor_url,
                "content_patterns": content_patterns,
                "competitive_insights": competitive_insights,
                "analysis_timestamp": datetime.utcnow().isoformat(),
                "analysis_depth": analysis_depth
            }

        except Exception as e:
            logger.error(f"Error in competitor content analysis: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "details": "An unexpected error occurred during competitor analysis"
            }

    def _analyze_content_patterns(self, competitors: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze content patterns across competitors.

        Args:
            competitors: List of competitor data

        Returns:
            Dictionary of content patterns
        """
        patterns = {
            "common_themes": [],
            "content_types": [],
            "publishing_patterns": {},
            "target_keywords": [],
            "content_strategies": []
        }

        # Common-theme analysis would start from these summaries; this would
        # be enhanced with NLP analysis in a full implementation.
        all_summaries = [comp.get("summary", "") for comp in competitors]

        # Infer content types from URL paths
        content_types = set()
        for comp in competitors:
            url = comp.get("url", "")
            if "/blog/" in url:
                content_types.add("blog")
            elif "/product/" in url or "/service/" in url:
                content_types.add("product")
            elif "/about/" in url:
                content_types.add("about")
            elif "/contact/" in url:
                content_types.add("contact")

        patterns["content_types"] = list(content_types)

        return patterns

    def _generate_competitive_insights(
        self,
        competitor_url: str,
        competitors: List[Dict[str, Any]],
        content_patterns: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate competitive insights from analysis data.

        Args:
            competitor_url: URL of the competitor
            competitors: List of competitor data
            content_patterns: Content pattern analysis

        Returns:
            Dictionary of competitive insights
        """
        insights = {
            "competitive_strengths": [],
            "content_opportunities": [],
            "market_positioning": "unknown",
            "strategic_recommendations": []
        }

        # Treat highly relevant competitors (score > 0.7) as notable strengths
        for comp in competitors:
            if comp.get("relevance_score", 0) > 0.7:
                insights["competitive_strengths"].append({
                    "strength": comp.get("summary", "")[:100],
                    "relevance": comp.get("relevance_score", 0)
                })

        # Generate content opportunities from the observed content types
        if content_patterns.get("content_types"):
            insights["content_opportunities"] = [
                f"Develop {content_type} content"
                for content_type in content_patterns["content_types"]
            ]

        return insights

    def health_check(self) -> Dict[str, Any]:
        """
        Check the health of the Exa service.

        Returns:
            Dictionary containing service health status
        """
        try:
            # Pick up the latest environment before the health check
            self._try_initialize()
            if not self.enabled:
                return {
                    "status": "disabled",
                    "message": "Exa API key not configured",
                    "timestamp": datetime.utcnow().isoformat()
                }

            # Test with a simple request using the SDK directly
            self.exa.find_similar(
                url="https://example.com",
                num_results=1
            )

            # Reaching this point without an exception means the API is working
            return {
                "status": "healthy",
                "message": "Exa API is operational",
                "timestamp": datetime.utcnow().isoformat(),
                "test_successful": True
            }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Health check failed: {str(e)}",
                "timestamp": datetime.utcnow().isoformat()
            }

    def get_cost_estimate(self, num_results: int, include_content: bool = True) -> Dict[str, Any]:
        """
        Get a cost estimate for Exa API usage.

        Args:
            num_results: Number of results requested
            include_content: Whether to include content analysis

        Returns:
            Dictionary containing the cost estimate
        """
        # Exa API pricing tiers (as of the documentation this was written against)
        if num_results <= 25:
            search_cost = 0.005
        elif num_results <= 100:
            search_cost = 0.025
        else:
            search_cost = 1.0

        content_cost = 0.0
        if include_content:
            # Rough per-result estimate for content analysis
            content_cost = num_results * 0.001

        total_cost = search_cost + content_cost

        return {
            "search_cost": search_cost,
            "content_cost": content_cost,
            "total_estimated_cost": total_cost,
            "num_results": num_results,
            "include_content": include_content
        }
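

# --- Illustrative sketch (not part of the original module) -----------------
# An offline-safe smoke test of the cost estimator. The numbers follow the
# tiers hard-coded above and are rough estimates, not authoritative Exa pricing.
if __name__ == "__main__":
    service = ExaService()  # may log a warning if EXA_API_KEY is unset
    estimate = service.get_cost_estimate(num_results=10, include_content=True)
    # search_cost 0.005 (<= 25 results) + content_cost 10 * 0.001 = 0.015 total
    print(estimate)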