Base code
402	backend/services/seo/competitive_analyzer.py	Normal file
@@ -0,0 +1,402 @@
"""
Competitive Analyzer Service

Leverages onboarding step 3 research data and combines it with GSC/Bing
query data to provide competitive insights. Superior to SEMrush/Ahrefs
because it uses actual user data and a personalized content strategy.
"""

from typing import Any, Dict, List, Set
from datetime import datetime
from sqlalchemy.orm import Session

from utils.logger_utils import get_service_logger
from services.onboarding.data_service import OnboardingDataService
from services.calendar_generation_datasource_framework.data_processing.comprehensive_user_data import ComprehensiveUserDataProcessor

logger = get_service_logger("competitive_analyzer")

class CompetitiveAnalyzer:
    """Analyzes competitive landscape using onboarding research data and analytics."""

    def __init__(self, db: Session):
        """Initialize the competitive analyzer."""
        self.db = db
        self.user_data_service = OnboardingDataService(db)
        self.comprehensive_processor = ComprehensiveUserDataProcessor(db)

    async def get_competitive_insights(self, user_id: str) -> Dict[str, Any]:
        """
        Get comprehensive competitive insights for a user.

        Args:
            user_id: User ID

        Returns:
            Dictionary containing competitive insights
        """
        try:
            # Get the user's research preferences and competitor list
            research_prefs = self.user_data_service.get_user_research_preferences(user_id)
            competitors = research_prefs.get('competitors', []) if research_prefs else []

            if not competitors:
                logger.info(f"No competitors found for user {user_id}")
                return {
                    "competitor_keywords": [],
                    "content_gaps": [],
                    "opportunity_score": 0,
                    "competitors_analyzed": 0,
                    "insights": [],
                    "last_updated": datetime.now().isoformat()
                }

            # Get comprehensive user data, including competitor analysis
            comprehensive_data = self.comprehensive_processor.get_comprehensive_user_data(user_id)
            competitor_analysis = comprehensive_data.get('competitor_analysis', {})

            # Extract competitor keywords and content topics
            competitor_keywords = self._extract_competitor_keywords(competitor_analysis, competitors)

            # Get the user's current keywords from GSC/Bing (passed in from
            # the dashboard service in a real implementation)
            user_keywords = self._get_user_keywords(user_id)

            # Find content gaps
            content_gaps = self._find_content_gaps(user_keywords, competitor_keywords)

            # Calculate opportunity score
            opportunity_score = self._calculate_opportunity_score(content_gaps, competitor_keywords)

            # Generate actionable insights
            insights = self._generate_insights(content_gaps, competitor_keywords, opportunity_score)

            return {
                "competitor_keywords": competitor_keywords,
                "content_gaps": content_gaps,
                "opportunity_score": opportunity_score,
                "competitors_analyzed": len(competitors),
                "insights": insights,
                "last_updated": datetime.now().isoformat()
            }

        except Exception as e:
            logger.error(f"Error getting competitive insights for user {user_id}: {e}")
            return {
                "competitor_keywords": [],
                "content_gaps": [],
                "opportunity_score": 0,
                "competitors_analyzed": 0,
                "insights": [],
                "last_updated": datetime.now().isoformat()
            }

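    # Example of the returned payload shape (keys as built above; values are
    # illustrative, not from a real run):
    #
    #     {
    #         "competitor_keywords": [{"keyword": "content calendar", ...}],
    #         "content_gaps": [{"keyword": "...", "opportunity_type": "high_priority", ...}],
    #         "opportunity_score": 56,
    #         "competitors_analyzed": 3,
    #         "insights": [{"type": "opportunity", "priority": "medium", ...}],
    #         "last_updated": "2024-01-01T00:00:00"
    #     }
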
    def _extract_competitor_keywords(self, competitor_analysis: Dict[str, Any], competitors: List[str]) -> List[Dict[str, Any]]:
        """Extract keywords from competitor analysis."""
        try:
            keywords = []

            # Extract from competitor analysis data
            for competitor_url in competitors:
                competitor_data = competitor_analysis.get(competitor_url, {})

                # Extract keywords from the available sources
                competitor_keywords = competitor_data.get('keywords', [])
                content_topics = competitor_data.get('content_topics', [])
                meta_keywords = competitor_data.get('meta_keywords', [])

                # Combine all keyword sources
                all_keywords = set()
                all_keywords.update(competitor_keywords)
                all_keywords.update(content_topics)
                all_keywords.update(meta_keywords)

                # Add to the keywords list with competitor attribution
                for keyword in all_keywords:
                    if keyword and keyword.strip():
                        keywords.append({
                            "keyword": keyword.strip(),
                            "competitor": competitor_url,
                            "source": "analysis",
                            "volume_estimate": competitor_data.get('keyword_volume', {}).get(keyword, 0),
                            "difficulty_estimate": competitor_data.get('keyword_difficulty', {}).get(keyword, 0),
                            "relevance_score": self._calculate_relevance_score(keyword, competitor_data)
                        })

            # Remove duplicates and sort by relevance
            unique_keywords = self._deduplicate_keywords(keywords)
            sorted_keywords = sorted(unique_keywords, key=lambda x: x['relevance_score'], reverse=True)

            logger.debug(f"Extracted {len(sorted_keywords)} unique competitor keywords")
            return sorted_keywords[:100]  # Limit to the top 100

        except Exception as e:
            logger.error(f"Error extracting competitor keywords: {e}")
            return []

    def _get_user_keywords(self, user_id: str) -> Set[str]:
        """Get the user's current keywords from GSC/Bing data."""
        try:
            # In a real implementation, this would fetch from the GSC/Bing
            # APIs. For now, return an empty set as a placeholder; the
            # dashboard service calls this with actual query data.
            return set()
        except Exception as e:
            logger.error(f"Error getting user keywords: {e}")
            return set()

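    # A minimal sketch of what the real fetch could look like, assuming the
    # caller supplies an authenticated google-api-python-client Search
    # Console service and the user's verified property URL (`gsc_service`
    # and `site_url` are hypothetical parameters, not part of this class):
    #
    #     response = gsc_service.searchanalytics().query(
    #         siteUrl=site_url,
    #         body={
    #             "startDate": "2024-01-01",
    #             "endDate": "2024-03-31",
    #             "dimensions": ["query"],
    #             "rowLimit": 1000,
    #         },
    #     ).execute()
    #     return {row["keys"][0] for row in response.get("rows", [])}
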
    def _find_content_gaps(self, user_keywords: Set[str], competitor_keywords: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Find content gaps between the user and competitors."""
        try:
            content_gaps = []
            user_keywords_lower = {kw.lower() for kw in user_keywords}

            for comp_keyword in competitor_keywords:
                keyword = comp_keyword['keyword'].lower()

                # Check whether the user already targets this keyword
                if keyword not in user_keywords_lower:
                    # Check for partial matches (related keywords)
                    is_related = any(
                        self._are_keywords_related(keyword, user_kw)
                        for user_kw in user_keywords_lower
                    )

                    if not is_related:
                        content_gaps.append({
                            "keyword": comp_keyword['keyword'],
                            "competitor": comp_keyword['competitor'],
                            "volume_estimate": comp_keyword.get('volume_estimate', 0),
                            "difficulty_estimate": comp_keyword.get('difficulty_estimate', 0),
                            "relevance_score": comp_keyword['relevance_score'],
                            "opportunity_type": self._classify_opportunity_type(comp_keyword),
                            "content_suggestion": self._generate_content_suggestion(comp_keyword)
                        })

            # Sort by opportunity score (volume * relevance / difficulty)
            sorted_gaps = sorted(
                content_gaps,
                key=lambda x: (x['volume_estimate'] * x['relevance_score']) / max(x['difficulty_estimate'], 1),
                reverse=True
            )

            logger.debug(f"Found {len(sorted_gaps)} content gaps")
            return sorted_gaps[:50]  # Limit to the top 50

        except Exception as e:
            logger.error(f"Error finding content gaps: {e}")
            return []

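    # Worked example of the ranking key: a gap with volume_estimate=800,
    # relevance_score=0.8 and difficulty_estimate=4 scores
    # (800 * 0.8) / 4 = 160, so it ranks above a gap with
    # volume_estimate=1000, relevance_score=0.5 and difficulty_estimate=8,
    # which scores (1000 * 0.5) / 8 = 62.5.
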
    def _calculate_opportunity_score(self, content_gaps: List[Dict[str, Any]], competitor_keywords: List[Dict[str, Any]]) -> int:
        """Calculate the overall opportunity score (0-100)."""
        try:
            if not content_gaps:
                return 0

            # Calculate average opportunity metrics
            avg_volume = sum(gap['volume_estimate'] for gap in content_gaps) / len(content_gaps)
            avg_relevance = sum(gap['relevance_score'] for gap in content_gaps) / len(content_gaps)
            avg_difficulty = sum(gap['difficulty_estimate'] for gap in content_gaps) / len(content_gaps)

            # Calculate the opportunity score: higher volume and relevance
            # raise it, lower difficulty raises it. Difficulty is assumed to
            # be on a 0-10 scale.
            volume_score = min(avg_volume / 1000, 1.0) * 40  # Max 40 points for volume
            relevance_score = avg_relevance * 30  # Max 30 points for relevance
            difficulty_score = max(0, (10 - avg_difficulty) / 10) * 30  # Max 30 points for low difficulty

            total_score = volume_score + relevance_score + difficulty_score
            opportunity_score = min(int(total_score), 100)

            logger.debug(f"Calculated opportunity score: {opportunity_score}")
            return opportunity_score

        except Exception as e:
            logger.error(f"Error calculating opportunity score: {e}")
            return 0

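    # Worked example: with avg_volume=500, avg_relevance=0.6 and
    # avg_difficulty=4, the components are min(500/1000, 1.0) * 40 = 20,
    # 0.6 * 30 = 18 and ((10 - 4) / 10) * 30 = 18, so the score is
    # min(int(20 + 18 + 18), 100) = 56.
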
    def _generate_insights(self, content_gaps: List[Dict[str, Any]], competitor_keywords: List[Dict[str, Any]], opportunity_score: int) -> List[Dict[str, Any]]:
        """Generate actionable insights from the competitive analysis."""
        try:
            insights = []

            # High opportunity score insight
            if opportunity_score > 70:
                insights.append({
                    "type": "opportunity",
                    "priority": "high",
                    "title": "High Competitive Opportunity",
                    "description": f"Your opportunity score is {opportunity_score}/100 - competitors are ranking for many keywords you're not targeting.",
                    "action": "Create content for the identified keyword gaps to capture more organic traffic."
                })
            elif opportunity_score > 40:
                insights.append({
                    "type": "opportunity",
                    "priority": "medium",
                    "title": "Moderate Competitive Opportunity",
                    "description": f"Your opportunity score is {opportunity_score}/100 - there are some keyword gaps you could target.",
                    "action": "Review the content gaps and prioritize high-volume, low-difficulty keywords."
                })

            # Content gap insights
            if content_gaps:
                high_volume_gaps = [gap for gap in content_gaps if gap['volume_estimate'] > 500]
                if high_volume_gaps:
                    insights.append({
                        "type": "content",
                        "priority": "high",
                        "title": "High-Volume Content Gaps",
                        "description": f"Found {len(high_volume_gaps)} high-volume keywords that competitors rank for but you don't.",
                        "action": "Create comprehensive content targeting these high-volume keywords."
                    })

                low_difficulty_gaps = [gap for gap in content_gaps if gap['difficulty_estimate'] < 3]
                if low_difficulty_gaps:
                    insights.append({
                        "type": "content",
                        "priority": "medium",
                        "title": "Low-Difficulty Content Gaps",
                        "description": f"Found {len(low_difficulty_gaps)} low-difficulty keywords that would be easy to rank for.",
                        "action": "Quick wins: create content for these low-difficulty keywords first."
                    })

            # Competitor analysis insights
            if competitor_keywords:
                top_competitors: Dict[str, int] = {}
                for kw in competitor_keywords:
                    # Deduplicated keywords carry a 'competitors' list; count
                    # every competitor that ranks for the keyword.
                    for competitor in kw.get('competitors', [kw['competitor']]):
                        top_competitors[competitor] = top_competitors.get(competitor, 0) + 1

                top_competitor = max(top_competitors.items(), key=lambda x: x[1]) if top_competitors else None
                if top_competitor:
                    insights.append({
                        "type": "competitive",
                        "priority": "medium",
                        "title": "Top Competitor Analysis",
                        "description": f"{top_competitor[0]} has the most keyword overlap with your content strategy.",
                        "action": f"Analyze {top_competitor[0]}'s content strategy for additional keyword opportunities."
                    })

            return insights

        except Exception as e:
            logger.error(f"Error generating insights: {e}")
            return []

    def _deduplicate_keywords(self, keywords: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Remove duplicate keywords and merge data."""
        try:
            keyword_map = {}

            for kw in keywords:
                keyword = kw['keyword'].lower()
                if keyword in keyword_map:
                    # Merge data from multiple competitors
                    existing = keyword_map[keyword]
                    existing['competitors'].append(kw['competitor'])
                    existing['volume_estimate'] = max(existing['volume_estimate'], kw['volume_estimate'])
                    existing['relevance_score'] = max(existing['relevance_score'], kw['relevance_score'])
                else:
                    keyword_map[keyword] = {
                        'keyword': kw['keyword'],
                        # Keep the first competitor as the primary attribution
                        # (downstream consumers read 'competitor') plus the
                        # full list under 'competitors'.
                        'competitor': kw['competitor'],
                        'competitors': [kw['competitor']],
                        'source': kw['source'],
                        'volume_estimate': kw['volume_estimate'],
                        'difficulty_estimate': kw['difficulty_estimate'],
                        'relevance_score': kw['relevance_score']
                    }

            return list(keyword_map.values())

        except Exception as e:
            logger.error(f"Error deduplicating keywords: {e}")
            return []

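    # Example of a merged record when 'content calendar' is found for two
    # competitors (URLs illustrative):
    #
    #     {'keyword': 'content calendar',
    #      'competitor': 'https://competitor-a.example',
    #      'competitors': ['https://competitor-a.example', 'https://competitor-b.example'],
    #      'source': 'analysis', 'volume_estimate': 900,
    #      'difficulty_estimate': 4, 'relevance_score': 0.8}
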
    def _calculate_relevance_score(self, keyword: str, competitor_data: Dict[str, Any]) -> float:
        """Calculate a relevance score for a keyword based on competitor data."""
        try:
            # Base relevance score
            relevance = 0.5

            # Increase relevance based on keyword frequency in competitor content
            content_frequency = competitor_data.get('content_frequency', {})
            if keyword in content_frequency:
                relevance += min(content_frequency[keyword] / 10, 0.3)

            # Increase relevance if the keyword appears in meta keywords
            meta_keywords = competitor_data.get('meta_keywords', [])
            if keyword in meta_keywords:
                relevance += 0.2

            # Increase relevance if the keyword appears in page titles
            titles = competitor_data.get('titles', [])
            if any(keyword.lower() in title.lower() for title in titles):
                relevance += 0.2

            # Clamp to the 0-1 range
            return min(relevance, 1.0)

        except Exception as e:
            logger.error(f"Error calculating relevance score: {e}")
            return 0.5

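    # Worked example: a keyword with content_frequency 5 gains
    # min(5/10, 0.3) = 0.3, and appearing in meta keywords adds 0.2, so
    # 0.5 + 0.3 + 0.2 = 1.0 - already at the cap before any title bonus.
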
    def _are_keywords_related(self, keyword1: str, keyword2: str) -> bool:
        """Check whether two keywords are related."""
        try:
            # Simple Jaccard word-overlap check - could be enhanced with NLP
            words1 = set(keyword1.lower().split())
            words2 = set(keyword2.lower().split())

            overlap = len(words1.intersection(words2))
            total_words = len(words1.union(words2))

            if total_words == 0:
                return False

            similarity = overlap / total_words
            return similarity > 0.3  # 30% word-overlap threshold

        except Exception as e:
            logger.error(f"Error checking keyword relatedness: {e}")
            return False

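    # Worked example: "seo tools" vs. "best seo tools" share the words
    # {"seo", "tools"} out of a union of {"best", "seo", "tools"}, giving a
    # similarity of 2/3 ≈ 0.67 > 0.3, so they count as related; "seo tools"
    # vs. "content calendar" share nothing and do not.
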
    def _classify_opportunity_type(self, keyword_data: Dict[str, Any]) -> str:
        """Classify the type of opportunity for a keyword."""
        try:
            volume = keyword_data.get('volume_estimate', 0)
            difficulty = keyword_data.get('difficulty_estimate', 0)
            relevance = keyword_data.get('relevance_score', 0)

            if volume > 1000 and difficulty < 5 and relevance > 0.7:
                return "high_priority"
            elif volume > 500 and difficulty < 7 and relevance > 0.5:
                return "medium_priority"
            elif volume > 100 and difficulty < 8:
                return "low_priority"
            else:
                return "long_term"

        except Exception as e:
            logger.error(f"Error classifying opportunity type: {e}")
            return "unknown"

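    # Worked example: volume_estimate=1200, difficulty_estimate=4 and
    # relevance_score=0.8 satisfies the first branch and classifies as
    # "high_priority"; volume_estimate=600, difficulty_estimate=6 and
    # relevance_score=0.6 falls through to "medium_priority".
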
    def _generate_content_suggestion(self, keyword_data: Dict[str, Any]) -> str:
        """Generate a content suggestion for a keyword."""
        try:
            keyword = keyword_data['keyword']
            opportunity_type = self._classify_opportunity_type(keyword_data)

            suggestions = {
                "high_priority": f"Create comprehensive, in-depth content targeting '{keyword}' - high volume, low difficulty opportunity.",
                "medium_priority": f"Consider creating content around '{keyword}' - good volume with moderate competition.",
                "low_priority": f"'{keyword}' could be a good long-tail keyword to target in future content.",
                "long_term": f"'{keyword}' might be worth monitoring for future content opportunities."
            }

            return suggestions.get(opportunity_type, f"Consider creating content around '{keyword}'.")

        except Exception as e:
            logger.error(f"Error generating content suggestion: {e}")
            return f"Consider creating content around '{keyword_data.get('keyword', 'this keyword')}'."
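
# A minimal usage sketch (the session factory import path and user id are
# hypothetical; get_competitive_insights is async, so it must be awaited):
#
#     import asyncio
#     from db.session import SessionLocal  # hypothetical import path
#
#     async def main() -> None:
#         db = SessionLocal()
#         try:
#             analyzer = CompetitiveAnalyzer(db)
#             insights = await analyzer.get_competitive_insights("user-123")
#             print(insights["opportunity_score"], len(insights["content_gaps"]))
#         finally:
#             db.close()
#
#     asyncio.run(main())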