moreminimore-marketing/backend/services/seo/competitive_analyzer.py
"""
Competitive Analyzer Service
Leverages onboarding step 3 research data and combines it with GSC/Bing
query data to provide competitive insights. Superior to SEMrush/Ahrefs
because it uses actual user data and personalized content strategy.
"""

from typing import Any, Dict, List, Set
from datetime import datetime

from sqlalchemy.orm import Session

from utils.logger_utils import get_service_logger
from services.onboarding.data_service import OnboardingDataService
from services.calendar_generation_datasource_framework.data_processing.comprehensive_user_data import ComprehensiveUserDataProcessor

logger = get_service_logger("competitive_analyzer")


class CompetitiveAnalyzer:
    """Analyzes the competitive landscape using onboarding research data and analytics."""

    def __init__(self, db: Session):
        """Initialize the competitive analyzer."""
        self.db = db
        self.user_data_service = OnboardingDataService(db)
        self.comprehensive_processor = ComprehensiveUserDataProcessor(db)

    async def get_competitive_insights(self, user_id: str) -> Dict[str, Any]:
        """
        Get comprehensive competitive insights for a user.

        Args:
            user_id: User ID

        Returns:
            Dictionary containing competitive insights
        """
        try:
            # Get the user's research preferences and competitor list
            research_prefs = self.user_data_service.get_user_research_preferences(user_id)
            competitors = research_prefs.get('competitors', []) if research_prefs else []

            if not competitors:
                logger.info(f"No competitors found for user {user_id}")
                return {
                    "competitor_keywords": [],
                    "content_gaps": [],
                    "opportunity_score": 0,
                    "competitors_analyzed": 0,
                    "last_updated": datetime.now().isoformat()
                }

            # Get comprehensive user data, including competitor analysis
            comprehensive_data = self.comprehensive_processor.get_comprehensive_user_data(user_id)
            competitor_analysis = comprehensive_data.get('competitor_analysis', {})

            # Extract competitor keywords and content topics
            competitor_keywords = self._extract_competitor_keywords(competitor_analysis, competitors)

            # Get the user's current keywords from GSC/Bing (would be passed in a real implementation)
            user_keywords = self._get_user_keywords(user_id)

            # Find content gaps
            content_gaps = self._find_content_gaps(user_keywords, competitor_keywords)

            # Calculate the overall opportunity score
            opportunity_score = self._calculate_opportunity_score(content_gaps, competitor_keywords)

            # Generate actionable insights
            insights = self._generate_insights(content_gaps, competitor_keywords, opportunity_score)

            return {
                "competitor_keywords": competitor_keywords,
                "content_gaps": content_gaps,
                "opportunity_score": opportunity_score,
                "competitors_analyzed": len(competitors),
                "insights": insights,
                "last_updated": datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Error getting competitive insights for user {user_id}: {e}")
            return {
                "competitor_keywords": [],
                "content_gaps": [],
                "opportunity_score": 0,
                "competitors_analyzed": 0,
                "insights": [],
                "last_updated": datetime.now().isoformat()
            }

    def _extract_competitor_keywords(self, competitor_analysis: Dict[str, Any], competitors: List[str]) -> List[Dict[str, Any]]:
        """Extract keywords from competitor analysis."""
        try:
            keywords = []

            # Collect keywords for each competitor in the analysis data
            for competitor_url in competitors:
                competitor_data = competitor_analysis.get(competitor_url, {})

                # Keywords can come from several sources
                competitor_keywords = competitor_data.get('keywords', [])
                content_topics = competitor_data.get('content_topics', [])
                meta_keywords = competitor_data.get('meta_keywords', [])

                # Combine all keyword sources, deduplicating within this competitor
                all_keywords = set()
                all_keywords.update(competitor_keywords)
                all_keywords.update(content_topics)
                all_keywords.update(meta_keywords)

                # Add to the keyword list with competitor attribution
                for keyword in all_keywords:
                    if keyword and keyword.strip():
                        keywords.append({
                            "keyword": keyword.strip(),
                            "competitor": competitor_url,
                            "source": "analysis",
                            "volume_estimate": competitor_data.get('keyword_volume', {}).get(keyword, 0),
                            "difficulty_estimate": competitor_data.get('keyword_difficulty', {}).get(keyword, 0),
                            "relevance_score": self._calculate_relevance_score(keyword, competitor_data)
                        })

            # Remove cross-competitor duplicates and sort by relevance
            unique_keywords = self._deduplicate_keywords(keywords)
            sorted_keywords = sorted(unique_keywords, key=lambda x: x['relevance_score'], reverse=True)

            logger.debug(f"Extracted {len(sorted_keywords)} unique competitor keywords")
            return sorted_keywords[:100]  # Limit to the top 100
        except Exception as e:
            logger.error(f"Error extracting competitor keywords: {e}")
            return []

    def _get_user_keywords(self, user_id: str) -> Set[str]:
        """Get the user's current keywords from GSC/Bing data."""
        try:
            # In a real implementation this would fetch from the GSC/Bing APIs;
            # for now, return an empty set as a placeholder. The dashboard
            # service would call this with actual query data.
            return set()
        except Exception as e:
            logger.error(f"Error getting user keywords: {e}")
            return set()
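
    # Illustrative only: a caller that already holds GSC/Bing query rows (the
    # row shape below is an assumption, not defined by this module) might
    # build the set as
    #     user_keywords = {row["query"] for row in query_rows if row.get("query")}
    # before passing it to _find_content_gaps().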

    def _find_content_gaps(self, user_keywords: Set[str], competitor_keywords: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Find content gaps between the user and competitors."""
        try:
            content_gaps = []
            user_keywords_lower = {kw.lower() for kw in user_keywords}

            for comp_keyword in competitor_keywords:
                keyword = comp_keyword['keyword'].lower()

                # A gap is a competitor keyword the user is not already targeting
                if keyword not in user_keywords_lower:
                    # Skip keywords closely related to ones the user already has
                    is_related = any(
                        self._are_keywords_related(keyword, user_kw)
                        for user_kw in user_keywords_lower
                    )
                    if not is_related:
                        content_gaps.append({
                            "keyword": comp_keyword['keyword'],
                            "competitor": comp_keyword['competitor'],
                            "volume_estimate": comp_keyword.get('volume_estimate', 0),
                            "difficulty_estimate": comp_keyword.get('difficulty_estimate', 0),
                            "relevance_score": comp_keyword['relevance_score'],
                            "opportunity_type": self._classify_opportunity_type(comp_keyword),
                            "content_suggestion": self._generate_content_suggestion(comp_keyword)
                        })

            # Sort by opportunity score (volume * relevance / difficulty)
            sorted_gaps = sorted(
                content_gaps,
                key=lambda x: (x['volume_estimate'] * x['relevance_score']) / max(x['difficulty_estimate'], 1),
                reverse=True
            )

            logger.debug(f"Found {len(sorted_gaps)} content gaps")
            return sorted_gaps[:50]  # Limit to the top 50
        except Exception as e:
            logger.error(f"Error finding content gaps: {e}")
            return []
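
    # Worked example for the gap ranking (illustrative numbers): a gap with
    # volume_estimate=600, relevance_score=0.8, and difficulty_estimate=4
    # scores 600 * 0.8 / 4 = 120, so it outranks a gap scoring
    # 300 * 0.9 / 3 = 90.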

    def _calculate_opportunity_score(self, content_gaps: List[Dict[str, Any]], competitor_keywords: List[Dict[str, Any]]) -> int:
        """Calculate the overall opportunity score (0-100)."""
        try:
            if not content_gaps:
                return 0

            # Average the opportunity metrics across all gaps
            avg_volume = sum(gap['volume_estimate'] for gap in content_gaps) / len(content_gaps)
            avg_relevance = sum(gap['relevance_score'] for gap in content_gaps) / len(content_gaps)
            avg_difficulty = sum(gap['difficulty_estimate'] for gap in content_gaps) / len(content_gaps)

            # Combine into a weighted score: higher volume and relevance raise it,
            # lower difficulty raises it (difficulty is assumed to be on a 0-10 scale)
            volume_score = min(avg_volume / 1000, 1.0) * 40  # Max 40 points for volume
            relevance_score = avg_relevance * 30  # Max 30 points for relevance
            difficulty_score = max(0, (10 - avg_difficulty) / 10) * 30  # Max 30 points for low difficulty

            total_score = volume_score + relevance_score + difficulty_score
            opportunity_score = min(int(total_score), 100)

            logger.debug(f"Calculated opportunity score: {opportunity_score}")
            return opportunity_score
        except Exception as e:
            logger.error(f"Error calculating opportunity score: {e}")
            return 0
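
    # Worked example (illustrative numbers): with avg_volume=800,
    # avg_relevance=0.6, and avg_difficulty=4, the components are
    # min(800/1000, 1.0)*40 = 32, 0.6*30 = 18, and ((10-4)/10)*30 = 18,
    # for an opportunity score of 68.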

    def _generate_insights(self, content_gaps: List[Dict[str, Any]], competitor_keywords: List[Dict[str, Any]], opportunity_score: int) -> List[Dict[str, Any]]:
        """Generate actionable insights from the competitive analysis."""
        try:
            insights = []

            # Opportunity-score insights
            if opportunity_score > 70:
                insights.append({
                    "type": "opportunity",
                    "priority": "high",
                    "title": "High Competitive Opportunity",
                    "description": f"Your opportunity score is {opportunity_score}% - competitors are ranking for many keywords you're not targeting.",
                    "action": "Create content for the identified keyword gaps to capture more organic traffic."
                })
            elif opportunity_score > 40:
                insights.append({
                    "type": "opportunity",
                    "priority": "medium",
                    "title": "Moderate Competitive Opportunity",
                    "description": f"Your opportunity score is {opportunity_score}% - there are some keyword gaps you could target.",
                    "action": "Review the content gaps and prioritize high-volume, low-difficulty keywords."
                })

            # Content-gap insights
            if content_gaps:
                high_volume_gaps = [gap for gap in content_gaps if gap['volume_estimate'] > 500]
                if high_volume_gaps:
                    insights.append({
                        "type": "content",
                        "priority": "high",
                        "title": "High-Volume Content Gaps",
                        "description": f"Found {len(high_volume_gaps)} high-volume keywords that competitors rank for but you don't.",
                        "action": "Create comprehensive content targeting these high-volume keywords."
                    })

                low_difficulty_gaps = [gap for gap in content_gaps if gap['difficulty_estimate'] < 3]
                if low_difficulty_gaps:
                    insights.append({
                        "type": "content",
                        "priority": "medium",
                        "title": "Low-Difficulty Content Gaps",
                        "description": f"Found {len(low_difficulty_gaps)} low-difficulty keywords that would be easy to rank for.",
                        "action": "Quick wins: Create content for these low-difficulty keywords first."
                    })

            # Competitor-level insights
            if competitor_keywords:
                # Count how many extracted keywords each competitor contributed
                top_competitors = {}
                for kw in competitor_keywords:
                    competitor = kw['competitor']
                    top_competitors[competitor] = top_competitors.get(competitor, 0) + 1

                top_competitor = max(top_competitors.items(), key=lambda x: x[1]) if top_competitors else None
                if top_competitor:
                    insights.append({
                        "type": "competitive",
                        "priority": "medium",
                        "title": "Top Competitor Analysis",
                        "description": f"{top_competitor[0]} has the most keyword overlap with your content strategy.",
                        "action": f"Analyze {top_competitor[0]}'s content strategy for additional keyword opportunities."
                    })

            return insights
        except Exception as e:
            logger.error(f"Error generating insights: {e}")
            return []

    def _deduplicate_keywords(self, keywords: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Remove duplicate keywords and merge data from multiple competitors."""
        try:
            keyword_map = {}
            for kw in keywords:
                keyword = kw['keyword'].lower()
                if keyword in keyword_map:
                    # Merge data from multiple competitors, keeping the strongest estimates
                    existing = keyword_map[keyword]
                    if kw['competitor'] not in existing['competitors']:
                        existing['competitors'].append(kw['competitor'])
                    existing['volume_estimate'] = max(existing['volume_estimate'], kw['volume_estimate'])
                    existing['relevance_score'] = max(existing['relevance_score'], kw['relevance_score'])
                else:
                    keyword_map[keyword] = {
                        'keyword': kw['keyword'],
                        # Keep a primary 'competitor' for downstream consumers
                        # (_find_content_gaps, _generate_insights) that expect a
                        # single attribution, plus the full list
                        'competitor': kw['competitor'],
                        'competitors': [kw['competitor']],
                        'source': kw['source'],
                        'volume_estimate': kw['volume_estimate'],
                        'difficulty_estimate': kw['difficulty_estimate'],
                        'relevance_score': kw['relevance_score']
                    }
            return list(keyword_map.values())
        except Exception as e:
            logger.error(f"Error deduplicating keywords: {e}")
            return []
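
    # Illustrative merge: if competitor A contributes "SEO Audit" (volume 400)
    # and competitor B contributes "seo audit" (volume 900), the two entries
    # merge into one keyword with competitors ['A', 'B'] and volume_estimate 900.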

    def _calculate_relevance_score(self, keyword: str, competitor_data: Dict[str, Any]) -> float:
        """Calculate a relevance score for a keyword based on competitor data."""
        try:
            # Base relevance score
            relevance = 0.5

            # Boost for keyword frequency in competitor content (up to +0.3)
            content_frequency = competitor_data.get('content_frequency', {})
            if keyword in content_frequency:
                relevance += min(content_frequency[keyword] / 10, 0.3)

            # Boost if the keyword appears in the meta keywords (+0.2)
            meta_keywords = competitor_data.get('meta_keywords', [])
            if keyword in meta_keywords:
                relevance += 0.2

            # Boost if the keyword appears in any page title (+0.2)
            titles = competitor_data.get('titles', [])
            if any(keyword.lower() in title.lower() for title in titles):
                relevance += 0.2

            # Clamp to the 0-1 range
            return min(relevance, 1.0)
        except Exception as e:
            logger.error(f"Error calculating relevance score: {e}")
            return 0.5

    def _are_keywords_related(self, keyword1: str, keyword2: str) -> bool:
        """Check whether two keywords are related."""
        try:
            # Simple Jaccard-style word-overlap check; could be enhanced with NLP
            words1 = set(keyword1.lower().split())
            words2 = set(keyword2.lower().split())

            overlap = len(words1.intersection(words2))
            total_words = len(words1.union(words2))
            if total_words == 0:
                return False

            similarity = overlap / total_words
            return similarity > 0.3  # 30% word-overlap threshold
        except Exception as e:
            logger.error(f"Error checking keyword relatedness: {e}")
            return False
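
    # Worked example: "seo audit checklist" vs. "technical seo audit" share
    # {"seo", "audit"}, so similarity = 2 / 4 = 0.5 > 0.3 and they are treated
    # as related; "pricing page" vs. "seo audit" share nothing and are not.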

    def _classify_opportunity_type(self, keyword_data: Dict[str, Any]) -> str:
        """Classify the type of opportunity a keyword represents."""
        try:
            volume = keyword_data.get('volume_estimate', 0)
            difficulty = keyword_data.get('difficulty_estimate', 0)
            relevance = keyword_data.get('relevance_score', 0)

            # Tier by volume, difficulty, and relevance thresholds
            if volume > 1000 and difficulty < 5 and relevance > 0.7:
                return "high_priority"
            elif volume > 500 and difficulty < 7 and relevance > 0.5:
                return "medium_priority"
            elif volume > 100 and difficulty < 8:
                return "low_priority"
            else:
                return "long_term"
        except Exception as e:
            logger.error(f"Error classifying opportunity type: {e}")
            return "unknown"

    def _generate_content_suggestion(self, keyword_data: Dict[str, Any]) -> str:
        """Generate a content suggestion for a keyword."""
        try:
            keyword = keyword_data['keyword']
            opportunity_type = self._classify_opportunity_type(keyword_data)

            suggestions = {
                "high_priority": f"Create comprehensive, in-depth content targeting '{keyword}' - high volume, low difficulty opportunity.",
                "medium_priority": f"Consider creating content around '{keyword}' - good volume with moderate competition.",
                "low_priority": f"'{keyword}' could be a good long-tail keyword to target in future content.",
                "long_term": f"'{keyword}' might be worth monitoring for future content opportunities."
            }
            return suggestions.get(opportunity_type, f"Consider creating content around '{keyword}'.")
        except Exception as e:
            logger.error(f"Error generating content suggestion: {e}")
            return f"Consider creating content around '{keyword_data.get('keyword', 'this keyword')}'."