Base code

Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions


@@ -0,0 +1,879 @@
"""
Blog Content SEO Analyzer
Specialized SEO analyzer for blog content with parallel processing.
Leverages existing non-AI SEO tools and uses a single AI prompt for structured analysis.
"""
import asyncio
import re
import textstat
from datetime import datetime
from typing import Dict, Any, List, Optional
from utils.logger_utils import get_service_logger
from services.seo_analyzer import (
ContentAnalyzer, KeywordAnalyzer,
URLStructureAnalyzer, AIInsightGenerator
)
from services.llm_providers.main_text_generation import llm_text_gen

# Service-specific logger (no global reconfiguration)
logger = get_service_logger("blog_content_seo_analyzer")


class BlogContentSEOAnalyzer:
"""Specialized SEO analyzer for blog content with parallel processing"""
def __init__(self):
"""Initialize the blog content SEO analyzer"""
self.content_analyzer = ContentAnalyzer()
self.keyword_analyzer = KeywordAnalyzer()
self.url_analyzer = URLStructureAnalyzer()
self.ai_insights = AIInsightGenerator()
logger.info("BlogContentSEOAnalyzer initialized")
    async def analyze_blog_content(self, blog_content: str, research_data: Dict[str, Any], blog_title: Optional[str] = None, user_id: Optional[str] = None) -> Dict[str, Any]:
"""
Main analysis method with parallel processing
Args:
blog_content: The blog content to analyze
research_data: Research data containing keywords and other insights
blog_title: Optional blog title
user_id: Clerk user ID for subscription checking (required)
Returns:
Comprehensive SEO analysis results
"""
if not user_id:
raise ValueError("user_id is required for subscription checking. Please provide Clerk user ID.")
try:
logger.info("Starting blog content SEO analysis")
# Extract keywords from research data
keywords_data = self._extract_keywords_from_research(research_data)
logger.info(f"Extracted keywords: {keywords_data}")
# Phase 1: Run non-AI analyzers in parallel
logger.info("Running non-AI analyzers in parallel")
non_ai_results = await self._run_non_ai_analyzers(blog_content, keywords_data)
# Phase 2: Single AI analysis for structured insights
logger.info("Running AI analysis")
ai_insights = await self._run_ai_analysis(blog_content, keywords_data, non_ai_results, user_id=user_id)
# Phase 3: Compile and format results
logger.info("Compiling results")
results = self._compile_blog_seo_results(non_ai_results, ai_insights, keywords_data)
logger.info(f"SEO analysis completed. Overall score: {results.get('overall_score', 0)}")
return results
except Exception as e:
logger.error(f"Blog SEO analysis failed: {e}")
# Fail fast - don't return fallback data
raise e
def _extract_keywords_from_research(self, research_data: Dict[str, Any]) -> Dict[str, Any]:
"""Extract keywords from research data"""
try:
logger.info(f"Extracting keywords from research data: {research_data}")
# Extract keywords from research data structure
keyword_analysis = research_data.get('keyword_analysis', {})
logger.info(f"Found keyword_analysis: {keyword_analysis}")
# Handle different possible structures
primary_keywords = []
long_tail_keywords = []
semantic_keywords = []
all_keywords = []
# Try to extract primary keywords from different possible locations
if 'primary' in keyword_analysis:
primary_keywords = keyword_analysis.get('primary', [])
elif 'keywords' in research_data:
# Fallback to top-level keywords
primary_keywords = research_data.get('keywords', [])
# Extract other keyword types
long_tail_keywords = keyword_analysis.get('long_tail', [])
# Handle both 'semantic' and 'semantic_keywords' field names
semantic_keywords = keyword_analysis.get('semantic', []) or keyword_analysis.get('semantic_keywords', [])
all_keywords = keyword_analysis.get('all_keywords', primary_keywords)
result = {
'primary': primary_keywords,
'long_tail': long_tail_keywords,
'semantic': semantic_keywords,
'all_keywords': all_keywords,
'search_intent': keyword_analysis.get('search_intent', 'informational')
}
logger.info(f"Extracted keywords: {result}")
return result
except Exception as e:
logger.error(f"Failed to extract keywords from research data: {e}")
logger.error(f"Research data structure: {research_data}")
# Fail fast - don't return empty keywords
raise ValueError(f"Keyword extraction failed: {e}")
async def _run_non_ai_analyzers(self, blog_content: str, keywords_data: Dict[str, Any]) -> Dict[str, Any]:
"""Run all non-AI analyzers in parallel for maximum performance"""
logger.info(f"Starting non-AI analyzers with content length: {len(blog_content)} chars")
logger.info(f"Keywords data: {keywords_data}")
# Parallel execution of fast analyzers
tasks = [
self._analyze_content_structure(blog_content),
self._analyze_keyword_usage(blog_content, keywords_data),
self._analyze_readability(blog_content),
self._analyze_content_quality(blog_content),
self._analyze_heading_structure(blog_content)
]
results = await asyncio.gather(*tasks, return_exceptions=True)
        # Define task names once, then check for exceptions and fail fast
        task_names = ['content_structure', 'keyword_analysis', 'readability_analysis', 'content_quality', 'heading_structure']
        for name, result in zip(task_names, results):
            if isinstance(result, Exception):
                logger.error(f"Task {name} failed: {result}")
                raise result
        # Log successful results
        for name, result in zip(task_names, results):
            logger.info(f"{name} completed: {type(result).__name__} with {len(result) if isinstance(result, dict) else 'N/A'} fields")
return {
'content_structure': results[0],
'keyword_analysis': results[1],
'readability_analysis': results[2],
'content_quality': results[3],
'heading_structure': results[4]
}
async def _analyze_content_structure(self, content: str) -> Dict[str, Any]:
"""Analyze blog content structure"""
try:
# Parse markdown content
lines = content.split('\n')
            # Count sections (H2 headings only), non-heading text lines (a paragraph proxy), and sentences
            sections = len([line for line in lines if line.startswith('## ')])
            paragraphs = len([line for line in lines if line.strip() and not line.startswith('#')])
            sentences = len(re.findall(r'[.!?]+', content))
# Blog-specific structure analysis
has_introduction = any('introduction' in line.lower() or 'overview' in line.lower()
for line in lines[:10])
has_conclusion = any('conclusion' in line.lower() or 'summary' in line.lower()
for line in lines[-10:])
has_cta = any('call to action' in line.lower() or 'learn more' in line.lower()
for line in lines)
structure_score = self._calculate_structure_score(sections, paragraphs, has_introduction, has_conclusion)
return {
'total_sections': sections,
'total_paragraphs': paragraphs,
'total_sentences': sentences,
'has_introduction': has_introduction,
'has_conclusion': has_conclusion,
'has_call_to_action': has_cta,
'structure_score': structure_score,
'recommendations': self._get_structure_recommendations(sections, has_introduction, has_conclusion)
}
except Exception as e:
logger.error(f"Content structure analysis failed: {e}")
raise e
async def _analyze_keyword_usage(self, content: str, keywords_data: Dict[str, Any]) -> Dict[str, Any]:
"""Analyze keyword usage and optimization"""
try:
# Extract keywords from research data
primary_keywords = keywords_data.get('primary', [])
long_tail_keywords = keywords_data.get('long_tail', [])
semantic_keywords = keywords_data.get('semantic', [])
# Use existing KeywordAnalyzer
keyword_result = self.keyword_analyzer.analyze(content, primary_keywords)
# Blog-specific keyword analysis
keyword_analysis = {
'primary_keywords': primary_keywords,
'long_tail_keywords': long_tail_keywords,
'semantic_keywords': semantic_keywords,
'keyword_density': {},
'keyword_distribution': {},
'missing_keywords': [],
'over_optimization': [],
'recommendations': []
}
# Analyze each keyword type
for keyword in primary_keywords:
density = self._calculate_keyword_density(content, keyword)
keyword_analysis['keyword_density'][keyword] = density
# Check if keyword appears in headings
in_headings = self._keyword_in_headings(content, keyword)
keyword_analysis['keyword_distribution'][keyword] = {
'density': density,
'in_headings': in_headings,
'first_occurrence': content.lower().find(keyword.lower())
}
# Check for missing important keywords
for keyword in primary_keywords:
if keyword.lower() not in content.lower():
keyword_analysis['missing_keywords'].append(keyword)
# Check for over-optimization
for keyword, density in keyword_analysis['keyword_density'].items():
if density > 3.0: # Over 3% density
keyword_analysis['over_optimization'].append(keyword)
return keyword_analysis
except Exception as e:
logger.error(f"Keyword analysis failed: {e}")
raise e
async def _analyze_readability(self, content: str) -> Dict[str, Any]:
"""Analyze content readability using textstat integration"""
try:
# Calculate readability metrics
readability_metrics = {
'flesch_reading_ease': textstat.flesch_reading_ease(content),
'flesch_kincaid_grade': textstat.flesch_kincaid_grade(content),
'gunning_fog': textstat.gunning_fog(content),
'smog_index': textstat.smog_index(content),
'automated_readability': textstat.automated_readability_index(content),
'coleman_liau': textstat.coleman_liau_index(content)
}
# Blog-specific readability analysis
avg_sentence_length = self._calculate_avg_sentence_length(content)
avg_paragraph_length = self._calculate_avg_paragraph_length(content)
readability_score = self._calculate_readability_score(readability_metrics)
return {
'metrics': readability_metrics,
'avg_sentence_length': avg_sentence_length,
'avg_paragraph_length': avg_paragraph_length,
'readability_score': readability_score,
'target_audience': self._determine_target_audience(readability_metrics),
'recommendations': self._get_readability_recommendations(readability_metrics, avg_sentence_length)
}
except Exception as e:
logger.error(f"Readability analysis failed: {e}")
raise e
async def _analyze_content_quality(self, content: str) -> Dict[str, Any]:
"""Analyze overall content quality"""
try:
# Word count analysis
words = content.split()
word_count = len(words)
# Content depth analysis
unique_words = len(set(word.lower() for word in words))
vocabulary_diversity = unique_words / word_count if word_count > 0 else 0
# Content flow analysis
transition_words = ['however', 'therefore', 'furthermore', 'moreover', 'additionally', 'consequently']
transition_count = sum(content.lower().count(word) for word in transition_words)
content_depth_score = self._calculate_content_depth_score(word_count, vocabulary_diversity)
flow_score = self._calculate_flow_score(transition_count, word_count)
return {
'word_count': word_count,
'unique_words': unique_words,
'vocabulary_diversity': vocabulary_diversity,
'transition_words_used': transition_count,
'content_depth_score': content_depth_score,
'flow_score': flow_score,
'recommendations': self._get_content_quality_recommendations(word_count, vocabulary_diversity, transition_count)
}
except Exception as e:
logger.error(f"Content quality analysis failed: {e}")
raise e
async def _analyze_heading_structure(self, content: str) -> Dict[str, Any]:
"""Analyze heading structure and hierarchy"""
try:
# Extract headings
h1_headings = re.findall(r'^# (.+)$', content, re.MULTILINE)
h2_headings = re.findall(r'^## (.+)$', content, re.MULTILINE)
h3_headings = re.findall(r'^### (.+)$', content, re.MULTILINE)
# Analyze heading structure
heading_hierarchy_score = self._calculate_heading_hierarchy_score(h1_headings, h2_headings, h3_headings)
return {
'h1_count': len(h1_headings),
'h2_count': len(h2_headings),
'h3_count': len(h3_headings),
'h1_headings': h1_headings,
'h2_headings': h2_headings,
'h3_headings': h3_headings,
'heading_hierarchy_score': heading_hierarchy_score,
'recommendations': self._get_heading_recommendations(h1_headings, h2_headings, h3_headings)
}
except Exception as e:
logger.error(f"Heading structure analysis failed: {e}")
raise e
# Helper methods for calculations and scoring
def _calculate_structure_score(self, sections: int, paragraphs: int, has_intro: bool, has_conclusion: bool) -> int:
"""Calculate content structure score"""
score = 0
# Section count (optimal: 3-8 sections)
if 3 <= sections <= 8:
score += 30
elif sections < 3:
score += 15
else:
score += 20
# Paragraph count (optimal: 8-20 paragraphs)
if 8 <= paragraphs <= 20:
score += 30
elif paragraphs < 8:
score += 15
else:
score += 20
# Introduction and conclusion
if has_intro:
score += 20
if has_conclusion:
score += 20
return min(score, 100)
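    # Worked example: 5 sections and 12 paragraphs with both an introduction and
    # a conclusion score 30 + 30 + 20 + 20 = 100; 2 sections and 5 paragraphs
    # with neither score 15 + 15 = 30.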
def _calculate_keyword_density(self, content: str, keyword: str) -> float:
"""Calculate keyword density percentage"""
content_lower = content.lower()
keyword_lower = keyword.lower()
word_count = len(content.split())
keyword_count = content_lower.count(keyword_lower)
return (keyword_count / word_count * 100) if word_count > 0 else 0
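    # Worked example: a 500-word post in which "seo tools" occurs 8 times has a
    # density of 8 / 500 * 100 = 1.6%, inside the 1-3% band that
    # _calculate_keyword_score treats as optimal. Note that count() matches
    # substrings, so occurrences inside longer phrases are also counted.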
def _keyword_in_headings(self, content: str, keyword: str) -> bool:
"""Check if keyword appears in headings"""
headings = re.findall(r'^#+ (.+)$', content, re.MULTILINE)
return any(keyword.lower() in heading.lower() for heading in headings)
def _calculate_avg_sentence_length(self, content: str) -> float:
"""Calculate average sentence length"""
sentences = re.split(r'[.!?]+', content)
sentences = [s.strip() for s in sentences if s.strip()]
if not sentences:
return 0
total_words = sum(len(sentence.split()) for sentence in sentences)
return total_words / len(sentences)
def _calculate_avg_paragraph_length(self, content: str) -> float:
"""Calculate average paragraph length"""
paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
if not paragraphs:
return 0
total_words = sum(len(paragraph.split()) for paragraph in paragraphs)
return total_words / len(paragraphs)
def _calculate_readability_score(self, metrics: Dict[str, float]) -> int:
"""Calculate overall readability score"""
# Flesch Reading Ease (0-100, higher is better)
flesch_score = metrics.get('flesch_reading_ease', 0)
# Convert to 0-100 scale
if flesch_score >= 80:
return 90
elif flesch_score >= 60:
return 80
elif flesch_score >= 40:
return 70
elif flesch_score >= 20:
return 60
else:
return 50
def _determine_target_audience(self, metrics: Dict[str, float]) -> str:
"""Determine target audience based on readability metrics"""
flesch_score = metrics.get('flesch_reading_ease', 0)
if flesch_score >= 80:
return "General audience (8th grade level)"
elif flesch_score >= 60:
return "High school level"
elif flesch_score >= 40:
return "College level"
else:
return "Graduate level"
def _calculate_content_depth_score(self, word_count: int, vocabulary_diversity: float) -> int:
"""Calculate content depth score"""
score = 0
# Word count (optimal: 800-2000 words)
if 800 <= word_count <= 2000:
score += 50
elif word_count < 800:
score += 30
else:
score += 40
# Vocabulary diversity (optimal: 0.4-0.7)
if 0.4 <= vocabulary_diversity <= 0.7:
score += 50
elif vocabulary_diversity < 0.4:
score += 30
else:
score += 40
return min(score, 100)
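    # Worked example: 1,200 words at a vocabulary diversity of 0.5 score
    # 50 + 50 = 100; a 500-word post at diversity 0.3 scores 30 + 30 = 60.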
def _calculate_flow_score(self, transition_count: int, word_count: int) -> int:
"""Calculate content flow score"""
if word_count == 0:
return 0
transition_density = transition_count / (word_count / 100)
# Optimal transition density: 1-3 per 100 words
if 1 <= transition_density <= 3:
return 90
elif transition_density < 1:
return 60
else:
return 70
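    # Worked example: 15 transition words in a 1,000-word post give a density of
    # 15 / (1000 / 100) = 1.5 per 100 words, which falls in the 1-3 band and
    # scores 90.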
def _calculate_heading_hierarchy_score(self, h1: List[str], h2: List[str], h3: List[str]) -> int:
"""Calculate heading hierarchy score"""
score = 0
# Should have exactly 1 H1
if len(h1) == 1:
score += 40
elif len(h1) == 0:
score += 20
else:
score += 10
# Should have 3-8 H2 headings
if 3 <= len(h2) <= 8:
score += 40
elif len(h2) < 3:
score += 20
else:
score += 30
# H3 headings are optional but good for structure
if len(h3) > 0:
score += 20
return min(score, 100)
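    # Worked example: exactly one H1, five H2s, and two H3s score
    # 40 + 40 + 20 = 100; three H1s and two H2s with no H3s score 10 + 20 = 30.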
def _calculate_keyword_score(self, keyword_analysis: Dict[str, Any]) -> int:
"""Calculate keyword optimization score"""
score = 0
# Check keyword density (optimal: 1-3%)
densities = keyword_analysis.get('keyword_density', {})
for keyword, density in densities.items():
if 1 <= density <= 3:
score += 30
elif density < 1:
score += 15
else:
score += 10
        # Check keyword distribution
        distributions = keyword_analysis.get('keyword_distribution', {})
        for keyword, dist in distributions.items():
            if dist.get('in_headings', False):
                score += 20
            first_occurrence = dist.get('first_occurrence', -1)
            if 0 <= first_occurrence < 100:  # Early occurrence (an absent keyword yields -1)
                score += 20
# Penalize missing keywords
missing = len(keyword_analysis.get('missing_keywords', []))
score -= missing * 10
# Penalize over-optimization
over_opt = len(keyword_analysis.get('over_optimization', []))
score -= over_opt * 15
return max(0, min(score, 100))
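    # Worked example: a single primary keyword at 2% density (+30), present in a
    # heading (+20), and first appearing within the first 100 characters (+20)
    # scores 70; each missing keyword then subtracts 10 and each over-optimized
    # keyword subtracts 15 before clamping to the 0-100 range.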
def _calculate_weighted_score(self, scores: Dict[str, int]) -> int:
"""Calculate weighted overall score"""
weights = {
'structure': 0.2,
'keywords': 0.25,
'readability': 0.2,
'quality': 0.15,
'headings': 0.1,
'ai_insights': 0.1
}
weighted_sum = sum(scores.get(key, 0) * weight for key, weight in weights.items())
return int(weighted_sum)
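    # Worked example: structure=80, keywords=60, readability=70, quality=90,
    # headings=100, ai_insights=50 gives
    # 80*0.2 + 60*0.25 + 70*0.2 + 90*0.15 + 100*0.1 + 50*0.1 = 73.5,
    # truncated to 73 by int().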
# Recommendation methods
def _get_structure_recommendations(self, sections: int, has_intro: bool, has_conclusion: bool) -> List[str]:
"""Get structure recommendations"""
recommendations = []
if sections < 3:
recommendations.append("Add more sections to improve content structure")
elif sections > 8:
recommendations.append("Consider combining some sections for better flow")
if not has_intro:
recommendations.append("Add an introduction section to set context")
if not has_conclusion:
recommendations.append("Add a conclusion section to summarize key points")
return recommendations
def _get_readability_recommendations(self, metrics: Dict[str, float], avg_sentence_length: float) -> List[str]:
"""Get readability recommendations"""
recommendations = []
flesch_score = metrics.get('flesch_reading_ease', 0)
if flesch_score < 60:
recommendations.append("Simplify language and use shorter sentences")
if avg_sentence_length > 20:
recommendations.append("Break down long sentences for better readability")
if flesch_score > 80:
recommendations.append("Consider adding more technical depth for expert audience")
return recommendations
def _get_content_quality_recommendations(self, word_count: int, vocabulary_diversity: float, transition_count: int) -> List[str]:
"""Get content quality recommendations"""
recommendations = []
if word_count < 800:
recommendations.append("Expand content with more detailed explanations")
elif word_count > 2000:
recommendations.append("Consider breaking into multiple posts")
if vocabulary_diversity < 0.4:
recommendations.append("Use more varied vocabulary to improve engagement")
if transition_count < 3:
recommendations.append("Add more transition words to improve flow")
return recommendations
def _get_heading_recommendations(self, h1: List[str], h2: List[str], h3: List[str]) -> List[str]:
"""Get heading recommendations"""
recommendations = []
if len(h1) == 0:
recommendations.append("Add a main H1 heading")
elif len(h1) > 1:
recommendations.append("Use only one H1 heading per post")
if len(h2) < 3:
recommendations.append("Add more H2 headings to structure content")
elif len(h2) > 8:
recommendations.append("Consider using H3 headings for better hierarchy")
return recommendations
    async def _run_ai_analysis(self, blog_content: str, keywords_data: Dict[str, Any], non_ai_results: Dict[str, Any], user_id: Optional[str] = None) -> Dict[str, Any]:
"""Run single AI analysis for structured insights (provider-agnostic)"""
if not user_id:
raise ValueError("user_id is required for subscription checking. Please provide Clerk user ID.")
try:
# Prepare context for AI analysis
context = {
'blog_content': blog_content,
'keywords_data': keywords_data,
'non_ai_results': non_ai_results
}
# Create AI prompt for structured analysis
prompt = self._create_ai_analysis_prompt(context)
schema = {
"type": "object",
"properties": {
"content_quality_insights": {
"type": "object",
"properties": {
"engagement_score": {"type": "number"},
"value_proposition": {"type": "string"},
"content_gaps": {"type": "array", "items": {"type": "string"}},
"improvement_suggestions": {"type": "array", "items": {"type": "string"}}
}
},
"seo_optimization_insights": {
"type": "object",
"properties": {
"keyword_optimization": {"type": "string"},
"content_relevance": {"type": "string"},
"search_intent_alignment": {"type": "string"},
"seo_improvements": {"type": "array", "items": {"type": "string"}}
}
},
"user_experience_insights": {
"type": "object",
"properties": {
"content_flow": {"type": "string"},
"readability_assessment": {"type": "string"},
"engagement_factors": {"type": "array", "items": {"type": "string"}},
"ux_improvements": {"type": "array", "items": {"type": "string"}}
}
},
"competitive_analysis": {
"type": "object",
"properties": {
"content_differentiation": {"type": "string"},
"unique_value": {"type": "string"},
"competitive_advantages": {"type": "array", "items": {"type": "string"}},
"market_positioning": {"type": "string"}
}
}
}
}
# Provider-agnostic structured response respecting GPT_PROVIDER
ai_response = llm_text_gen(
prompt=prompt,
json_struct=schema,
system_prompt=None,
user_id=user_id # Pass user_id for subscription checking
)
return ai_response
except Exception as e:
logger.error(f"AI analysis failed: {e}")
raise e
def _create_ai_analysis_prompt(self, context: Dict[str, Any]) -> str:
"""Create AI analysis prompt"""
blog_content = context['blog_content']
keywords_data = context['keywords_data']
non_ai_results = context['non_ai_results']
prompt = f"""
Analyze this blog content for SEO optimization and user experience. Provide structured insights based on the content and keyword data.
BLOG CONTENT:
{blog_content[:2000]}...
KEYWORDS DATA:
Primary Keywords: {keywords_data.get('primary', [])}
Long-tail Keywords: {keywords_data.get('long_tail', [])}
Semantic Keywords: {keywords_data.get('semantic', [])}
Search Intent: {keywords_data.get('search_intent', 'informational')}
NON-AI ANALYSIS RESULTS:
Structure Score: {non_ai_results.get('content_structure', {}).get('structure_score', 0)}
Readability Score: {non_ai_results.get('readability_analysis', {}).get('readability_score', 0)}
Content Quality Score: {non_ai_results.get('content_quality', {}).get('content_depth_score', 0)}
Please provide:
1. Content Quality Insights: Assess engagement potential, value proposition, content gaps, and improvement suggestions
2. SEO Optimization Insights: Evaluate keyword optimization, content relevance, search intent alignment, and SEO improvements
3. User Experience Insights: Analyze content flow, readability, engagement factors, and UX improvements
4. Competitive Analysis: Identify content differentiation, unique value, competitive advantages, and market positioning
Focus on actionable insights that can improve the blog's performance and user engagement.
"""
return prompt
def _compile_blog_seo_results(self, non_ai_results: Dict[str, Any], ai_insights: Dict[str, Any], keywords_data: Dict[str, Any]) -> Dict[str, Any]:
"""Compile comprehensive SEO analysis results"""
try:
# Validate required data - fail fast if missing
if not non_ai_results:
raise ValueError("Non-AI analysis results are missing")
if not ai_insights:
raise ValueError("AI insights are missing")
# Calculate category scores
category_scores = {
'structure': non_ai_results.get('content_structure', {}).get('structure_score', 0),
'keywords': self._calculate_keyword_score(non_ai_results.get('keyword_analysis', {})),
'readability': non_ai_results.get('readability_analysis', {}).get('readability_score', 0),
'quality': non_ai_results.get('content_quality', {}).get('content_depth_score', 0),
'headings': non_ai_results.get('heading_structure', {}).get('heading_hierarchy_score', 0),
'ai_insights': ai_insights.get('content_quality_insights', {}).get('engagement_score', 0)
}
# Calculate overall score
overall_score = self._calculate_weighted_score(category_scores)
# Compile actionable recommendations
actionable_recommendations = self._compile_actionable_recommendations(non_ai_results, ai_insights)
# Create visualization data
visualization_data = self._create_visualization_data(category_scores, non_ai_results)
return {
'overall_score': overall_score,
'category_scores': category_scores,
'detailed_analysis': non_ai_results,
'ai_insights': ai_insights,
'keywords_data': keywords_data,
'visualization_data': visualization_data,
'actionable_recommendations': actionable_recommendations,
'generated_at': datetime.utcnow().isoformat(),
'analysis_summary': self._create_analysis_summary(overall_score, category_scores, ai_insights)
}
except Exception as e:
logger.error(f"Results compilation failed: {e}")
# Fail fast - don't return fallback data
raise e
def _compile_actionable_recommendations(self, non_ai_results: Dict[str, Any], ai_insights: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Compile actionable recommendations from all sources"""
recommendations = []
# Structure recommendations
structure_recs = non_ai_results.get('content_structure', {}).get('recommendations', [])
for rec in structure_recs:
recommendations.append({
'category': 'Structure',
'priority': 'High',
'recommendation': rec,
'impact': 'Improves content organization and user experience'
})
# Keyword recommendations
keyword_recs = non_ai_results.get('keyword_analysis', {}).get('recommendations', [])
for rec in keyword_recs:
recommendations.append({
'category': 'Keywords',
'priority': 'High',
'recommendation': rec,
'impact': 'Improves search engine visibility'
})
# Readability recommendations
readability_recs = non_ai_results.get('readability_analysis', {}).get('recommendations', [])
for rec in readability_recs:
recommendations.append({
'category': 'Readability',
'priority': 'Medium',
'recommendation': rec,
'impact': 'Improves user engagement and comprehension'
})
# AI insights recommendations
ai_recs = ai_insights.get('content_quality_insights', {}).get('improvement_suggestions', [])
for rec in ai_recs:
recommendations.append({
'category': 'Content Quality',
'priority': 'Medium',
'recommendation': rec,
'impact': 'Enhances content value and engagement'
})
return recommendations
def _create_visualization_data(self, category_scores: Dict[str, int], non_ai_results: Dict[str, Any]) -> Dict[str, Any]:
"""Create data for visualization components"""
return {
'score_radar': {
'categories': list(category_scores.keys()),
'scores': list(category_scores.values()),
'max_score': 100
},
'keyword_analysis': {
'densities': non_ai_results.get('keyword_analysis', {}).get('keyword_density', {}),
'missing_keywords': non_ai_results.get('keyword_analysis', {}).get('missing_keywords', []),
'over_optimization': non_ai_results.get('keyword_analysis', {}).get('over_optimization', [])
},
'readability_metrics': non_ai_results.get('readability_analysis', {}).get('metrics', {}),
'content_stats': {
'word_count': non_ai_results.get('content_quality', {}).get('word_count', 0),
'sections': non_ai_results.get('content_structure', {}).get('total_sections', 0),
'paragraphs': non_ai_results.get('content_structure', {}).get('total_paragraphs', 0)
}
}
def _create_analysis_summary(self, overall_score: int, category_scores: Dict[str, int], ai_insights: Dict[str, Any]) -> Dict[str, Any]:
"""Create analysis summary"""
# Determine overall grade
if overall_score >= 90:
grade = 'A'
status = 'Excellent'
elif overall_score >= 80:
grade = 'B'
status = 'Good'
elif overall_score >= 70:
grade = 'C'
status = 'Fair'
elif overall_score >= 60:
grade = 'D'
status = 'Needs Improvement'
else:
grade = 'F'
status = 'Poor'
# Find strongest and weakest categories
strongest_category = max(category_scores.items(), key=lambda x: x[1])
weakest_category = min(category_scores.items(), key=lambda x: x[1])
return {
'overall_grade': grade,
'status': status,
'strongest_category': strongest_category[0],
'weakest_category': weakest_category[0],
'key_strengths': self._identify_key_strengths(category_scores),
'key_weaknesses': self._identify_key_weaknesses(category_scores),
'ai_summary': ai_insights.get('content_quality_insights', {}).get('value_proposition', '')
}
def _identify_key_strengths(self, category_scores: Dict[str, int]) -> List[str]:
"""Identify key strengths"""
strengths = []
for category, score in category_scores.items():
if score >= 80:
strengths.append(f"Strong {category} optimization")
return strengths
def _identify_key_weaknesses(self, category_scores: Dict[str, int]) -> List[str]:
"""Identify key weaknesses"""
weaknesses = []
for category, score in category_scores.items():
if score < 60:
weaknesses.append(f"Needs improvement in {category}")
return weaknesses
def _create_error_result(self, error_message: str) -> Dict[str, Any]:
"""Create error result - this should not be used in fail-fast mode"""
raise ValueError(f"Error result creation not allowed in fail-fast mode: {error_message}")


@@ -0,0 +1,668 @@
"""
Blog SEO Metadata Generator
Optimized SEO metadata generation service that uses at most two AI calls
to generate comprehensive metadata including titles, descriptions,
Open Graph tags, Twitter cards, and structured data.
"""
import asyncio
import re
from datetime import datetime
from typing import Dict, Any, List, Optional
from loguru import logger
from services.llm_providers.main_text_generation import llm_text_gen
class BlogSEOMetadataGenerator:
"""Optimized SEO metadata generator with maximum 2 AI calls"""
def __init__(self):
"""Initialize the metadata generator"""
logger.info("BlogSEOMetadataGenerator initialized")
async def generate_comprehensive_metadata(
self,
blog_content: str,
blog_title: str,
research_data: Dict[str, Any],
outline: Optional[List[Dict[str, Any]]] = None,
seo_analysis: Optional[Dict[str, Any]] = None,
        user_id: Optional[str] = None
) -> Dict[str, Any]:
"""
        Generate comprehensive SEO metadata using at most two AI calls
Args:
blog_content: The blog content to analyze
blog_title: The blog title
research_data: Research data containing keywords and insights
outline: Outline structure with sections and headings
seo_analysis: SEO analysis results from previous phase
user_id: Clerk user ID for subscription checking (required)
Returns:
Comprehensive metadata including all SEO elements
"""
if not user_id:
raise ValueError("user_id is required for subscription checking. Please provide Clerk user ID.")
try:
logger.info("Starting comprehensive SEO metadata generation")
# Extract keywords and context from research data
keywords_data = self._extract_keywords_from_research(research_data)
logger.info(f"Extracted keywords: {keywords_data}")
# Call 1: Generate core SEO metadata (parallel with Call 2)
logger.info("Generating core SEO metadata")
core_metadata_task = self._generate_core_metadata(
blog_content, blog_title, keywords_data, outline, seo_analysis, user_id=user_id
)
# Call 2: Generate social media and structured data (parallel with Call 1)
logger.info("Generating social media and structured data")
social_metadata_task = self._generate_social_metadata(
blog_content, blog_title, keywords_data, outline, seo_analysis, user_id=user_id
)
# Wait for both calls to complete
core_metadata, social_metadata = await asyncio.gather(
core_metadata_task,
social_metadata_task
)
# Compile final response
results = self._compile_metadata_response(core_metadata, social_metadata, blog_title)
logger.info(f"SEO metadata generation completed successfully")
return results
except Exception as e:
logger.error(f"SEO metadata generation failed: {e}")
# Fail fast - don't return fallback data
raise e
def _extract_keywords_from_research(self, research_data: Dict[str, Any]) -> Dict[str, Any]:
"""Extract keywords and context from research data"""
try:
keyword_analysis = research_data.get('keyword_analysis', {})
# Handle both 'semantic' and 'semantic_keywords' field names
semantic_keywords = keyword_analysis.get('semantic', []) or keyword_analysis.get('semantic_keywords', [])
return {
'primary_keywords': keyword_analysis.get('primary', []),
'long_tail_keywords': keyword_analysis.get('long_tail', []),
'semantic_keywords': semantic_keywords,
'all_keywords': keyword_analysis.get('all_keywords', []),
'search_intent': keyword_analysis.get('search_intent', 'informational'),
'target_audience': research_data.get('target_audience', 'general'),
'industry': research_data.get('industry', 'general')
}
except Exception as e:
logger.error(f"Failed to extract keywords from research: {e}")
return {
'primary_keywords': [],
'long_tail_keywords': [],
'semantic_keywords': [],
'all_keywords': [],
'search_intent': 'informational',
'target_audience': 'general',
'industry': 'general'
}
async def _generate_core_metadata(
self,
blog_content: str,
blog_title: str,
keywords_data: Dict[str, Any],
outline: Optional[List[Dict[str, Any]]] = None,
seo_analysis: Optional[Dict[str, Any]] = None,
        user_id: Optional[str] = None
) -> Dict[str, Any]:
"""Generate core SEO metadata (Call 1)"""
if not user_id:
raise ValueError("user_id is required for subscription checking. Please provide Clerk user ID.")
try:
# Create comprehensive prompt for core metadata
prompt = self._create_core_metadata_prompt(
blog_content, blog_title, keywords_data, outline, seo_analysis
)
# Define simplified structured schema for core metadata
schema = {
"type": "object",
"properties": {
"seo_title": {
"type": "string",
"description": "SEO-optimized title (50-60 characters)"
},
"meta_description": {
"type": "string",
"description": "Meta description (150-160 characters)"
},
"url_slug": {
"type": "string",
"description": "URL slug (lowercase, hyphens)"
},
"blog_tags": {
"type": "array",
"items": {"type": "string"},
"description": "Blog tags array"
},
"blog_categories": {
"type": "array",
"items": {"type": "string"},
"description": "Blog categories array"
},
"social_hashtags": {
"type": "array",
"items": {"type": "string"},
"description": "Social media hashtags array"
},
"reading_time": {
"type": "integer",
"description": "Reading time in minutes"
},
"focus_keyword": {
"type": "string",
"description": "Primary focus keyword"
}
},
"required": ["seo_title", "meta_description", "url_slug", "blog_tags", "blog_categories", "social_hashtags", "reading_time", "focus_keyword"]
}
# Get structured response using provider-agnostic llm_text_gen
ai_response_raw = llm_text_gen(
prompt=prompt,
json_struct=schema,
system_prompt=None,
user_id=user_id # Pass user_id for subscription checking
)
# Handle response: llm_text_gen may return dict (from structured JSON) or str (needs parsing)
ai_response = ai_response_raw
if isinstance(ai_response_raw, str):
try:
import json
ai_response = json.loads(ai_response_raw)
except json.JSONDecodeError:
logger.error(f"Failed to parse JSON response: {ai_response_raw[:200]}...")
ai_response = None
# Check if we got a valid response
if not ai_response or not isinstance(ai_response, dict):
logger.error("Core metadata generation failed: Invalid response from LLM")
# Return fallback response
primary_keywords = ', '.join(keywords_data.get('primary_keywords', ['content']))
word_count = len(blog_content.split())
return {
'seo_title': blog_title,
'meta_description': f'Learn about {primary_keywords.split(", ")[0] if primary_keywords else "this topic"}.',
'url_slug': blog_title.lower().replace(' ', '-').replace(':', '').replace(',', '')[:50],
'blog_tags': primary_keywords.split(', ') if primary_keywords else ['content'],
'blog_categories': ['Content Marketing', 'Technology'],
'social_hashtags': ['#content', '#marketing', '#technology'],
'reading_time': max(1, word_count // 200),
'focus_keyword': primary_keywords.split(', ')[0] if primary_keywords else 'content'
}
logger.info(f"Core metadata generation completed. Response keys: {list(ai_response.keys())}")
logger.info(f"Core metadata response: {ai_response}")
return ai_response
except Exception as e:
logger.error(f"Core metadata generation failed: {e}")
raise e
async def _generate_social_metadata(
self,
blog_content: str,
blog_title: str,
keywords_data: Dict[str, Any],
outline: Optional[List[Dict[str, Any]]] = None,
seo_analysis: Optional[Dict[str, Any]] = None,
        user_id: Optional[str] = None
) -> Dict[str, Any]:
"""Generate social media and structured data (Call 2)"""
if not user_id:
raise ValueError("user_id is required for subscription checking. Please provide Clerk user ID.")
try:
# Create comprehensive prompt for social metadata
prompt = self._create_social_metadata_prompt(
blog_content, blog_title, keywords_data, outline, seo_analysis
)
# Define simplified structured schema for social metadata
schema = {
"type": "object",
"properties": {
"open_graph": {
"type": "object",
"properties": {
"title": {"type": "string"},
"description": {"type": "string"},
"image": {"type": "string"},
"type": {"type": "string"},
"site_name": {"type": "string"},
"url": {"type": "string"}
}
},
"twitter_card": {
"type": "object",
"properties": {
"card": {"type": "string"},
"title": {"type": "string"},
"description": {"type": "string"},
"image": {"type": "string"},
"site": {"type": "string"},
"creator": {"type": "string"}
}
},
"json_ld_schema": {
"type": "object",
"properties": {
"@context": {"type": "string"},
"@type": {"type": "string"},
"headline": {"type": "string"},
"description": {"type": "string"},
"author": {"type": "object"},
"publisher": {"type": "object"},
"datePublished": {"type": "string"},
"dateModified": {"type": "string"},
"mainEntityOfPage": {"type": "string"},
"keywords": {"type": "array"},
"wordCount": {"type": "integer"}
}
}
},
"required": ["open_graph", "twitter_card", "json_ld_schema"]
}
# Get structured response using provider-agnostic llm_text_gen
ai_response_raw = llm_text_gen(
prompt=prompt,
json_struct=schema,
system_prompt=None,
user_id=user_id # Pass user_id for subscription checking
)
# Handle response: llm_text_gen may return dict (from structured JSON) or str (needs parsing)
ai_response = ai_response_raw
if isinstance(ai_response_raw, str):
try:
import json
ai_response = json.loads(ai_response_raw)
except json.JSONDecodeError:
logger.error(f"Failed to parse JSON response: {ai_response_raw[:200]}...")
ai_response = None
# Check if we got a valid response
if not ai_response or not isinstance(ai_response, dict) or not ai_response.get('open_graph') or not ai_response.get('twitter_card') or not ai_response.get('json_ld_schema'):
logger.error("Social metadata generation failed: Invalid or empty response from LLM")
# Return fallback response
return {
'open_graph': {
'title': blog_title,
'description': f'Learn about {keywords_data.get("primary_keywords", ["this topic"])[0] if keywords_data.get("primary_keywords") else "this topic"}.',
'image': 'https://example.com/image.jpg',
'type': 'article',
'site_name': 'Your Website',
'url': 'https://example.com/blog'
},
'twitter_card': {
'card': 'summary_large_image',
'title': blog_title,
'description': f'Learn about {keywords_data.get("primary_keywords", ["this topic"])[0] if keywords_data.get("primary_keywords") else "this topic"}.',
'image': 'https://example.com/image.jpg',
'site': '@yourwebsite',
'creator': '@author'
},
'json_ld_schema': {
'@context': 'https://schema.org',
'@type': 'Article',
'headline': blog_title,
'description': f'Learn about {keywords_data.get("primary_keywords", ["this topic"])[0] if keywords_data.get("primary_keywords") else "this topic"}.',
'author': {'@type': 'Person', 'name': 'Author Name'},
'publisher': {'@type': 'Organization', 'name': 'Your Website'},
'datePublished': '2025-01-01T00:00:00Z',
'dateModified': '2025-01-01T00:00:00Z',
'mainEntityOfPage': 'https://example.com/blog',
'keywords': keywords_data.get('primary_keywords', ['content']),
'wordCount': len(blog_content.split())
}
}
logger.info(f"Social metadata generation completed. Response keys: {list(ai_response.keys())}")
logger.info(f"Open Graph data: {ai_response.get('open_graph', 'Not found')}")
logger.info(f"Twitter Card data: {ai_response.get('twitter_card', 'Not found')}")
logger.info(f"JSON-LD data: {ai_response.get('json_ld_schema', 'Not found')}")
return ai_response
except Exception as e:
logger.error(f"Social metadata generation failed: {e}")
raise e
def _extract_content_highlights(self, blog_content: str, max_length: int = 2500) -> str:
"""Extract key sections from blog content for prompt context"""
try:
lines = blog_content.split('\n')
# Get first paragraph (introduction)
intro = ""
for line in lines[:20]:
if line.strip() and not line.strip().startswith('#'):
intro += line.strip() + " "
if len(intro) > 300:
break
# Get section headings
headings = [line.strip() for line in lines if line.strip().startswith('##')][:6]
# Get conclusion if available
conclusion = ""
for line in reversed(lines[-20:]):
if line.strip() and not line.strip().startswith('#'):
conclusion = line.strip() + " " + conclusion
if len(conclusion) > 300:
break
highlights = f"INTRODUCTION: {intro[:300]}...\n\n"
highlights += f"SECTION HEADINGS: {' | '.join([h.replace('##', '').strip() for h in headings])}\n\n"
if conclusion:
highlights += f"CONCLUSION: {conclusion[:300]}..."
return highlights[:max_length]
except Exception as e:
logger.warning(f"Failed to extract content highlights: {e}")
return blog_content[:2000] + "..."
def _create_core_metadata_prompt(
self,
blog_content: str,
blog_title: str,
keywords_data: Dict[str, Any],
outline: Optional[List[Dict[str, Any]]] = None,
seo_analysis: Optional[Dict[str, Any]] = None
) -> str:
"""Create high-quality prompt for core metadata generation"""
primary_keywords = ", ".join(keywords_data.get('primary_keywords', []))
semantic_keywords = ", ".join(keywords_data.get('semantic_keywords', []))
search_intent = keywords_data.get('search_intent', 'informational')
target_audience = keywords_data.get('target_audience', 'general')
industry = keywords_data.get('industry', 'general')
word_count = len(blog_content.split())
# Extract outline structure
outline_context = ""
if outline:
headings = [s.get('heading', '') for s in outline if s.get('heading')]
outline_context = f"""
OUTLINE STRUCTURE:
- Total sections: {len(outline)}
- Section headings: {', '.join(headings[:8])}
- Content hierarchy: Well-structured with {len(outline)} main sections
"""
# Extract SEO analysis insights
seo_context = ""
if seo_analysis:
overall_score = seo_analysis.get('overall_score', seo_analysis.get('seo_score', 0))
category_scores = seo_analysis.get('category_scores', {})
applied_recs = seo_analysis.get('applied_recommendations', [])
seo_context = f"""
SEO ANALYSIS RESULTS:
- Overall SEO Score: {overall_score}/100
- Category Scores: Structure {category_scores.get('structure', category_scores.get('Structure', 0))}, Keywords {category_scores.get('keywords', category_scores.get('Keywords', 0))}, Readability {category_scores.get('readability', category_scores.get('Readability', 0))}
- Applied Recommendations: {len(applied_recs)} SEO optimizations have been applied
- Content Quality: Optimized for search engines with keyword focus
"""
# Get more content context (key sections instead of just first 1000 chars)
content_preview = self._extract_content_highlights(blog_content)
prompt = f"""
Generate comprehensive, personalized SEO metadata for this blog post.
=== BLOG CONTENT CONTEXT ===
TITLE: {blog_title}
CONTENT PREVIEW (key sections): {content_preview}
WORD COUNT: {word_count} words
READING TIME ESTIMATE: {max(1, word_count // 200)} minutes
{outline_context}
=== KEYWORD & AUDIENCE DATA ===
PRIMARY KEYWORDS: {primary_keywords}
SEMANTIC KEYWORDS: {semantic_keywords}
SEARCH INTENT: {search_intent}
TARGET AUDIENCE: {target_audience}
INDUSTRY: {industry}
{seo_context}
=== METADATA GENERATION REQUIREMENTS ===
1. SEO TITLE (50-60 characters, must include primary keyword):
- Front-load primary keyword
- Make it compelling and click-worthy
- Include power words if appropriate for {target_audience} audience
- Optimized for {search_intent} search intent
2. META DESCRIPTION (150-160 characters, must include CTA):
- Include primary keyword naturally in first 120 chars
- Add compelling call-to-action (e.g., "Learn more", "Discover how", "Get started")
- Highlight value proposition for {target_audience} audience
- Use {industry} industry-specific terminology where relevant
3. URL SLUG (lowercase, hyphens, 3-5 words):
- Include primary keyword
- Remove stop words
- Keep it concise and readable
4. BLOG TAGS (5-8 relevant tags):
- Mix of primary, semantic, and long-tail keywords
- Industry-specific tags for {industry}
- Audience-relevant tags for {target_audience}
5. BLOG CATEGORIES (2-3 categories):
- Based on content structure and {industry} industry standards
- Reflect main themes from outline sections
6. SOCIAL HASHTAGS (5-10 hashtags with #):
- Include primary keyword as hashtag
- Industry-specific hashtags for {industry}
- Trending/relevant hashtags for {target_audience}
7. READING TIME (calculate from {word_count} words):
- Average reading speed: 200 words/minute
- Round to nearest minute
8. FOCUS KEYWORD (primary keyword for SEO):
- Select the most important primary keyword
- Should match the main topic and search intent
=== QUALITY REQUIREMENTS ===
- All metadata must be unique, not generic
- Incorporate insights from SEO analysis if provided
- Reflect the actual content structure from outline
- Use language appropriate for {target_audience} audience
- Optimize for {search_intent} search intent
- Make descriptions compelling and action-oriented
Generate metadata that is personalized, compelling, and SEO-optimized.
"""
return prompt
def _create_social_metadata_prompt(
self,
blog_content: str,
blog_title: str,
keywords_data: Dict[str, Any],
outline: Optional[List[Dict[str, Any]]] = None,
seo_analysis: Optional[Dict[str, Any]] = None
) -> str:
"""Create high-quality prompt for social metadata generation"""
primary_keywords = ", ".join(keywords_data.get('primary_keywords', []))
search_intent = keywords_data.get('search_intent', 'informational')
target_audience = keywords_data.get('target_audience', 'general')
industry = keywords_data.get('industry', 'general')
current_date = datetime.now().isoformat()
# Add outline and SEO context similar to core metadata prompt
outline_context = ""
if outline:
headings = [s.get('heading', '') for s in outline if s.get('heading')]
outline_context = f"\nOUTLINE SECTIONS: {', '.join(headings[:6])}\n"
seo_context = ""
if seo_analysis:
overall_score = seo_analysis.get('overall_score', seo_analysis.get('seo_score', 0))
seo_context = f"\nSEO SCORE: {overall_score}/100 (optimized content)\n"
content_preview = self._extract_content_highlights(blog_content, 1500)
prompt = f"""
Generate engaging social media metadata for this blog post.
=== CONTENT ===
TITLE: {blog_title}
CONTENT: {content_preview}
{outline_context}
{seo_context}
KEYWORDS: {primary_keywords}
TARGET AUDIENCE: {target_audience}
INDUSTRY: {industry}
CURRENT DATE: {current_date}
=== GENERATION REQUIREMENTS ===
1. OPEN GRAPH (Facebook/LinkedIn):
- title: 60 chars max, include primary keyword, compelling for {target_audience}
- description: 160 chars max, include CTA and value proposition
- image: Suggest an appropriate image URL (placeholder if none available)
- type: "article"
- site_name: Use appropriate site name for {industry} industry
- url: Generate canonical URL structure
2. TWITTER CARD:
- card: "summary_large_image"
- title: 70 chars max, optimized for Twitter audience
- description: 200 chars max with relevant hashtags inline
- image: Match Open Graph image
- site: @yourwebsite (placeholder, user should update)
- creator: @author (placeholder, user should update)
3. JSON-LD SCHEMA (Article):
- @context: "https://schema.org"
- @type: "Article"
- headline: Article title (optimized)
- description: Article description (150-200 chars)
- author: {{"@type": "Person", "name": "Author Name"}} (placeholder)
- publisher: {{"@type": "Organization", "name": "Site Name", "logo": {{"@type": "ImageObject", "url": "logo-url"}}}}
- datePublished: {current_date}
- dateModified: {current_date}
- mainEntityOfPage: {{"@type": "WebPage", "@id": "canonical-url"}}
- keywords: Array of primary and semantic keywords
- wordCount: {len(blog_content.split())}
- articleSection: Primary category based on content
- inLanguage: "en-US"
Make it engaging, personalized for {target_audience}, and optimized for {industry} industry.
"""
return prompt
def _compile_metadata_response(
self,
core_metadata: Dict[str, Any],
social_metadata: Dict[str, Any],
original_title: str
) -> Dict[str, Any]:
"""Compile final metadata response"""
try:
# Extract data from AI responses
seo_title = core_metadata.get('seo_title', original_title)
meta_description = core_metadata.get('meta_description', '')
url_slug = core_metadata.get('url_slug', '')
blog_tags = core_metadata.get('blog_tags', [])
blog_categories = core_metadata.get('blog_categories', [])
social_hashtags = core_metadata.get('social_hashtags', [])
canonical_url = core_metadata.get('canonical_url', '')
reading_time = core_metadata.get('reading_time', 0)
focus_keyword = core_metadata.get('focus_keyword', '')
open_graph = social_metadata.get('open_graph', {})
twitter_card = social_metadata.get('twitter_card', {})
json_ld_schema = social_metadata.get('json_ld_schema', {})
# Compile comprehensive response
response = {
'success': True,
'title_options': [seo_title], # For backward compatibility
'meta_descriptions': [meta_description], # For backward compatibility
'seo_title': seo_title,
'meta_description': meta_description,
'url_slug': url_slug,
'blog_tags': blog_tags,
'blog_categories': blog_categories,
'social_hashtags': social_hashtags,
'canonical_url': canonical_url,
'reading_time': reading_time,
'focus_keyword': focus_keyword,
'open_graph': open_graph,
'twitter_card': twitter_card,
'json_ld_schema': json_ld_schema,
'generated_at': datetime.utcnow().isoformat(),
'metadata_summary': {
'total_metadata_types': 10,
'ai_calls_used': 2,
'optimization_score': self._calculate_optimization_score(core_metadata, social_metadata)
}
}
logger.info(f"Metadata compilation completed. Generated {len(response)} metadata fields")
return response
except Exception as e:
logger.error(f"Metadata compilation failed: {e}")
raise e
def _calculate_optimization_score(self, core_metadata: Dict[str, Any], social_metadata: Dict[str, Any]) -> int:
"""Calculate overall optimization score for the generated metadata"""
try:
score = 0
# Check core metadata completeness
if core_metadata.get('seo_title'):
score += 15
if core_metadata.get('meta_description'):
score += 15
if core_metadata.get('url_slug'):
score += 10
if core_metadata.get('blog_tags'):
score += 10
if core_metadata.get('blog_categories'):
score += 10
if core_metadata.get('social_hashtags'):
score += 10
if core_metadata.get('focus_keyword'):
score += 10
# Check social metadata completeness
if social_metadata.get('open_graph'):
score += 10
if social_metadata.get('twitter_card'):
score += 5
if social_metadata.get('json_ld_schema'):
score += 5
return min(score, 100) # Cap at 100
except Exception as e:
logger.error(f"Failed to calculate optimization score: {e}")
return 0
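    # A minimal usage sketch (hypothetical values; the two AI calls run
    # concurrently via asyncio.gather inside generate_comprehensive_metadata):
    #
    #   generator = BlogSEOMetadataGenerator()
    #   metadata = await generator.generate_comprehensive_metadata(
    #       blog_content=markdown_text,
    #       blog_title="The Best SEO Tools",
    #       research_data={"keyword_analysis": {"primary": ["seo tools"]}},
    #       user_id="user_2abc",  # hypothetical Clerk user ID
    #   )
    #   print(metadata["seo_title"], metadata["metadata_summary"]["optimization_score"])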


@@ -0,0 +1,273 @@
"""Blog SEO Recommendation Applier
Applies actionable SEO recommendations to existing blog content using the
provider-agnostic `llm_text_gen` dispatcher. Ensures GPT_PROVIDER parity.
"""
import asyncio
from typing import Any, Dict, List, Optional
from utils.logger_utils import get_service_logger
from services.llm_providers.main_text_generation import llm_text_gen
logger = get_service_logger("blog_seo_recommendation_applier")
class BlogSEORecommendationApplier:
"""Apply actionable SEO recommendations to blog content."""
def __init__(self):
logger.debug("Initialized BlogSEORecommendationApplier")
    async def apply_recommendations(self, payload: Dict[str, Any], user_id: Optional[str] = None) -> Dict[str, Any]:
"""Apply recommendations and return updated content."""
if not user_id:
raise ValueError("user_id is required for subscription checking. Please provide Clerk user ID.")
title = payload.get("title", "Untitled Blog")
sections: List[Dict[str, Any]] = payload.get("sections", [])
outline = payload.get("outline", [])
research = payload.get("research", {})
recommendations = payload.get("recommendations", [])
persona = payload.get("persona", {})
tone = payload.get("tone")
audience = payload.get("audience")
if not sections:
return {"success": False, "error": "No sections provided for recommendation application"}
if not recommendations:
logger.warning("apply_recommendations called without recommendations")
return {"success": True, "title": title, "sections": sections, "applied": []}
prompt = self._build_prompt(
title=title,
sections=sections,
outline=outline,
research=research,
recommendations=recommendations,
persona=persona,
tone=tone,
audience=audience,
)
schema = {
"type": "object",
"properties": {
"title": {"type": "string"},
"sections": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {"type": "string"},
"heading": {"type": "string"},
"content": {"type": "string"},
"notes": {"type": "array", "items": {"type": "string"}},
},
"required": ["id", "heading", "content"],
},
},
"applied_recommendations": {
"type": "array",
"items": {
"type": "object",
"properties": {
"category": {"type": "string"},
"summary": {"type": "string"},
},
},
},
},
"required": ["sections"],
}
logger.info("Applying SEO recommendations via llm_text_gen")
        # Use keyword arguments so the call stays robust to the dispatcher's
        # parameter order, matching how the other services invoke llm_text_gen
        result = await asyncio.to_thread(
            llm_text_gen,
            prompt=prompt,
            json_struct=schema,
            system_prompt=None,
            user_id=user_id,  # Pass user_id for subscription checking
        )
        if not isinstance(result, dict) or result.get("error"):
            error_msg = result.get("error", "Unknown error") if isinstance(result, dict) else "No structured response from text generator"
            logger.error(f"SEO recommendation application failed: {error_msg}")
            return {"success": False, "error": error_msg}
raw_sections = result.get("sections", []) or []
normalized_sections: List[Dict[str, Any]] = []
# Build lookup table from updated sections using their identifiers
updated_map: Dict[str, Dict[str, Any]] = {}
for updated in raw_sections:
section_id = str(
updated.get("id")
or updated.get("section_id")
or updated.get("heading")
or ""
).strip()
if not section_id:
continue
heading = (
updated.get("heading")
or updated.get("title")
or section_id
)
content_text = updated.get("content", "")
if isinstance(content_text, list):
content_text = "\n\n".join(str(p).strip() for p in content_text if p)
updated_map[section_id] = {
"id": section_id,
"heading": heading,
"content": str(content_text).strip(),
"notes": updated.get("notes", []),
}
if not updated_map and raw_sections:
logger.warning("Updated sections missing identifiers; falling back to positional mapping")
for index, original in enumerate(sections):
fallback_id = str(
original.get("id")
or original.get("section_id")
or f"section_{index + 1}"
).strip()
mapped = updated_map.get(fallback_id)
if not mapped and raw_sections:
# Fall back to positional match if identifier lookup failed
candidate = raw_sections[index] if index < len(raw_sections) else {}
heading = (
candidate.get("heading")
or candidate.get("title")
or original.get("heading")
or original.get("title")
or f"Section {index + 1}"
)
content_text = candidate.get("content") or original.get("content", "")
if isinstance(content_text, list):
content_text = "\n\n".join(str(p).strip() for p in content_text if p)
mapped = {
"id": fallback_id,
"heading": heading,
"content": str(content_text).strip(),
"notes": candidate.get("notes", []),
}
if not mapped:
# Fallback to original content if nothing else available
mapped = {
"id": fallback_id,
"heading": original.get("heading") or original.get("title") or f"Section {index + 1}",
"content": str(original.get("content", "")).strip(),
"notes": original.get("notes", []),
}
normalized_sections.append(mapped)
applied = result.get("applied_recommendations", [])
logger.info("SEO recommendations applied successfully")
return {
"success": True,
"title": result.get("title", title),
"sections": normalized_sections,
"applied": applied,
}
def _build_prompt(
self,
*,
title: str,
sections: List[Dict[str, Any]],
outline: List[Dict[str, Any]],
research: Dict[str, Any],
recommendations: List[Dict[str, Any]],
persona: Dict[str, Any],
tone: str | None,
audience: str | None,
) -> str:
"""Construct prompt for applying recommendations."""
sections_str = []
for section in sections:
sections_str.append(
f"ID: {section.get('id', 'section')}, Heading: {section.get('heading', 'Untitled')}\n"
f"Current Content:\n{section.get('content', '')}\n"
)
outline_str = "\n".join(
[
f"- {item.get('heading', 'Section')} (Target words: {item.get('target_words', 'N/A')})"
for item in outline
]
)
research_summary = research.get("keyword_analysis", {}) if research else {}
primary_keywords = ", ".join(research_summary.get("primary", [])[:10]) or "None"
recommendations_str = []
for rec in recommendations:
recommendations_str.append(
f"Category: {rec.get('category', 'General')} | Priority: {rec.get('priority', 'Medium')}\n"
f"Recommendation: {rec.get('recommendation', '')}\n"
f"Impact: {rec.get('impact', '')}\n"
)
persona_str = (
f"Persona: {persona}\n"
if persona
else "Persona: (not provided)\n"
)
style_guidance = []
if tone:
style_guidance.append(f"Desired tone: {tone}")
if audience:
style_guidance.append(f"Target audience: {audience}")
style_str = "\n".join(style_guidance) if style_guidance else "Maintain current tone and audience alignment."
prompt = f"""
You are an expert SEO content strategist. Update the blog content to apply the actionable recommendations.
Current Title: {title}
Primary Keywords (for context): {primary_keywords}
Outline Overview:
{outline_str or 'No outline supplied'}
Existing Sections:
{''.join(sections_str)}
Actionable Recommendations to Apply:
{''.join(recommendations_str)}
{persona_str}
{style_str}
Instructions:
1. Carefully apply the recommendations while preserving factual accuracy and research alignment.
2. Keep section identifiers (IDs) unchanged so the frontend can map updates correctly.
3. Improve clarity, flow, and SEO optimization per the guidance.
4. Return updated sections in the requested JSON format.
5. Provide a short summary of which recommendations were addressed.
"""
return prompt
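    # A minimal usage sketch (hypothetical payload; section IDs are preserved so
    # the frontend can map updated sections back to the originals):
    #
    #   applier = BlogSEORecommendationApplier()
    #   updated = await applier.apply_recommendations(
    #       payload={
    #           "title": "The Best SEO Tools",
    #           "sections": [{"id": "s1", "heading": "Intro", "content": "..."}],
    #           "recommendations": [{
    #               "category": "Keywords",
    #               "recommendation": "Work the primary keyword into the intro",
    #           }],
    #       },
    #       user_id="user_2abc",  # hypothetical Clerk user ID
    #   )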
__all__ = ["BlogSEORecommendationApplier"]