chore: push all remaining changes

- Blog writer enhancements and bug fixes
- Wix integration improvements
- Frontend UI updates
- GSC dashboard docs cleanup
- Image studio assets
- LinkedIn requirements file
- Various dependency updates
This commit is contained in:
ajaysi
2026-06-12 20:32:03 +05:30
parent 63a0df2536
commit d90d441019
78 changed files with 3963 additions and 2899 deletions

View File

@@ -6,7 +6,7 @@ Provider parity:
- No direct provider coupling here; Google grounding remains in research only
"""
from typing import Any, Dict
from typing import Any, Dict, List
from services.llm_providers.main_text_generation import llm_text_gen
from .source_url_manager import SourceURLManager
@@ -22,11 +22,12 @@ class EnhancedContentGenerator:
self.transitioner = TransitionGenerator()
self.flow = FlowAnalyzer()
async def generate_section(self, section: Any, research: Any, mode: str = "polished", user_id: str = None) -> Dict[str, Any]:
async def generate_section(self, section: Any, research: Any = None, mode: str = "polished", user_id: str = None, competitive_advantage: str = "") -> Dict[str, Any]:
prev_summary = self.memory.build_previous_sections_summary(limit=2)
urls = self.url_manager.pick_relevant_urls(section, research)
prompt = self._build_prompt(section, research, prev_summary, urls)
# Provider-agnostic text generation (respect GPT_PROVIDER & circuit-breaker)
research_context, section_sources = self._build_research_context(section)
urls = self.url_manager.pick_relevant_urls(section, research) if not research_context else []
global_research_context = self._build_global_research_context(research, competitive_advantage)
prompt = self._build_prompt(section, prev_summary, research_context, urls, global_research_context)
content_text: str = ""
try:
ai_resp = llm_text_gen(
@@ -40,29 +41,22 @@ class EnhancedContentGenerator:
elif isinstance(ai_resp, str):
content_text = ai_resp
else:
# Fallback best-effort extraction
content_text = str(ai_resp or "")
except Exception as e:
content_text = ""
result = {
"content": content_text,
"sources": [{"title": u.get("title", ""), "url": u.get("url", "")} for u in urls] if urls else [],
"sources": section_sources,
}
# Generate transition and compute intelligent flow metrics
previous_text = prev_summary
current_text = result.get("content", "")
transition = self.transitioner.generate_transition(previous_text, getattr(section, 'heading', 'This section'), use_llm=True)
metrics = self.flow.assess_flow(previous_text, current_text, use_llm=True)
# Update memory for subsequent sections and store continuity snapshot
if current_text:
self.memory.update_with_section(getattr(section, 'id', 'unknown'), current_text, use_llm=True)
# Return enriched result
result["transition"] = transition
result["continuity_metrics"] = metrics
# Persist a lightweight continuity snapshot for API access
try:
sid = getattr(section, 'id', 'unknown')
if not hasattr(self, "_last_continuity"):
@@ -72,22 +66,188 @@ class EnhancedContentGenerator:
pass
return result
def _build_prompt(self, section: Any, research: Any, prev_summary: str, urls: list) -> str:
def _build_research_context(self, section: Any) -> tuple:
    """Build a research context block from the section's mapped sources.

    Each entry of ``section.references`` may be a dict or an object with
    attributes; both shapes are read through the same accessor. Text fields
    are coerced to ``str`` before slicing so that a non-string value (for
    example a date object in ``published_at``) cannot raise.

    Args:
        section: Object whose ``references`` attribute lists the sources
            mapped to this section.

    Returns:
        ``(context_string, sources_list)`` where ``context_string`` is the
        formatted research context for the prompt and ``sources_list``
        holds ``{"title", "url"}`` dicts for downstream use. When
        ``section.references`` is missing or empty, returns ``("", [])``;
        the caller should treat this as a research gap.
    """
    references = getattr(section, 'references', None) or []
    if not references:
        return ("", [])

    def read_raw(ref: Any, name: str) -> Any:
        """Return field *name* from a dict or attribute-style reference."""
        if isinstance(ref, dict):
            return ref.get(name)
        return getattr(ref, name, None)

    def read_text(ref: Any, name: str) -> str:
        """Return field *name* as a string, '' when it is unset."""
        value = read_raw(ref, name)
        return "" if value is None else str(value)

    def credibility_label(score: Any) -> str:
        """Map a numeric credibility score to its header tag."""
        if score is None:
            return ""
        try:
            value = float(score)
        except (ValueError, TypeError):
            return ""
        if value >= 0.9:
            return " (high-credibility)"
        if value >= 0.75:
            return " (credible)"
        return ""

    context_parts = []
    sources_out = []
    for index, ref in enumerate(references, 1):
        title = read_text(ref, 'title')
        url = read_text(ref, 'url')
        summary = read_text(ref, 'summary')
        excerpt = read_text(ref, 'excerpt')
        content = read_text(ref, 'content')
        author = read_text(ref, 'author')
        source_type = read_text(ref, 'source_type')
        published_at = read_text(ref, 'published_at')

        sources_out.append({"title": title, "url": url})

        attribution_parts = []
        if author:
            attribution_parts.append(f"by {author}")
        if source_type:
            attribution_parts.append(f"[{source_type}]")

        header = f"Source {index}: {title}"
        if attribution_parts:
            header += " " + " ".join(attribution_parts)
        header += credibility_label(read_raw(ref, 'credibility_score'))
        if published_at:
            # Slicing already returns the whole value when it is shorter
            # than ten characters, so no length check is needed.
            header += f" (published {published_at[:10]})"

        lines = [header]
        if summary:
            lines.append(f" Summary: {summary[:1000]}")
        if excerpt:
            lines.append(f" Key excerpt: {excerpt[:1000]}")
        # Raw content is only a fallback when nothing more focused exists.
        if content and not summary and not excerpt:
            lines.append(f" Content: {content[:800]}")

        highlights = read_raw(ref, 'highlights') or []
        if not isinstance(highlights, (list, tuple)):
            # A bare string (or other scalar) is one finding, not a
            # sequence of characters.
            highlights = [highlights]
        findings = [str(item)[:500] for item in highlights[:3] if item]
        if findings:
            lines.append(" Key findings:")
            lines.extend(f" - {finding}" for finding in findings)

        context_parts.append("\n".join(lines) + "\n")

    return ("\n".join(context_parts), sources_out)
def _build_global_research_context(self, research: Any, competitive_advantage: str = "") -> str:
    """Build the global (whole-article) research context for the prompt.

    Reads ``keyword_analysis``, ``competitor_analysis`` and
    ``search_queries`` from ``research`` and appends the selected
    ``competitive_advantage``. Each non-empty group becomes a titled block;
    blocks are separated by a blank line.

    Args:
        research: Research result object (or dict) carrying the fields
            above, or ``None`` when no research is available.
        competitive_advantage: Differentiator to emphasize. It is emitted
            even when ``research`` is ``None`` because it does not depend
            on the research payload.

    Returns:
        The formatted context, or ``""`` when there is nothing to report.
    """

    def lookup(container: Any, key: str) -> Any:
        """Read *key* from a dict or attribute-style container."""
        if container is None:
            return None
        if isinstance(container, dict):
            return container.get(key)
        return getattr(container, key, None)

    def joined(values: Any, limit: int) -> str:
        """Join up to *limit* items as text; a scalar counts as one item."""
        if not values:
            return ""
        if not isinstance(values, (list, tuple)):
            values = [values]
        # Items are stringified so non-string entries cannot break join().
        return ", ".join(str(value) for value in values[:limit])

    parts = []

    keyword_analysis = lookup(research, 'keyword_analysis')
    kw_lines = []
    primary = joined(lookup(keyword_analysis, 'primary'), 10)
    if primary:
        kw_lines.append(f"Primary keywords: {primary}")
    secondary = joined(lookup(keyword_analysis, 'secondary'), 10)
    if secondary:
        kw_lines.append(f"Secondary keywords: {secondary}")
    search_intent = lookup(keyword_analysis, 'search_intent')
    if search_intent:
        kw_lines.append(f"Search intent: {search_intent}")
    if kw_lines:
        parts.append("=== KEYWORD & SEARCH STRATEGY ===\n" + "\n".join(kw_lines))

    competitor_analysis = lookup(research, 'competitor_analysis')
    ca_lines = []
    content_gaps = joined(lookup(competitor_analysis, 'content_gaps'), 5)
    if content_gaps:
        ca_lines.append(f"Content gaps (address these): {content_gaps}")
    industry_leaders = joined(lookup(competitor_analysis, 'industry_leaders'), 5)
    if industry_leaders:
        ca_lines.append(f"Industry leaders: {industry_leaders}")
    opportunities = joined(lookup(competitor_analysis, 'opportunities'), 5)
    if opportunities:
        ca_lines.append(f"Opportunities: {opportunities}")
    if ca_lines:
        parts.append("=== COMPETITIVE LANDSCAPE ===\n" + "\n".join(ca_lines))

    search_queries = joined(lookup(research, 'search_queries'), 8)
    if search_queries:
        parts.append(f"=== SEARCH INTENT SIGNALS ===\nOriginal search queries: {search_queries}")

    if competitive_advantage:
        parts.append(f"=== COMPETITIVE ADVANTAGE ===\nEmphasize this differentiator: {competitive_advantage}")

    return "\n\n".join(parts)
def _build_prompt(self, section: Any, prev_summary: str, research_context: str, urls: list, global_research_context: str = "") -> str:
    """Assemble the section-writing prompt.

    The prompt is built in this order: authoring requirements taken from
    ``section``, the optional global research context, then either the
    per-section research context or (only when that is empty) a bare list
    of reference URLs, and finally the formatting instruction.

    Args:
        section: Outline section; ``heading``, ``key_points``, ``keywords``,
            ``subheadings`` and ``target_words`` are read with defaults.
        prev_summary: Summary of previously generated sections.
        research_context: Formatted per-section research block, or "".
        urls: Fallback references (dicts with ``title``/``url`` or plain
            strings) used only when ``research_context`` is empty.
        global_research_context: Optional article-wide research block.

    Returns:
        The complete prompt text.
    """
    heading = getattr(section, 'heading', 'Section')
    # ``or []`` also covers attributes that exist but are None, matching
    # how subheadings is handled; items are stringified for join().
    key_points = [str(point) for point in (getattr(section, 'key_points', None) or [])]
    keywords = [str(keyword) for keyword in (getattr(section, 'keywords', None) or [])]
    subheadings = [str(sub) for sub in (getattr(section, 'subheadings', None) or [])]
    target_words = getattr(section, 'target_words', 300)

    prompt = (
        f"You are writing the blog section '{heading}'.\n\n"
        f"Context summary (previous sections): {prev_summary}\n\n"
        f"Authoring requirements:\n"
        f"- Target word count: ~{target_words}\n"
        f"- Use the following key points: {', '.join(key_points)}\n"
        f"- Include these keywords naturally: {', '.join(keywords)}\n"
    )

    if subheadings:
        prompt += f"- Cover these subtopics: {', '.join(subheadings)}\n"

    if global_research_context:
        prompt += f"\n{global_research_context}\n\n"

    if research_context:
        prompt += (
            f"\nResearch sources for this section (use these facts, statistics, "
            f"and insights to support your writing):\n{research_context}\n\n"
            "IMPORTANT: Base your writing on the research sources above. "
            "Use specific facts, statistics, and data from these sources. "
            "Do not invent numbers, statistics, or claims not supported by the research.\n"
        )
    elif urls:
        # Function-scope import: only the fallback path needs logging.
        import logging
        logging.getLogger('content_generator').warning(
            "No research context for section '%s' — falling back to bare URLs", heading
        )
        url_lines = []
        for entry in urls:
            if isinstance(entry, dict):
                url_lines.append(f"- {entry.get('title','')} ({entry.get('url','')})")
            else:
                url_lines.append(f"- {entry}")
        prompt += "\nReference URLs (consult for additional context):\n" + "\n".join(url_lines) + "\n"

    prompt += (
        "\nWrite engaging, well-structured markdown with clear paragraphs "
        "(2-4 sentences each) separated by double line breaks."
    )
    return prompt

View File

@@ -7,10 +7,9 @@ Uses Gemini API for intelligent analysis while minimizing API calls through cach
from typing import Dict, Optional
from loguru import logger
import hashlib
import json
# Import the common gemini provider
from services.llm_providers.gemini_provider import gemini_structured_json_response
# Provider-agnostic LLM dispatcher (respects GPT_PROVIDER env var)
from services.llm_providers.main_text_generation import llm_text_gen
class FlowAnalyzer:
@@ -21,7 +20,7 @@ class FlowAnalyzer:
self._rule_cache: Dict[str, Dict[str, float]] = {}
logger.info("✅ FlowAnalyzer initialized with LLM-based analysis")
def assess_flow(self, previous_text: str, current_text: str, use_llm: bool = True) -> Dict[str, float]:
def assess_flow(self, previous_text: str, current_text: str, use_llm: bool = True, user_id: str = None) -> Dict[str, float]:
"""
Return flow metrics in range 0..1.
@@ -29,6 +28,7 @@ class FlowAnalyzer:
previous_text: Previous section content
current_text: Current section content
use_llm: Whether to use LLM analysis (default: True for significant content)
user_id: Clerk user ID for subscription checking
"""
if not current_text:
return {"flow": 0.0, "consistency": 0.0, "progression": 0.0}
@@ -46,7 +46,7 @@ class FlowAnalyzer:
if should_use_llm:
try:
metrics = self._llm_flow_analysis(previous_text, current_text)
metrics = self._llm_flow_analysis(previous_text, current_text, user_id=user_id)
self._cache[cache_key] = metrics
logger.info("LLM-based flow analysis completed")
return metrics
@@ -71,8 +71,8 @@ class FlowAnalyzer:
# Use LLM if: substantial content (>100 words) OR has meaningful previous context
return word_count > 100 or has_previous
def _llm_flow_analysis(self, previous_text: str, current_text: str) -> Dict[str, float]:
"""Use Gemini API for intelligent flow analysis."""
def _llm_flow_analysis(self, previous_text: str, current_text: str, user_id: str = None) -> Dict[str, float]:
"""Use LLM for intelligent flow analysis (provider-agnostic)."""
# Truncate content to minimize tokens while keeping context
prev_truncated = (previous_text[-300:] if previous_text else "") if previous_text else ""
@@ -103,22 +103,20 @@ Return ONLY a JSON object with these exact keys: flow, consistency, progression
}
try:
result = gemini_structured_json_response(
result = llm_text_gen(
prompt=prompt,
schema=schema,
temperature=0.2, # Low temperature for consistent scoring
max_tokens=1000 # Increased tokens for better analysis
json_struct=schema,
system_prompt=None,
user_id=user_id,
temperature=0.2,
max_tokens=1000
)
if result.parsed:
return {
"flow": float(result.parsed.get("flow", 0.6)),
"consistency": float(result.parsed.get("consistency", 0.6)),
"progression": float(result.parsed.get("progression", 0.6))
}
else:
logger.warning("LLM response parsing failed, using fallback")
return self._rule_based_analysis(previous_text, current_text)
return {
"flow": float(result.get("flow", 0.6)),
"consistency": float(result.get("consistency", 0.6)),
"progression": float(result.get("progression", 0.6))
}
except Exception as e:
logger.error(f"LLM flow analysis error: {e}")

View File

@@ -28,18 +28,17 @@ class IntroductionGenerator:
) -> str:
"""Build a prompt for generating blog introductions."""
# Extract key research insights
keyword_analysis = research.keyword_analysis or {}
content_angles = research.suggested_angles or []
competitor_analysis = research.competitor_analysis or {}
search_queries = research.search_queries or []
# Get a summary of the first few sections for context
section_summaries = []
for i, section in enumerate(outline[:3], 1):
section_id = section.id
content = sections_content.get(section_id, '')
if content:
# Take first 200 chars as summary
summary = content[:200] + '...' if len(content) > 200 else content
summary = content[:300] + '...' if len(content) > 300 else content
section_summaries.append(f"{i}. {section.heading}: {summary}")
sections_text = '\n'.join(section_summaries) if section_summaries else "Content sections are being generated."
@@ -47,13 +46,56 @@ class IntroductionGenerator:
primary_kw_text = ', '.join(primary_keywords) if primary_keywords else "the topic"
content_angle_text = ', '.join(content_angles[:3]) if content_angles else "General insights"
return f"""Generate exactly 3 varied blog introductions for the following blog post.
# Build keyword strategy block from actual keyword_analysis
keyword_block = ""
all_keywords = []
if keyword_analysis:
primary_kw = keyword_analysis.get('primary', [])
secondary_kw = keyword_analysis.get('secondary', [])
if primary_kw:
all_keywords.extend(primary_kw[:5])
if secondary_kw:
all_keywords.extend(secondary_kw[:5])
si = keyword_analysis.get('search_intent', '')
if si:
keyword_block += f"\nSearch intent: {si}"
if all_keywords:
keyword_block = f"Target keywords: {', '.join(all_keywords)}" + keyword_block
# Build competitive landscape block
competitive_block = ""
if competitor_analysis:
gaps = competitor_analysis.get('content_gaps', [])
leaders = competitor_analysis.get('industry_leaders', [])
opportunities = competitor_analysis.get('opportunities', [])
advantages = competitor_analysis.get('competitive_advantages', [])
comp_lines = []
if advantages:
comp_lines.append(f"Key differentiators: {', '.join(advantages[:3])}")
if gaps:
comp_lines.append(f"Content gaps to address: {', '.join(gaps[:3])}")
if leaders:
comp_lines.append(f"Industry leaders: {', '.join(leaders[:3])}")
if opportunities:
comp_lines.append(f"Opportunities: {', '.join(opportunities[:3])}")
if comp_lines:
competitive_block = "\n".join(comp_lines)
# Build search intent context
search_block = ""
if search_queries:
search_block = f"Original search queries: {', '.join(search_queries[:5])}"
prompt = f"""Generate exactly 3 varied blog introductions for the following blog post.
BLOG TITLE: {blog_title}
PRIMARY KEYWORDS: {primary_kw_text}
SEARCH INTENT: {search_intent}
CONTENT ANGLES: {content_angle_text}
{keyword_block}
{f"COMPETITIVE LANDSCAPE:\n{competitive_block}" if competitive_block else ""}
{f"SEARCH CONTEXT:\n{search_block}" if search_block else ""}
BLOG CONTENT SUMMARY:
{sections_text}
@@ -69,6 +111,7 @@ REQUIREMENTS FOR EACH INTRODUCTION:
3. Third: Story/statistic-focused (use a compelling fact or narrative hook)
- Maintain a professional yet engaging tone
- Avoid generic phrases - be specific and benefit-driven
- Where possible, incorporate specific insights from the competitive landscape and search intent above
Return ONLY a JSON array of exactly 3 introductions:
[
@@ -76,6 +119,7 @@ Return ONLY a JSON array of exactly 3 introductions:
"Second introduction (80-120 words, benefit-focused)",
"Third introduction (80-120 words, story/statistic-focused)"
]"""
return prompt
def get_introduction_schema(self) -> Dict[str, Any]:
"""Get the JSON schema for introduction generation."""

View File

@@ -129,9 +129,9 @@ class BlogWriterService:
"""Enhance a section using AI."""
return await self.outline_service.enhance_section_with_ai(section, focus)
async def optimize_outline_with_ai(self, outline: List[BlogOutlineSection], focus: str = "general optimization") -> List[BlogOutlineSection]:
async def optimize_outline_with_ai(self, outline: List[BlogOutlineSection], focus: str = "general optimization", research_context: str = "") -> List[BlogOutlineSection]:
"""Optimize entire outline for better flow and SEO."""
return await self.outline_service.optimize_outline_with_ai(outline, focus)
return await self.outline_service.optimize_outline_with_ai(outline, focus, research_context=research_context)
def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
"""Rebalance word count distribution across sections."""
@@ -140,14 +140,15 @@ class BlogWriterService:
# Content Generation Methods
async def generate_section(self, request: BlogSectionRequest, user_id: str = None) -> BlogSectionResponse:
"""Generate section content from outline."""
# Compose research-lite object with minimal continuity summary if available
research_ctx: Any = getattr(request, 'research', None)
research_ctx = request.research
competitive_advantage = request.competitive_advantage
try:
ai_result = await self.content_generator.generate_section(
section=request.section,
research=research_ctx,
mode=(request.mode or "polished"),
user_id=user_id
user_id=user_id,
competitive_advantage=competitive_advantage,
)
markdown = ai_result.get('content') or ai_result.get('markdown') or ''
citations = []
@@ -339,8 +340,19 @@ class BlogWriterService:
)
async def publish(self, request: BlogPublishRequest) -> BlogPublishResponse:
"""Publish content to specified platform."""
# TODO: Move to content module
"""Publish content to specified platform.
NOTE: This endpoint is a STUB / placeholder. The actual publish flow
bypasses this method entirely — the frontend calls platform-specific
endpoints directly:
- Wix: POST /api/wix/publish (wix_routes.py)
- WordPress: POST /api/wordpress/publish (routers/wordpress.py)
TODO: Either remove this stub or wire it as a unified dispatcher that
routes to the correct platform service. Keep alive until the new
unified publish flow (pre-publish checklist + schedule + history) is
built and this becomes the single entry point for all publishing.
"""
return BlogPublishResponse(success=True, platform=request.platform, url="https://example.com/post")
async def generate_medium_blog_with_progress(self, req: MediumBlogGenerateRequest, task_id: str, user_id: str, db: Session = None) -> MediumBlogGenerateResult:
@@ -359,9 +371,11 @@ class BlogWriterService:
async def analyze_flow_basic(self, request: Dict[str, Any]) -> Dict[str, Any]:
"""Analyze flow metrics for entire blog using single AI call (cost-effective)."""
try:
import asyncio
# Extract blog content from request
sections = request.get("sections", [])
title = request.get("title", "Untitled Blog")
user_id = request.get("user_id")
if not sections:
return {"error": "No sections provided for analysis"}
@@ -397,8 +411,7 @@ class BlogWriterService:
Provide detailed analysis with specific, actionable suggestions for improvement.
"""
# Use Gemini for structured analysis
from services.llm_providers.gemini_provider import gemini_structured_json_response
from services.llm_providers.main_text_generation import llm_text_gen
schema = {
"type": "object",
@@ -440,12 +453,17 @@ class BlogWriterService:
"required": ["overall_flow_score", "overall_consistency_score", "overall_progression_score", "overall_coherence_score", "sections", "overall_suggestions"]
}
result = gemini_structured_json_response(
prompt=analysis_prompt,
schema=schema,
temperature=0.3,
max_tokens=4096,
system_prompt=system_prompt
result = await asyncio.to_thread(
llm_text_gen,
analysis_prompt,
system_prompt,
schema,
user_id,
None, # preferred_hf_models
None, # preferred_provider
None, # flow_type
4096, # max_tokens
0.3 # temperature
)
if result and not result.get("error"):
@@ -466,6 +484,7 @@ class BlogWriterService:
# Use the existing enhanced content generator for detailed analysis
sections = request.get("sections", [])
title = request.get("title", "Untitled Blog")
user_id = request.get("user_id")
if not sections:
return {"error": "No sections provided for analysis"}
@@ -485,7 +504,8 @@ class BlogWriterService:
flow_metrics = self.content_generator.flow.assess_flow(
prev_section_content,
section_content,
use_llm=True
use_llm=True,
user_id=user_id
)
results.append({

View File

@@ -241,9 +241,23 @@ class GroundingContextEngine:
else:
authority_distribution['low'] += 1
# Extract actual high-authority sources from chunks
high_authority_sources = []
for chunk in grounding_metadata.grounding_chunks:
chunk_authority = self._calculate_chunk_authority(chunk)
if chunk_authority >= 0.8:
high_authority_sources.append({
'title': chunk.title if chunk.title else 'Unknown Source',
'url': chunk.url if chunk.url else '',
'score': round(chunk_authority, 3)
})
# Sort by authority score descending, keep top 5
high_authority_sources.sort(key=lambda x: x['score'], reverse=True)
high_authority_sources = high_authority_sources[:5]
return {
'average_authority_score': sum(authority_scores) / len(authority_scores) if authority_scores else 0.0,
'high_authority_sources': [{'title': 'High Authority Source', 'url': 'example.com', 'score': 0.9}], # Placeholder
'high_authority_sources': high_authority_sources,
'authority_distribution': dict(authority_distribution)
}

View File

@@ -52,6 +52,44 @@ class OutlineGenerator:
raw_analysis = research.keyword_analysis if research else {}
return self.keyword_curator.curate(raw_analysis)
def _build_optimization_context(self, research) -> str:
    """Build a compact research context for the outline optimizer.

    Emits, one per line and only when present: primary keywords, search
    intent, top competitors, content gaps, opportunities, and up to five
    research sources ordered by credibility (highest first) with a short
    summary or excerpt.

    Args:
        research: Research result exposing ``keyword_analysis``,
            ``competitor_analysis`` and ``sources``; falsy means no
            research is available.

    Returns:
        The newline-joined context, or ``""`` when there is no research.
    """
    if not research:
        return ""

    def lookup(container, key):
        """Read *key* from a dict or attribute-style container."""
        if container is None:
            return None
        if isinstance(container, dict):
            return container.get(key)
        return getattr(container, key, None)

    def credibility(source) -> float:
        """Sort key: the source's score, 0.8 only when it is unset."""
        raw = lookup(source, 'credibility_score')
        if raw is None:
            return 0.8
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.8

    parts = []
    kw = lookup(research, 'keyword_analysis') or {}
    comp = lookup(research, 'competitor_analysis') or {}

    primary = lookup(kw, 'primary') or []
    if primary:
        parts.append(f"Primary keywords: {', '.join(str(k) for k in primary[:5])}")
    search_intent = lookup(kw, 'search_intent')
    if search_intent:
        parts.append(f"Search intent: {search_intent}")

    top_competitors = lookup(comp, 'top_competitors') or []
    if top_competitors:
        parts.append(f"Top competitors: {', '.join(str(c) for c in top_competitors[:5])}")

    # Content gaps live under competitor_analysis in the other prompt
    # builders of this change set; the keyword_analysis lookup is kept as
    # a fallback so payloads that relied on it still work.
    content_gaps = lookup(comp, 'content_gaps') or lookup(kw, 'content_gaps') or []
    if content_gaps:
        parts.append(f"Content gaps: {'; '.join(str(g) for g in content_gaps[:5])}")

    opportunities = lookup(comp, 'opportunities') or []
    if opportunities:
        parts.append(f"Opportunities: {'; '.join(str(o) for o in opportunities[:5])}")

    sources = lookup(research, 'sources') or []
    if sources:
        source_lines = []
        for source in sorted(sources, key=credibility, reverse=True)[:5]:
            line = f"- {lookup(source, 'title')}"
            summary = lookup(source, 'summary')
            excerpt = lookup(source, 'excerpt')
            if summary:
                line += f": {str(summary)[:150]}"
            elif excerpt:
                line += f": {str(excerpt)[:150]}"
            source_lines.append(line)
        parts.append("Key research sources:\n" + "\n".join(source_lines))

    return "\n".join(parts)
async def generate(self, request: BlogOutlineRequest, user_id: str) -> BlogOutlineResponse:
"""
Generate AI-powered outline using research results.
@@ -102,7 +140,7 @@ class OutlineGenerator:
# Run parallel processing for speed optimization (user_id required)
mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing_async(
outline_sections, research, user_id
outline_sections, research, user_id, competitive_advantage=selected_competitive_advantage or ""
)
# Enhance sections with grounding insights
@@ -113,7 +151,8 @@ class OutlineGenerator:
# Optimize outline for better flow, SEO, and engagement (user_id required)
logger.info("Optimizing outline for better flow and engagement...")
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)
optimization_context = self._build_optimization_context(research)
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id, research_context=optimization_context)
# Rebalance word counts for optimal distribution
target_words = request.word_count or 1500
@@ -202,7 +241,7 @@ class OutlineGenerator:
# Run parallel processing for speed optimization (user_id required for subscription checks)
mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing(
outline_sections, research, user_id, task_id
outline_sections, research, user_id, task_id, competitive_advantage=selected_competitive_advantage or ""
)
# Enhance sections with grounding insights (depends on both previous tasks)
@@ -213,7 +252,8 @@ class OutlineGenerator:
# Optimize outline for better flow, SEO, and engagement (user_id required for subscription checks)
await task_manager.update_progress(task_id, "🎯 Optimizing outline for better flow and engagement...")
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)
optimization_context = self._build_optimization_context(research)
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id, research_context=optimization_context)
# Rebalance word counts for optimal distribution
await task_manager.update_progress(task_id, "⚖️ Rebalancing word count distribution...")

View File

@@ -4,7 +4,7 @@ Outline Optimizer - AI-powered outline optimization and rebalancing.
Optimizes outlines for better flow, SEO, and engagement.
"""
from typing import List
from typing import List, Dict, Any, Optional
from loguru import logger
from models.blog_models import BlogOutlineSection
@@ -13,13 +13,14 @@ from models.blog_models import BlogOutlineSection
class OutlineOptimizer:
"""Optimizes outlines for better flow, SEO, and engagement."""
async def optimize(self, outline: List[BlogOutlineSection], focus: str, user_id: str) -> List[BlogOutlineSection]:
async def optimize(self, outline: List[BlogOutlineSection], focus: str, user_id: str, research_context: str = "") -> List[BlogOutlineSection]:
"""Optimize entire outline for better flow, SEO, and engagement.
Args:
outline: List of outline sections to optimize
focus: Optimization focus (e.g., "general optimization")
user_id: User ID (required for subscription checks and usage tracking)
research_context: Optional research context to ground optimization
Returns:
List of optimized outline sections
@@ -40,19 +41,28 @@ Current Outline:
Optimization Focus: {focus}
Goals: Improve narrative flow, enhance SEO, increase engagement, ensure comprehensive coverage.
"""
if research_context:
optimization_prompt += f"""
Research Context (use this to ground your optimization in real data):
{research_context}
Ensure the optimized outline reflects the research insights above — headings should address the key topics, keywords should align with search intent, and sections should cover the most important angles from the research.
"""
optimization_prompt += """
Return JSON format:
{{
{
"outline": [
{{
{
"heading": "Optimized heading",
"subheadings": ["subheading 1", "subheading 2"],
"key_points": ["point 1", "point 2"],
"target_words": 300,
"keywords": ["keyword1", "keyword2"]
}}
}
]
}}"""
}"""
try:
from services.llm_providers.main_text_generation import llm_text_gen
@@ -112,26 +122,34 @@ Return JSON format:
return outline
def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
    """Rebalance word count distribution across sections, weighting by source count.

    With more than two sections, the first and last are treated as
    introduction and conclusion and each receives 12% of ``target_words``;
    the rest is split across the middle sections. With one or two sections
    there is no separate intro/conclusion, so the whole budget is split
    across all of them.

    Each weighted section gets weight ``1.0 + 0.5 * len(references)``, so
    sections with more mapped sources get more words. Words lost to
    integer truncation are handed out one at a time from the first
    weighted section onward, so the assigned counts sum to
    ``target_words``.

    Args:
        outline: Sections to update; ``target_words`` is set on each in place.
        target_words: Total word budget for the whole outline.

    Returns:
        The same ``outline`` list, with updated ``target_words``.
    """
    total_sections = len(outline)
    if total_sections == 0:
        return outline

    has_bookends = total_sections > 2
    if has_bookends:
        intro_words = int(target_words * 0.12)
        conclusion_words = int(target_words * 0.12)
        main_sections = outline[1:-1]
    else:
        # No dedicated intro/conclusion: do not reserve a budget that
        # would never be assigned to any section.
        intro_words = 0
        conclusion_words = 0
        main_sections = outline
    main_content_words = target_words - intro_words - conclusion_words

    # Weight sections by research density (more sources -> more words).
    weights = [
        1.0 + len(getattr(section, 'references', None) or []) * 0.5
        for section in main_sections
    ]
    total_weight = sum(weights)
    shares = [int(main_content_words * weight / total_weight) for weight in weights]

    # Hand the truncation remainder back so no words are silently dropped.
    leftover = main_content_words - sum(shares)
    for position in range(leftover):
        shares[position % len(shares)] += 1

    if has_bookends:
        outline[0].target_words = intro_words
        outline[-1].target_words = conclusion_words
    for section, words in zip(main_sections, shares):
        section.target_words = words

    return outline

View File

@@ -233,9 +233,9 @@ class OutlineService:
"""Enhance a section using AI with research context."""
return await self.section_enhancer.enhance(section, focus)
async def optimize_outline_with_ai(self, outline: List[BlogOutlineSection], focus: str = "general optimization", research_context: str = "", user_id: str = None) -> List[BlogOutlineSection]:
    """Optimize entire outline for better flow, SEO, and engagement.

    Delegates to ``self.outline_optimizer.optimize``. That method declares
    ``user_id`` as a required positional parameter, so it is always
    forwarded here; omitting it made every call fail with ``TypeError``
    before the optimizer ran.

    Args:
        outline: Sections to optimize.
        focus: Optimization focus passed through to the optimizer.
        research_context: Optional research context to ground the optimization.
        user_id: ID of the requesting user, forwarded to the optimizer.
            Defaults to ``None`` so existing callers keep working.
            NOTE(review): the optimizer documents user_id as required for
            subscription checks — confirm how it treats ``None``.

    Returns:
        Whatever the optimizer returns (the optimized outline sections).
    """
    return await self.outline_optimizer.optimize(
        outline, focus, user_id, research_context=research_context
    )
def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
"""Rebalance word count distribution across sections."""

View File

@@ -17,7 +17,7 @@ class ParallelProcessor:
self.source_mapper = source_mapper
self.grounding_engine = grounding_engine
async def run_parallel_processing(self, outline_sections, research, user_id: str, task_id: str = None) -> Tuple[Any, Any]:
async def run_parallel_processing(self, outline_sections, research, user_id: str, task_id: str = None, competitive_advantage: str = "") -> Tuple[Any, Any]:
"""
Run source mapping and grounding insights extraction in parallel.
@@ -26,6 +26,7 @@ class ParallelProcessor:
research: Research data object
user_id: User ID (required for subscription checks and usage tracking)
task_id: Optional task ID for progress updates
competitive_advantage: Selected competitive advantage for preferential source matching
Returns:
Tuple of (mapped_sections, grounding_insights)
@@ -44,7 +45,7 @@ class ParallelProcessor:
# Run these tasks in parallel to save time
source_mapping_task = asyncio.create_task(
self._run_source_mapping(outline_sections, research, task_id, user_id)
self._run_source_mapping(outline_sections, research, task_id, user_id, competitive_advantage)
)
grounding_insights_task = asyncio.create_task(
@@ -59,7 +60,7 @@ class ParallelProcessor:
return mapped_sections, grounding_insights
async def run_parallel_processing_async(self, outline_sections, research, user_id: str) -> Tuple[Any, Any]:
async def run_parallel_processing_async(self, outline_sections, research, user_id: str, competitive_advantage: str = "") -> Tuple[Any, Any]:
"""
Run parallel processing without progress updates (for non-progress methods).
@@ -67,6 +68,7 @@ class ParallelProcessor:
outline_sections: List of outline sections to process
research: Research data object
user_id: User ID (required for subscription checks and usage tracking)
competitive_advantage: Selected competitive advantage for preferential source matching
Returns:
Tuple of (mapped_sections, grounding_insights)
@@ -81,7 +83,7 @@ class ParallelProcessor:
# Run these tasks in parallel to save time
source_mapping_task = asyncio.create_task(
self._run_source_mapping_async(outline_sections, research, user_id)
self._run_source_mapping_async(outline_sections, research, user_id, competitive_advantage)
)
grounding_insights_task = asyncio.create_task(
@@ -96,12 +98,12 @@ class ParallelProcessor:
return mapped_sections, grounding_insights
async def _run_source_mapping(self, outline_sections, research, task_id, user_id: str):
async def _run_source_mapping(self, outline_sections, research, task_id, user_id: str, competitive_advantage: str = ""):
"""Run source mapping in parallel."""
if task_id:
from api.blog_writer.task_manager import task_manager
await task_manager.update_progress(task_id, "🔗 Applying intelligent source-to-section mapping...")
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id)
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id, competitive_advantage=competitive_advantage)
async def _run_grounding_insights_extraction(self, research, task_id):
"""Run grounding insights extraction in parallel."""
@@ -110,10 +112,10 @@ class ParallelProcessor:
await task_manager.update_progress(task_id, "🧠 Extracting grounding metadata insights...")
return self.grounding_engine.extract_contextual_insights(research.grounding_metadata)
async def _run_source_mapping_async(self, outline_sections, research, user_id: str):
async def _run_source_mapping_async(self, outline_sections, research, user_id: str, competitive_advantage: str = ""):
"""Run source mapping in parallel (async version without progress updates)."""
logger.info("Applying intelligent source-to-section mapping...")
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id)
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id, competitive_advantage=competitive_advantage)
async def _run_grounding_insights_extraction_async(self, research):
"""Run grounding insights extraction in parallel (async version without progress updates)."""

View File

@@ -37,27 +37,60 @@ class PromptBuilder:
opportunity_text = ', '.join(research.competitor_analysis.get('opportunities', [])) if research and research.competitor_analysis else "Not available"
advantages_text = ', '.join(research.competitor_analysis.get('competitive_advantages', [])) if research and research.competitor_analysis else "Not available"
competitor_headings_text = ', '.join(research.competitor_analysis.get('competitor_headings', [])[:3]) if research and research.competitor_analysis and research.competitor_analysis.get('competitor_headings') else ""
content_gaps_text = ', '.join(research.competitor_analysis.get('content_gaps', [])) if research and research.competitor_analysis and research.competitor_analysis.get('content_gaps') else ""
industry_leaders_text = ', '.join(research.competitor_analysis.get('industry_leaders', [])) if research and research.competitor_analysis and research.competitor_analysis.get('industry_leaders') else ""
# Extract additional UI-mapped context fields
analysis_insights_text = (research.keyword_analysis.get('analysis_insights', '') or '') if research and research.keyword_analysis else ''
market_positioning_text = (research.competitor_analysis.get('market_positioning', '') or '') if research and research.competitor_analysis else ''
difficulty_score = research.keyword_analysis.get('difficulty', None) if research and research.keyword_analysis else None
# Extract search queries as intent signals
search_queries_text = ', '.join(research.search_queries) if research and hasattr(research, 'search_queries') and research.search_queries else ""
# Extract top 3 authoritative source excerpts as factual data points
# Build numbered source list — all sources with index, title, excerpt, and highlights
# The LLM will reference these indices when assigning sources to sections
source_list_text = ""
if sources:
source_lines = []
for i, src in enumerate(sources, 1):
title = getattr(src, 'title', '') or ''
excerpt = getattr(src, 'excerpt', '') or ''
highlights = getattr(src, 'highlights', []) or []
summary = getattr(src, 'summary', '') or ''
source_type = getattr(src, 'source_type', '') or ''
author = getattr(src, 'author', '') or ''
line = f" [{i}] {title}"
if source_type:
line += f" [{source_type}]"
if author:
line += f" by {author}"
if summary:
line += f"{summary[:1000]}"
elif excerpt:
line += f"{excerpt[:1000]}"
if highlights:
line += f" | Key findings: {'; '.join(h[:250] for h in highlights[:3])}"
source_lines.append(line)
if source_lines:
source_list_text = "RESEARCH SOURCES (numbered for reference):\n" + "\n".join(source_lines)
# Top factual excerpts for depth (keep as supplement)
source_excerpts_text = ""
if sources:
sorted_sources = sorted(
[s for s in sources if (s.excerpt or s.summary)],
key=lambda s: s.credibility_score or 0.8, reverse=True
)[:3]
)[:5]
excerpts = []
for i, src in enumerate(sorted_sources, 1):
excerpt = src.excerpt or src.summary or ""
if len(excerpt) > 300:
excerpt = excerpt[:297] + "..."
if len(excerpt) > 500:
excerpt = excerpt[:497] + "..."
excerpts.append(f" {i}. \"{src.title}\"{excerpt}")
if excerpts:
source_excerpts_text = "FACTUAL DATA POINTS FROM RESEARCH:\n" + "\n".join(excerpts)
source_excerpts_text = "DETAILED FACTS FROM TOP SOURCES:\n" + "\n".join(excerpts)
# Extract recency: newest source publication date
newest_date_str = ""
@@ -76,12 +109,12 @@ class PromptBuilder:
grounding_evidence_text = ""
if research and research.grounding_metadata and research.grounding_metadata.grounding_supports:
supports = research.grounding_metadata.grounding_supports
top_supports = [s for s in supports if s.segment_text and len(s.segment_text) > 20][:3]
top_supports = [s for s in supports if s.segment_text and len(s.segment_text) > 20][:5]
if top_supports:
evidence_parts = []
for i, s in enumerate(top_supports, 1):
text = s.segment_text[:250]
if len(s.segment_text) > 250:
text = s.segment_text[:400]
if len(s.segment_text) > 400:
text += "..."
evidence_parts.append(f" {i}. {text}")
grounding_evidence_text = "VERIFIED EVIDENCE (high-confidence snippets):\n" + "\n".join(evidence_parts)
@@ -151,8 +184,11 @@ Market Opportunities: {opportunity_text}
Competitive Advantages: {advantages_text}
{f"Market Positioning: {market_positioning_text}" if market_positioning_text else ""}
{f"Competitor Headings (AVOID duplicating): {competitor_headings_text}" if competitor_headings_text else ""}
{f"Content Gaps (MUST address these gaps): {content_gaps_text}" if content_gaps_text else ""}
{f"Industry Leaders: {industry_leaders_text}" if industry_leaders_text else ""}
{f"Search Intent Signals: {search_queries_text}" if search_queries_text else ""}
RESEARCH SOURCES: {len(sources)} authoritative sources available
{source_list_text}
{newest_date_str}
{source_excerpts_text}
@@ -168,8 +204,9 @@ STRATEGIC REQUIREMENTS:
- Create SEO-optimized headings with natural keyword integration
- Surface the strongest research-backed angles within the outline
- Build logical narrative flow from problem to solution
- Include data-driven insights from research sources
- Address content gaps and market opportunities
- Include data-driven insights from research sources — use the numbered sources above
- For each section, assign the most relevant source indices using the [N] numbers above
- Address content gaps and market opportunities — if content gaps are listed, dedicate sections to fill those gaps
- Optimize for search intent and user questions
- Ensure engaging, actionable content throughout
@@ -186,7 +223,8 @@ Return JSON format:
"subheadings": ["Subheading 1", "Subheading 2", "Subheading 3"],
"key_points": ["Key point 1", "Key point 2", "Key point 3"],
"target_words": 300,
"keywords": ["keyword 1", "keyword 2"]
"keywords": ["keyword 1", "keyword 2"],
"source_indices": [1, 3, 5]
}}
]
}}"""
@@ -220,9 +258,14 @@ Return JSON format:
"keywords": {
"type": "array",
"items": {"type": "string"}
},
"source_indices": {
"type": "array",
"items": {"type": "integer"},
"description": "Indices of research sources (from the numbered list above) that support this section"
}
},
"required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
"required": ["heading", "subheadings", "key_points", "target_words", "keywords", "source_indices"]
}
}
},

View File

@@ -100,18 +100,37 @@ class ResponseProcessor:
raise ValueError(f"AI outline generation failed: {error_str}")
def convert_to_sections(self, outline_data: Dict[str, Any], sources: List) -> List[BlogOutlineSection]:
"""Convert outline data to BlogOutlineSection objects."""
"""Convert outline data to BlogOutlineSection objects.
If the LLM assigned source_indices to sections, populate references
directly from those indices. Indices are 1-based (matching the [N]
labels in the prompt) — converted to 0-based for list access.
Sections without source_indices will be populated by the algorithmic
source mapper in a later step.
"""
outline_sections = []
for i, section_data in enumerate(outline_data.get('outline', [])):
if not isinstance(section_data, dict) or 'heading' not in section_data:
continue
# Parse LLM-assigned source indices (1-based)
raw_indices = section_data.get('source_indices', [])
section_refs = []
if raw_indices and sources:
for idx in raw_indices:
try:
source_idx = int(idx) - 1 # Convert 1-based → 0-based
if 0 <= source_idx < len(sources):
section_refs.append(sources[source_idx])
except (ValueError, TypeError):
pass
section = BlogOutlineSection(
id=f"s{i+1}",
heading=section_data.get('heading', f'Section {i+1}'),
subheadings=section_data.get('subheadings', []),
key_points=section_data.get('key_points', []),
references=[], # Will be populated by intelligent mapping
references=section_refs, # LLM-assigned if provided, else []
target_words=section_data.get('target_words', 200),
keywords=section_data.get('keywords', [])
)

View File

@@ -41,10 +41,33 @@ class SourceToSectionMapper:
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
'how', 'what', 'when', 'where', 'why', 'who', 'which', 'how', 'much', 'many', 'more', 'most',
'how', 'what', 'when', 'where', 'why', 'who', 'which', 'much', 'many', 'more', 'most',
'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no', 'not', 'only', 'own',
'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'up', 'down', 'out', 'off',
'over', 'under', 'again', 'further', 'then', 'once'
'over', 'under', 'again', 'further', 'then', 'once', 'also', 'into', 'about', 'between',
'through', 'during', 'before', 'after', 'above', 'below', 'from', 'since', 'until', 'while',
'because', 'however', 'therefore', 'thus', 'hence', 'yet', 'still', 'already', 'even'
}
# Common abbreviation/synonym pairs for fuzzy matching
self._synonym_map = {
'ai': ['artificial intelligence', 'machine intelligence'],
'ml': ['machine learning'],
'dl': ['deep learning'],
'nlp': ['natural language processing'],
'iot': ['internet of things'],
'saas': ['software as a service'],
'b2b': ['business to business'],
'b2c': ['business to consumer'],
'cx': ['customer experience'],
'ux': ['user experience'],
'roi': ['return on investment'],
'kpi': ['key performance indicator'],
'crm': ['customer relationship management'],
'erp': ['enterprise resource planning'],
'seo': ['search engine optimization'],
'cto': ['chief technology officer'],
'vp': ['vice president'],
}
logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")
@@ -53,15 +76,21 @@ class SourceToSectionMapper:
self,
sections: List[BlogOutlineSection],
research_data: BlogResearchResponse,
user_id: str
user_id: str,
competitive_advantage: str = ""
) -> List[BlogOutlineSection]:
"""
Map research sources to outline sections using intelligent algorithms.
Sections that already have LLM-assigned references (from source_indices
in the outline prompt) are preserved. Algorithmic mapping fills gaps
for sections without LLM-assigned sources.
Args:
sections: List of outline sections to map sources to
research_data: Research data containing sources and metadata
user_id: User ID (required for subscription checks and usage tracking)
competitive_advantage: Selected competitive advantage to preferentially match
Returns:
List of outline sections with intelligently mapped sources
@@ -76,16 +105,39 @@ class SourceToSectionMapper:
logger.warning("No sections or sources to map")
return sections
logger.info(f"Mapping {len(research_data.sources)} sources to {len(sections)} sections")
# Separate sections with LLM-assigned references from those without
sections_with_refs = [s for s in sections if s.references]
sections_without_refs = [s for s in sections if not s.references]
# Step 1: Algorithmic mapping
mapping_results = self._algorithmic_source_mapping(sections, research_data)
logger.info(
f"Mapping {len(research_data.sources)} sources to {len(sections)} sections "
f"({len(sections_with_refs)} with LLM-assigned references, "
f"{len(sections_without_refs)} need algorithmic mapping)"
)
# Step 2: AI validation and improvement (single prompt, user_id required for subscription checks)
validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)
if sections_without_refs:
# Step 1: Algorithmic mapping for sections without LLM-assigned references
mapping_results = self._algorithmic_source_mapping(sections_without_refs, research_data, competitive_advantage)
# Step 2: AI validation and improvement
validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)
# Step 3: Apply mapping only to sections that need it
mapped_sections_with = self._apply_mapping_to_sections(sections_without_refs, validated_mapping)
else:
mapped_sections_with = []
# Step 3: Apply validated mapping to sections
mapped_sections = self._apply_mapping_to_sections(sections, validated_mapping)
# Combine: keep LLM-assigned sections as-is, add algorithmically mapped ones
mapped_sections = list(sections_with_refs) + mapped_sections_with
# Preserve original ordering
original_ids = [s.id for s in sections]
mapped_sections.sort(key=lambda s: original_ids.index(s.id) if s.id in original_ids else 999)
# Warn if any section still has zero references
for s in mapped_sections:
if not s.references:
logger.warning(f"Section '{s.heading}' (id={s.id}) has ZERO sources — content generator will use keyword-based fallback")
logger.info("✅ Source-to-section mapping completed successfully")
return mapped_sections
@@ -93,7 +145,8 @@ class SourceToSectionMapper:
def _algorithmic_source_mapping(
self,
sections: List[BlogOutlineSection],
research_data: BlogResearchResponse
research_data: BlogResearchResponse,
competitive_advantage: str = ""
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
"""
Perform algorithmic mapping of sources to sections.
@@ -101,6 +154,7 @@ class SourceToSectionMapper:
Args:
sections: List of outline sections
research_data: Research data with sources
competitive_advantage: Selected competitive advantage to boost matching
Returns:
Dictionary mapping section IDs to list of (source, score) tuples
@@ -114,7 +168,7 @@ class SourceToSectionMapper:
# Calculate multi-dimensional relevance score
semantic_score = self._calculate_semantic_similarity(section, source)
keyword_score = self._calculate_keyword_relevance(section, source, research_data)
contextual_score = self._calculate_contextual_relevance(section, source, research_data)
contextual_score = self._calculate_contextual_relevance(section, source, research_data, competitive_advantage)
# Weighted total score
total_score = (
@@ -140,38 +194,54 @@ class SourceToSectionMapper:
def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
"""
Calculate semantic similarity between section and source.
Args:
section: Outline section
source: Research source
Returns:
Semantic similarity score (0.0 to 1.0)
Uses word overlap, stem matching, bigram overlap, title-boost, and synonym expansion.
"""
# Extract text content for comparison
section_text = self._extract_section_text(section)
source_text = self._extract_source_text(source)
# Calculate word overlap
section_words = self._extract_meaningful_words(section_text)
source_words = self._extract_meaningful_words(source_text)
if not section_words or not source_words:
return 0.0
# Calculate Jaccard similarity
intersection = len(set(section_words) & set(source_words))
union = len(set(section_words) | set(source_words))
section_set = set(section_words)
source_set = set(source_words)
jaccard_similarity = intersection / union if union > 0 else 0.0
# 1. Jaccard similarity on raw words
intersection = len(section_set & source_set)
union = len(section_set | source_set)
jaccard = intersection / union if union > 0 else 0.0
# Boost score for exact phrase matches
phrase_boost = self._calculate_phrase_similarity(section_text, source_text)
# 2. Stem matching — catches word variants (e.g., "running" vs "runs")
section_stems = set(self._stem_word(w) for w in section_words)
source_stems = set(self._stem_word(w) for w in source_words)
stem_intersection = len(section_stems & source_stems)
stem_union = len(section_stems | source_stems)
stem_similarity = stem_intersection / stem_union if stem_union > 0 else 0.0
# Combine Jaccard similarity with phrase boost
semantic_score = min(1.0, jaccard_similarity + phrase_boost)
# 3. Bigram overlap — catches multi-word concepts (e.g., "machine learning")
section_bigrams = set(self._extract_bigrams(section_text))
source_bigrams = set(self._extract_bigrams(source_text))
bigram_overlap = len(section_bigrams & source_bigrams)
bigram_score = min(0.3, bigram_overlap * 0.1) if (section_bigrams or source_bigrams) else 0.0
return semantic_score
# 4. Title-boost — section heading matching source title is a strong signal
heading = (section.heading or '').lower()
source_title = (source.title or '').lower()
heading_words = set(self._extract_meaningful_words(heading))
title_words = set(self._extract_meaningful_words(source_title))
title_overlap = len(heading_words & title_words) / len(heading_words | title_words) if (heading_words or title_words) else 0.0
title_boost = min(0.3, title_overlap * 0.5)
# 5. Synonym expansion — expand abbreviations and match across synonym pairs
synonym_score = self._calculate_synonym_overlap(section_words, source_words)
# Combine: Jaccard + stem give base, bigram + title + synonyms boost
base_similarity = max(jaccard, stem_similarity)
combined = min(1.0, base_similarity + bigram_score + title_boost + synonym_score + 0.0)
return combined
def _calculate_keyword_relevance(
self,
@@ -219,7 +289,8 @@ class SourceToSectionMapper:
self,
section: BlogOutlineSection,
source: ResearchSource,
research_data: BlogResearchResponse
research_data: BlogResearchResponse,
competitive_advantage: str = ""
) -> float:
"""
Calculate contextual relevance based on section content and source context.
@@ -228,6 +299,7 @@ class SourceToSectionMapper:
section: Outline section
source: Research source
research_data: Research data with context
competitive_advantage: Selected competitive advantage to boost matching
Returns:
Contextual relevance score (0.0 to 1.0)
@@ -264,6 +336,15 @@ class SourceToSectionMapper:
industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words) if industry_words else 0.0
contextual_score += industry_score * 0.2
# 4. Competitive advantage boost — sources that match the advantage get a score lift
if competitive_advantage:
advantage_words = set(self._extract_meaningful_words(competitive_advantage.lower()))
if advantage_words:
advantage_in_section = sum(1 for w in advantage_words if w in section_text) / len(advantage_words)
advantage_in_source = sum(1 for w in advantage_words if w in source_text) / len(advantage_words)
if advantage_in_section > 0.3 and advantage_in_source > 0.3:
contextual_score += 0.25 * (advantage_in_section + advantage_in_source)
return min(1.0, contextual_score)
def _ai_validate_mapping(
@@ -360,10 +441,15 @@ class SourceToSectionMapper:
return " ".join(text_parts)
def _extract_source_text(self, source: ResearchSource) -> str:
"""Extract all text content from a source."""
"""Extract all text content from a source, including full text for better matching."""
text_parts = [source.title]
if source.summary:
text_parts.append(source.summary)
if source.excerpt:
text_parts.append(source.excerpt)
content = getattr(source, 'content', '') or ''
if content:
text_parts.append(content[:500])
return " ".join(text_parts)
def _extract_meaningful_words(self, text: str) -> List[str]:
@@ -382,6 +468,41 @@ class SourceToSectionMapper:
return meaningful_words
def _stem_word(self, word: str) -> str:
"""Rudimentary suffix-stripping stemmer for English words."""
if len(word) <= 3:
return word
for suffix in ['ization', 'ation', 'tion', 'sion', 'ment', 'ness', 'ity', 'ing', 'able', 'ible', 'ful', 'less', 'ous', 'ive', 'ally', 'ly', 'er', 'ed', 'es', 's']:
if word.endswith(suffix) and len(word) - len(suffix) >= 3:
return word[:-len(suffix)]
return word
def _extract_bigrams(self, text: str) -> List[str]:
"""Extract meaningful two-word phrases from text."""
words = self._extract_meaningful_words(text)
if len(words) < 2:
return []
return [f"{words[i]} {words[i+1]}" for i in range(len(words) - 1)]
def _calculate_synonym_overlap(self, section_words: List[str], source_words: List[str]) -> float:
"""Score overlap via abbreviation/synonym expansion."""
section_set = set(section_words)
source_set = set(source_words)
extra_matches = 0
total_terms = len(section_set | source_set) or 1
for abbr, expansions in self._synonym_map.items():
abbr_in_section = abbr in section_set
abbr_in_source = abbr in source_set
for expansion in expansions:
exp_words = set(expansion.split())
exp_in_section = exp_words.issubset(section_set)
exp_in_source = exp_words.issubset(source_set)
if (abbr_in_section and exp_in_source) or (abbr_in_source and exp_in_section):
extra_matches += 1
return min(0.2, extra_matches * 0.05)
def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
"""Calculate phrase similarity boost score."""
if not text1 or not text2:

View File

@@ -18,7 +18,7 @@ class CompetitorAnalyzer:
Analyze the following research content and extract competitor insights:
Research Content:
{content[:3000]}
{content[:8000]}
Extract and analyze:
1. Top competitors mentioned (companies, brands, platforms)

View File

@@ -17,7 +17,7 @@ class ContentAngleGenerator:
Analyze the following research content and create strategic content angles for: {topic} in {industry}
Research Content:
{content[:3000]}
{content[:8000]}
Create 7 compelling content angles that:
1. Leverage current trends and data from the research

View File

@@ -7,6 +7,8 @@ Neural search implementation using Exa API for high-quality, citation-rich resea
from exa_py import Exa
import os
import asyncio
from datetime import datetime
from urllib.parse import urlparse
from typing import List, Dict, Any
from loguru import logger
from models.subscription_models import APIProvider
@@ -355,6 +357,125 @@ class ExaResearchProvider(BaseProvider):
return None
def _calculate_credibility_score(self, result) -> float:
"""Dynamic credibility score based on domain authority, recency, and content substance."""
scores = []
weights = []
# Domain authority (weight: 3) — most important signal
url = result.url if hasattr(result, 'url') else ''
domain_score = self._score_domain_authority(url)
scores.append(domain_score)
weights.append(3)
# Recency (weight: 2) — fresher content is more valuable
recency_score = self._score_recency(result)
scores.append(recency_score)
weights.append(2)
# Content substance (weight: 2) — richer content = more substantive source
substance_score = self._score_substance(result)
scores.append(substance_score)
weights.append(2)
# Exa relevance score (weight: 2) — Exa's own relevance ranking
exa_score = 0.5
if hasattr(result, 'score') and result.score is not None:
exa_score = float(result.score)
scores.append(exa_score)
weights.append(2)
total = sum(s * w for s, w in zip(scores, weights))
total_weight = sum(weights)
return round(total / total_weight, 3)
@staticmethod
def _score_domain_authority(url: str) -> float:
if not url:
return 0.5
try:
domain = urlparse(url).netloc.lower()
except Exception:
return 0.5
if domain.startswith('www.'):
domain = domain[4:]
# Tier 1: Government, educational, major research
if domain.endswith('.gov') or domain.endswith('.edu'):
return 0.95
if domain in ('arxiv.org', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
'scholar.google.com', 'researchgate.net', 'sciencedaily.com',
'nature.com', 'science.org', 'pnas.org'):
return 0.92
# Tier 2: Major established news and professional publications
tier2 = {
'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'npr.org',
'wsj.com', 'nytimes.com', 'economist.com', 'bloomberg.com',
'theguardian.com', 'ft.com', 'washingtonpost.com',
'forbes.com', 'hbr.org', 'techcrunch.com', 'wired.com',
'cnn.com', 'nbcnews.com', 'cbsnews.com', 'abcnews.go.com',
}
# Extract base domain
parts = domain.split('.')
base = '.'.join(parts[-2:]) if len(parts) >= 2 else domain
if base in tier2:
return 0.88
# Tier 3: Industry research and established .org
tier3 = {
'statista.com', 'pewresearch.org', 'gartner.com', 'mckinsey.com',
'deloitte.com', 'pwc.com', 'ey.com', 'kpmg.com',
'hubspot.com', 'moz.com', 'searchengineland.com',
'neilpatel.com', 'backlinko.com', 'copyblogger.com',
}
if base in tier3:
return 0.80
if domain.endswith('.org'):
return 0.75
return 0.60
def _score_recency(self, result) -> float:
if not hasattr(result, 'publishedDate') or not result.publishedDate:
return 0.70
try:
published = datetime.strptime(result.publishedDate[:10], '%Y-%m-%d')
days_old = (datetime.now() - published).days
if days_old < 30:
return 1.0
elif days_old < 180:
return 0.90
elif days_old < 365:
return 0.80
elif days_old < 730:
return 0.65
elif days_old < 1825:
return 0.45
else:
return 0.25
except Exception:
return 0.70
def _score_substance(self, result) -> float:
total_chars = 0
if hasattr(result, 'highlights') and result.highlights:
total_chars += sum(len(h or '') for h in result.highlights)
if hasattr(result, 'summary') and result.summary:
total_chars += len(result.summary)
if hasattr(result, 'text') and result.text:
total_chars += len(result.text)
if total_chars > 2000:
return 0.95
elif total_chars > 1000:
return 0.85
elif total_chars > 500:
return 0.75
elif total_chars > 100:
return 0.60
return 0.40
def _transform_sources(self, results):
"""Transform Exa results to ResearchSource format."""
sources = []
@@ -368,7 +489,7 @@ class ExaResearchProvider(BaseProvider):
'title': result.title if hasattr(result, 'title') else '',
'url': result.url if hasattr(result, 'url') else '',
'excerpt': self._get_excerpt(result),
'credibility_score': 0.85, # Exa results are high quality
'credibility_score': self._calculate_credibility_score(result),
'published_at': result.publishedDate if hasattr(result, 'publishedDate') else None,
'index': idx,
'source_type': source_type,
@@ -388,7 +509,7 @@ class ExaResearchProvider(BaseProvider):
if hasattr(result, 'summary') and result.summary:
return result.summary
if hasattr(result, 'text') and result.text:
return result.text[:500]
return result.text[:1000]
return ''
def _determine_source_type(self, url):

View File

@@ -19,7 +19,7 @@ class KeywordAnalyzer:
Analyze the following research content and extract comprehensive keyword insights for: {', '.join(original_keywords)}
Research Content:
{content[:3000]} # Limit to avoid token limits
{content[:8000]}
Extract and analyze:
1. Primary keywords (main topic terms)

View File

@@ -250,10 +250,32 @@ class ResearchService:
if 'content' not in locals() or 'sources' not in locals():
raise RuntimeError(f"{config.provider.value} research did not return content or sources. Research failed.")
# Build compact all-source summary for richer analysis
analysis_content = self._build_analysis_content(sources)
# Run dedicated competitor search for richer competitor intelligence
competitor_content = analysis_content
try:
comp_query = f"top {industry} companies or competitors {topic}"
comp_results = await exa_provider.simple_search(
query=comp_query, num_results=5, user_id=user_id,
)
if comp_results:
comp_lines = ["COMPETITOR SEARCH RESULTS:"]
for r in comp_results:
title = r.get('title', '')
text = (r.get('text', '') or '')[:400]
comp_lines.append(f"- {title}")
if text:
comp_lines.append(f" {text[:200]}")
competitor_content = "\n".join(comp_lines) + "\n\n" + analysis_content
except Exception as e:
logger.warning(f"Competitor search failed (non-critical): {e}")
# Continue with common analysis (same for both providers)
keyword_analysis = self.keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
competitor_analysis = self.competitor_analyzer.analyze(content, user_id=user_id)
suggested_angles = self.content_angle_generator.generate(content, topic, industry, user_id=user_id)
keyword_analysis = self.keyword_analyzer.analyze(analysis_content, request.keywords, user_id=user_id)
competitor_analysis = self.competitor_analyzer.analyze(competitor_content, user_id=user_id)
suggested_angles = self.content_angle_generator.generate(analysis_content, topic, industry, user_id=user_id)
logger.info(f"Research completed successfully with {len(sources)} sources and {len(search_queries)} search queries")
@@ -586,9 +608,30 @@ class ResearchService:
# Continue with common analysis (same for both providers)
await task_manager.update_progress(task_id, "🔍 Analyzing keywords and content angles...")
keyword_analysis = self.keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
competitor_analysis = self.competitor_analyzer.analyze(content, user_id=user_id)
suggested_angles = self.content_angle_generator.generate(content, topic, industry, user_id=user_id)
analysis_content = self._build_analysis_content(sources)
# Run dedicated competitor search for richer competitor intelligence
competitor_content = analysis_content
try:
comp_query = f"top {industry} companies or competitors {topic}"
comp_results = await exa_provider.simple_search(
query=comp_query, num_results=5, user_id=user_id,
)
if comp_results:
comp_lines = ["COMPETITOR SEARCH RESULTS:"]
for r in comp_results:
title = r.get('title', '')
text = (r.get('text', '') or '')[:400]
comp_lines.append(f"- {title}")
if text:
comp_lines.append(f" {text[:200]}")
competitor_content = "\n".join(comp_lines) + "\n\n" + analysis_content
except Exception as e:
logger.warning(f"Competitor search failed (non-critical): {e}")
keyword_analysis = self.keyword_analyzer.analyze(analysis_content, request.keywords, user_id=user_id)
competitor_analysis = self.competitor_analyzer.analyze(competitor_content, user_id=user_id)
suggested_angles = self.content_angle_generator.generate(analysis_content, topic, industry, user_id=user_id)
await task_manager.update_progress(task_id, "💾 Caching results for future use...")
logger.info(f"Research completed successfully with {len(sources)} sources and {len(search_queries)} search queries")
@@ -780,6 +823,33 @@ class ResearchService:
web_search_queries=search_queries or [],
)
def _build_analysis_content(self, sources: List[Dict[str, Any]]) -> str:
"""Build compact all-source summary for LLM analysis.
Each source is distilled to one line with title, key content, and highlights.
This ensures ALL sources are visible to keyword, competitor, and angle
analyzers instead of only the first few (raw content[:3000]).
"""
if not sources:
return ""
lines = []
for src in sources:
title = src.get('title', '') or ''
summary = src.get('summary', '') or ''
highlights = src.get('highlights', []) or []
excerpt = src.get('excerpt', '') or ''
part = f"{title}"
if summary:
part += f"{summary[:250]}"
elif excerpt:
part += f"{excerpt[:250]}"
if highlights:
findings = [h[:120] for h in highlights[:2] if h]
if findings:
part += f" | {'; '.join(findings)}"
lines.append(part)
return "\n".join(lines)
def _normalize_cached_research_data(self, cached_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Normalize cached research data to fix None values in confidence_scores.

View File

@@ -6,6 +6,7 @@ Leverages existing non-AI SEO tools and uses single AI prompt for structured ana
"""
import asyncio
import math
import re
import textstat
from datetime import datetime
@@ -34,7 +35,7 @@ class BlogContentSEOAnalyzer:
logger.info("BlogContentSEOAnalyzer initialized")
async def analyze_blog_content(self, blog_content: str, research_data: Dict[str, Any], blog_title: Optional[str] = None, user_id: str = None) -> Dict[str, Any]:
async def analyze_blog_content(self, blog_content: str, research_data: Dict[str, Any], blog_title: Optional[str] = None, user_id: str = None, outline: Optional[List[Dict[str, Any]]] = None, competitive_advantage: Optional[str] = None) -> Dict[str, Any]:
"""
Main analysis method with parallel processing
@@ -43,6 +44,8 @@ class BlogContentSEOAnalyzer:
research_data: Research data containing keywords and other insights
blog_title: Optional blog title
user_id: Clerk user ID for subscription checking (required)
outline: Optional outline sections for context-aware analysis
competitive_advantage: Optional competitive advantage for context
Returns:
Comprehensive SEO analysis results
@@ -52,21 +55,24 @@ class BlogContentSEOAnalyzer:
try:
logger.info("Starting blog content SEO analysis")
# Extract keywords from research data
keywords_data = self._extract_keywords_from_research(research_data)
logger.info(f"Extracted keywords: {keywords_data}")
# Extract research context (keywords + competitor data + search queries)
research_context = self._extract_research_context(research_data)
logger.info(f"Extracted research context with {len(research_context.get('primary', []))} primary keywords")
# Phase 1: Run non-AI analyzers in parallel
logger.info("Running non-AI analyzers in parallel")
non_ai_results = await self._run_non_ai_analyzers(blog_content, keywords_data)
non_ai_results = await self._run_non_ai_analyzers(blog_content, research_context)
# Phase 2: Single AI analysis for structured insights
# Phase 2: Single AI analysis for structured insights (with outline + competitive context)
logger.info("Running AI analysis")
ai_insights = await self._run_ai_analysis(blog_content, keywords_data, non_ai_results, user_id=user_id)
ai_insights = await self._run_ai_analysis(
blog_content, research_context, non_ai_results, user_id=user_id,
outline=outline, competitive_advantage=competitive_advantage
)
# Phase 3: Compile and format results
logger.info("Compiling results")
results = self._compile_blog_seo_results(non_ai_results, ai_insights, keywords_data)
results = self._compile_blog_seo_results(non_ai_results, ai_insights, research_context)
logger.info(f"SEO analysis completed. Overall score: {results.get('overall_score', 0)}")
return results
@@ -76,14 +82,19 @@ class BlogContentSEOAnalyzer:
# Fail fast - don't return fallback data
raise e
def _extract_keywords_from_research(self, research_data: Dict[str, Any]) -> Dict[str, Any]:
"""Extract keywords from research data"""
def _extract_research_context(self, research_data: Dict[str, Any]) -> Dict[str, Any]:
"""Extract research context from research data including keywords, competitor data, and search queries.
Previously only extracted keyword_analysis. Now also extracts:
- competitor_analysis (content_gaps, industry_leaders, opportunities, competitive_advantages)
- search_queries
- suggested_angles
"""
try:
logger.info(f"Extracting keywords from research data: {research_data}")
logger.info(f"Extracting research context from research data")
# Extract keywords from research data structure
keyword_analysis = research_data.get('keyword_analysis', {})
logger.info(f"Found keyword_analysis: {keyword_analysis}")
# Handle different possible structures
primary_keywords = []
@@ -109,17 +120,37 @@ class BlogContentSEOAnalyzer:
'long_tail': long_tail_keywords,
'semantic': semantic_keywords,
'all_keywords': all_keywords,
'search_intent': keyword_analysis.get('search_intent', 'informational')
'search_intent': keyword_analysis.get('search_intent', 'informational'),
}
logger.info(f"Extracted keywords: {result}")
# Extract competitor analysis
competitor_analysis = research_data.get('competitor_analysis', {})
if competitor_analysis:
result['content_gaps'] = competitor_analysis.get('content_gaps', [])
result['industry_leaders'] = competitor_analysis.get('industry_leaders', [])
result['opportunities'] = competitor_analysis.get('opportunities', [])
result['competitive_advantages'] = competitor_analysis.get('competitive_advantages', [])
else:
result['content_gaps'] = []
result['industry_leaders'] = []
result['opportunities'] = []
result['competitive_advantages'] = []
# Extract search queries
search_queries = research_data.get('search_queries', [])
result['search_queries'] = search_queries if isinstance(search_queries, list) else []
# Extract suggested angles
suggested_angles = research_data.get('suggested_angles', [])
result['suggested_angles'] = suggested_angles if isinstance(suggested_angles, list) else []
logger.info(f"Extracted research context: {len(primary_keywords)} primary keywords, {len(result.get('content_gaps', []))} content gaps, {len(result.get('search_queries', []))} search queries")
return result
except Exception as e:
logger.error(f"Failed to extract keywords from research data: {e}")
logger.error(f"Failed to extract research context from research data: {e}")
logger.error(f"Research data structure: {research_data}")
# Fail fast - don't return empty keywords
raise ValueError(f"Keyword extraction failed: {e}")
raise ValueError(f"Research context extraction failed: {e}")
async def _run_non_ai_analyzers(self, blog_content: str, keywords_data: Dict[str, Any]) -> Dict[str, Any]:
"""Run all non-AI analyzers in parallel for maximum performance"""
@@ -170,10 +201,24 @@ class BlogContentSEOAnalyzer:
sentences = len(re.findall(r'[.!?]+', content))
# Blog-specific structure analysis
has_introduction = any('introduction' in line.lower() or 'overview' in line.lower()
for line in lines[:10])
has_conclusion = any('conclusion' in line.lower() or 'summary' in line.lower()
for line in lines[-10:])
content_lower = content.lower()
first_500 = content_lower[:500] if len(content) > 500 else content_lower
last_500 = content_lower[-500:] if len(content) > 500 else content_lower
has_introduction = any('introduction' in line.lower() or 'overview' in line.lower()
for line in lines[:10]) or any(
phrase in first_500 for phrase in [
'in this', 'this article', 'this guide', 'this post',
'we will', "you'll learn", "let's explore", "whether you're",
'in this section', 'this blog post', 'here we', 'today we',
"we'll explore", "we'll cover", "we'll dive"
])
has_conclusion = any('conclusion' in line.lower() or 'summary' in line.lower()
for line in lines[-10:]) or any(
phrase in last_500 for phrase in [
'in conclusion', 'to summarize', 'in summary', 'bottom line',
'key takeaways', 'remember that', "as we've seen", 'wrapping up',
'final thoughts', 'to conclude', 'in short', 'overall'
])
has_cta = any('call to action' in line.lower() or 'learn more' in line.lower()
for line in lines)
@@ -187,7 +232,7 @@ class BlogContentSEOAnalyzer:
'has_conclusion': has_conclusion,
'has_call_to_action': has_cta,
'structure_score': structure_score,
'recommendations': self._get_structure_recommendations(sections, has_introduction, has_conclusion)
'recommendations': self._get_structure_recommendations(sections, has_introduction, has_conclusion, content)
}
except Exception as e:
logger.error(f"Content structure analysis failed: {e}")
@@ -332,33 +377,36 @@ class BlogContentSEOAnalyzer:
raise e
# Helper methods for calculations and scoring
@staticmethod
def _sigmoid(x: float, midpoint: float = 0.0, steepness: float = 1.0) -> float:
"""Sigmoid function for smooth scoring curves. Returns 0-1."""
try:
return 1.0 / (1.0 + math.exp(-steepness * (x - midpoint)))
except OverflowError:
return 0.0 if x < midpoint else 1.0
def _calculate_structure_score(self, sections: int, paragraphs: int, has_intro: bool, has_conclusion: bool) -> int:
    """Calculate the content structure score from continuous curves.

    Components (summed, then clamped to 5-100):
      * Sections: ``40 * sigmoid(sections, midpoint=4, steepness=0.8)``.
        The curve rises monotonically; above 8 sections the component is
        scaled by 0.7 (with a floor of 10).
      * Paragraphs: ``40 * sigmoid(paragraphs, midpoint=10, steepness=0.3)``.
        Above 25 paragraphs the component is scaled by 0.6 (floor of 8).
      * Introduction / conclusion: 10 points each when present.

    Args:
        sections: Number of sections detected in the content.
        paragraphs: Number of paragraphs detected in the content.
        has_intro: Whether an introduction was detected.
        has_conclusion: Whether a conclusion was detected.

    Returns:
        Integer score between 5 and 100.
    """
    section_score = self._sigmoid(sections, midpoint=4, steepness=0.8) * 40
    if sections > 8:
        # Too many sections: discount, but never below 10 points.
        section_score = max(section_score * 0.7, 10)

    para_score = self._sigmoid(paragraphs, midpoint=10, steepness=0.3) * 40
    if paragraphs > 25:
        # Too many paragraphs: discount, but never below 8 points.
        para_score = max(para_score * 0.6, 8)

    intro_score = 10 if has_intro else 0
    conclusion_score = 10 if has_conclusion else 0

    total = section_score + para_score + intro_score + conclusion_score
    return int(min(max(total, 5), 100))
def _calculate_keyword_density(self, content: str, keyword: str) -> float:
"""Calculate keyword density percentage"""
@@ -397,21 +445,20 @@ class BlogContentSEOAnalyzer:
return total_words / len(paragraphs)
def _calculate_readability_score(self, metrics: Dict[str, float]) -> int:
    """Calculate a readability score from Flesch Reading Ease.

    The score is ``70 * sigmoid(flesch, midpoint=50, steepness=0.06) + 25``,
    plus a 5-point bonus when Flesch exceeds 80, clamped to 20-100.
    Approximate mapping: Flesch 0 -> ~28, 30 -> ~41, 50 -> 60, 70 -> ~78,
    90 -> ~94.

    Args:
        metrics: Readability metrics; only ``flesch_reading_ease`` is read.
            A missing or ``None`` value is treated as 0.

    Returns:
        Integer score between 20 and 100.
    """
    # `or 0` guards against the key being present with a None value, which
    # would otherwise raise inside the sigmoid arithmetic.
    flesch = metrics.get('flesch_reading_ease', 0) or 0
    score = self._sigmoid(flesch, midpoint=50, steepness=0.06) * 70 + 25
    if flesch > 80:
        score = min(score + 5, 100)
    return int(min(max(score, 20), 100))
def _determine_target_audience(self, metrics: Dict[str, float]) -> str:
"""Determine target audience based on readability metrics"""
@@ -427,183 +474,228 @@ class BlogContentSEOAnalyzer:
return "Graduate level"
def _calculate_content_depth_score(self, word_count: int, vocabulary_diversity: float) -> int:
    """Calculate the content depth score from continuous curves.

    Components (summed, then clamped to 5-100):
      * Word count: ``55 * sigmoid(word_count, midpoint=800, steepness=0.005)``,
        capped at 40 above 3000 words and at 15 below 300 words.
      * Vocabulary diversity:
        ``45 * sigmoid(vocabulary_diversity, midpoint=0.45, steepness=12)``,
        capped at 15 when diversity is below 0.3. The curve is monotonic:
        higher diversity is never penalised.

    Args:
        word_count: Total words in the content.
        vocabulary_diversity: Diversity ratio of the content's vocabulary.

    Returns:
        Integer score between 5 and 100.
    """
    word_score = self._sigmoid(word_count, midpoint=800, steepness=0.005) * 55
    if word_count > 3000:
        # Very long content: cap the word-count contribution.
        word_score = min(word_score, 40)
    elif word_count < 300:
        word_score = min(word_score, 15)

    diversity_score = self._sigmoid(vocabulary_diversity, midpoint=0.45, steepness=12) * 45
    if vocabulary_diversity < 0.3:
        diversity_score = min(diversity_score, 15)

    return int(min(max(word_score + diversity_score, 5), 100))
def _calculate_flow_score(self, transition_count: int, word_count: int) -> int:
"""Calculate content flow score"""
"""Calculate content flow score using continuous curve.
Transition density is typically low (most content has 0.5-3 per 100 words
of the specific transition words we track). The sigmoid midpoint is set at 1.0
with moderate steepness to produce a reasonable spread.
"""
if word_count == 0:
return 0
return 15
transition_density = transition_count / (word_count / 100)
# Optimal transition density: 1-3 per 100 words
if 1 <= transition_density <= 3:
return 90
elif transition_density < 1:
return 60
else:
return 70
# Sigmoid centered at 1.0 (decent density), moderate steepness
score = self._sigmoid(transition_density, midpoint=1.0, steepness=2.5) * 50 + 40
if transition_density > 5:
score = max(score - 10, 35)
return int(min(max(score, 15), 100))
def _calculate_heading_hierarchy_score(self, h1: List[str], h2: List[str], h3: List[str]) -> int:
    """Calculate the heading hierarchy score.

    Components (summed, then clamped to 10-100):
      * H1: 40 for exactly one, 15 for none, otherwise
        ``max(40 // count, 8)``.
      * H2: ``40 * sigmoid(count, midpoint=4, steepness=1.0)``; fixed at 5
        when there are none, and scaled by 0.6 (floor of 10) above 10.
      * H3: 5 points per heading, capped at 20.

    Args:
        h1: H1 heading texts.
        h2: H2 heading texts.
        h3: H3 heading texts.

    Returns:
        Integer score between 10 and 100.
    """
    h1_count = len(h1)
    if h1_count == 1:
        h1_score = 40
    elif h1_count == 0:
        h1_score = 15
    else:
        # Multiple H1s: score shrinks with each extra heading.
        h1_score = max(40 // h1_count, 8)

    h2_count = len(h2)
    h2_score = self._sigmoid(h2_count, midpoint=4, steepness=1.0) * 40
    if h2_count == 0:
        h2_score = 5
    elif h2_count > 10:
        h2_score = max(h2_score * 0.6, 10)

    h3_score = min(len(h3) * 5, 20)

    return int(min(max(h1_score + h2_score + h3_score, 10), 100))
def _calculate_keyword_score(self, keyword_analysis: Dict[str, Any]) -> int:
"""Calculate keyword optimization score"""
score = 0
# Check keyword density (optimal: 1-3%)
"""Calculate keyword optimization score using continuous curves.
Density: sigmoid centered at 2%, smooth peak.
Heading presence: binary bonus per keyword.
Early occurrence: sigmoid bonus.
Missing/over-optimization: smooth penalties.
"""
density_score = 0
heading_bonus = 0
early_bonus = 0
densities = keyword_analysis.get('keyword_density', {})
keyword_count = max(len(densities), 1)
for keyword, density in densities.items():
if 1 <= density <= 3:
score += 30
elif density < 1:
score += 15
else:
score += 10
# Check keyword distribution
# Density score: smooth peak at 1-3%, sigmoid curve
density_contribution = self._sigmoid(density, midpoint=2.0, steepness=2.0) * 30
if density > 4:
density_contribution *= 0.5 # penalty for over-optimization
density_score += density_contribution
density_score = density_score / keyword_count
# Heading presence bonus
distributions = keyword_analysis.get('keyword_distribution', {})
for keyword, dist in distributions.items():
if dist.get('in_headings', False):
score += 20
if dist.get('first_occurrence', -1) < 100: # Early occurrence
score += 20
# Penalize missing keywords
missing = len(keyword_analysis.get('missing_keywords', []))
score -= missing * 10
# Penalize over-optimization
over_opt = len(keyword_analysis.get('over_optimization', []))
score -= over_opt * 15
return max(0, min(score, 100))
heading_bonus += 15
first_occ = dist.get('first_occurrence', -1)
if isinstance(first_occ, (int, float)) and 0 <= first_occ < 150:
early_bonus += int(self._sigmoid(first_occ, midpoint=75, steepness=-0.04) * 15)
# Penalize missing keywords and over-optimization
missing_penalty = len(keyword_analysis.get('missing_keywords', [])) * 8
over_opt_penalty = len(keyword_analysis.get('over_optimization', [])) * 12
raw = density_score + heading_bonus + early_bonus - missing_penalty - over_opt_penalty
return int(min(max(raw, 5), 100))
def _calculate_weighted_score(self, scores: Dict[str, int]) -> int:
"""Calculate weighted overall score"""
"""Calculate weighted overall score.
AI insight engagement_score is unreliable (no ground truth) so it's excluded
from the overall score. The remaining 5 categories are re-weighted to sum to 1.0.
AI insights are still reported in category_scores for display but don't affect
the overall score.
"""
weights = {
'structure': 0.2,
'structure': 0.20,
'keywords': 0.25,
'readability': 0.2,
'quality': 0.15,
'headings': 0.1,
'ai_insights': 0.1
'readability': 0.20,
'quality': 0.20,
'headings': 0.15,
}
weighted_sum = sum(scores.get(key, 0) * weight for key, weight in weights.items())
return int(weighted_sum)
return int(min(max(weighted_sum, 0), 100))
# Recommendation methods
def _get_structure_recommendations(self, sections: int, has_intro: bool, has_conclusion: bool) -> List[str]:
"""Get structure recommendations"""
def _get_structure_recommendations(self, sections: int, has_intro: bool, has_conclusion: bool, content: str = '') -> List[str]:
"""Get structure recommendations based on actual content analysis"""
recommendations = []
if sections < 3:
recommendations.append("Add more sections to improve content structure")
recommendations.append("Add more sections to improve content structure and topic coverage")
elif sections > 8:
recommendations.append("Consider combining some sections for better flow")
if not has_intro:
recommendations.append("Add an introduction section to set context")
if not has_conclusion:
recommendations.append("Add a conclusion section to summarize key points")
recommendations.append("Consider combining some sections for better flow and readability")
# More robust intro detection: check first 200 chars for first-person address,
# question, or general hook — not just keyword matching
first_200 = (content[:500] if content else '').lower()
intro_indicators = any([
has_intro,
'?' in first_200[:200],
any(phrase in first_200 for phrase in ['in this', 'this article', 'this guide', 'this post', 'we will', "you'll learn", "let's explore", "whether you're"]),
first_200.strip().startswith('# '),
])
if not intro_indicators:
recommendations.append("Add an introduction that hooks the reader and previews key topics")
# More robust conclusion detection
last_500 = (content[-500:] if content else '').lower()
conclusion_indicators = any([
has_conclusion,
any(phrase in last_500 for phrase in ['in conclusion', 'to summarize', 'in summary', 'bottom line', 'key takeaways', 'remember that', 'as we\'ve seen']),
])
if not conclusion_indicators:
recommendations.append("Add a conclusion to summarize key points and provide next steps")
return recommendations
def _get_readability_recommendations(self, metrics: Dict[str, float], avg_sentence_length: float) -> List[str]:
"""Get readability recommendations"""
"""Get readability recommendations with specific, actionable guidance"""
recommendations = []
flesch_score = metrics.get('flesch_reading_ease', 0)
if flesch_score < 60:
recommendations.append("Simplify language and use shorter sentences")
if avg_sentence_length > 20:
recommendations.append("Break down long sentences for better readability")
if flesch_score > 80:
recommendations.append("Consider adding more technical depth for expert audience")
if flesch_score < 30:
recommendations.append("Content is very difficult to read — shorten sentences, use simpler words, and break up complex ideas")
elif flesch_score < 50:
recommendations.append("Content is fairly complex — consider simplifying some sentences and adding more plain-language explanations")
if avg_sentence_length > 25:
recommendations.append(f"Average sentence length is {avg_sentence_length:.0f} words — aim for 15-20 words per sentence for better readability")
elif avg_sentence_length > 20:
recommendations.append("Some sentences may be too long — try breaking a few into shorter ones for easier reading")
if flesch_score > 80 and flesch_score < 95:
recommendations.append("Readability is very good — consider adding slightly more technical depth for expert credibility")
return recommendations
def _get_content_quality_recommendations(self, word_count: int, vocabulary_diversity: float, transition_count: int) -> List[str]:
"""Get content quality recommendations"""
"""Get content quality recommendations with specific, actionable guidance"""
recommendations = []
if word_count < 800:
recommendations.append("Expand content with more detailed explanations")
elif word_count > 2000:
recommendations.append("Consider breaking into multiple posts")
if vocabulary_diversity < 0.4:
recommendations.append("Use more varied vocabulary to improve engagement")
if transition_count < 3:
recommendations.append("Add more transition words to improve flow")
if word_count < 400:
recommendations.append("Content is significantly underdeveloped — expand with detailed explanations, examples, and supporting evidence")
elif word_count < 800:
recommendations.append("Content is thin — add depth with specific examples, data points, and detailed explanations for each section")
elif word_count > 3000:
recommendations.append("Content is very long — consider whether all sections are necessary or if some could be a separate post")
if vocabulary_diversity < 0.35:
recommendations.append("Vocabulary is highly repetitive — use synonyms and varied phrasing to improve engagement")
elif vocabulary_diversity < 0.45:
recommendations.append("Vocabulary variety could be improved — try rephrasing repeated terms for more natural flow")
if transition_count < 2:
recommendations.append("Very few transition words found — add connectors like 'however', 'therefore', 'furthermore' between ideas")
elif transition_count < 5:
recommendations.append("Add more transition words to improve the flow between paragraphs and sections")
return recommendations
def _get_heading_recommendations(self, h1: List[str], h2: List[str], h3: List[str]) -> List[str]:
"""Get heading recommendations"""
"""Get heading recommendations with specific, actionable guidance"""
recommendations = []
if len(h1) == 0:
recommendations.append("Add a main H1 heading")
recommendations.append("Add a main H1 heading — this is the primary title for both readers and search engines")
elif len(h1) > 1:
recommendations.append("Use only one H1 heading per post")
recommendations.append(f"Found {len(h1)} H1 headings — use only one H1 per post for clarity. Convert extras to H2.")
if len(h2) < 3:
recommendations.append("Add more H2 headings to structure content")
elif len(h2) > 8:
recommendations.append("Consider using H3 headings for better hierarchy")
recommendations.append(f"Only {len(h2)} H2 headings found — add section headings to break up content and improve scanning")
elif len(h2) > 10:
recommendations.append(f"{len(h2)} H2 headings may be too many — consider using H3 subheadings within sections for better hierarchy")
if len(h2) >= 3 and len(h3) == 0 and len(h2) > 5:
recommendations.append("Consider adding H3 subheadings within longer H2 sections for better content hierarchy")
return recommendations
async def _run_ai_analysis(self, blog_content: str, keywords_data: Dict[str, Any], non_ai_results: Dict[str, Any], user_id: str = None) -> Dict[str, Any]:
async def _run_ai_analysis(self, blog_content: str, keywords_data: Dict[str, Any], non_ai_results: Dict[str, Any], user_id: str = None, outline: Optional[List[Dict[str, Any]]] = None, competitive_advantage: Optional[str] = None) -> Dict[str, Any]:
"""Run single AI analysis for structured insights (provider-agnostic)"""
if not user_id:
raise ValueError("user_id is required for subscription checking. Please provide Clerk user ID.")
@@ -612,7 +704,9 @@ class BlogContentSEOAnalyzer:
context = {
'blog_content': blog_content,
'keywords_data': keywords_data,
'non_ai_results': non_ai_results
'non_ai_results': non_ai_results,
'outline': outline or [],
'competitive_advantage': competitive_advantage or '',
}
# Create AI prompt for structured analysis
@@ -624,10 +718,18 @@ class BlogContentSEOAnalyzer:
"content_quality_insights": {
"type": "object",
"properties": {
"engagement_score": {"type": "number"},
"value_proposition": {"type": "string"},
"content_gaps": {"type": "array", "items": {"type": "string"}},
"improvement_suggestions": {"type": "array", "items": {"type": "string"}}
"improvement_suggestions": {"type": "array", "items": {"type": "string"}},
"content_depth_indicators": {
"type": "object",
"properties": {
"has_specific_data_points": {"type": "boolean"},
"has_examples_or_illustrations": {"type": "boolean"},
"has_actionable_takeaways": {"type": "boolean"},
"depth_assessment": {"type": "string"}
}
}
}
},
"seo_optimization_insights": {
@@ -648,13 +750,12 @@ class BlogContentSEOAnalyzer:
"ux_improvements": {"type": "array", "items": {"type": "string"}}
}
},
"competitive_analysis": {
"content_strengths": {
"type": "object",
"properties": {
"content_differentiation": {"type": "string"},
"unique_value": {"type": "string"},
"competitive_advantages": {"type": "array", "items": {"type": "string"}},
"market_positioning": {"type": "string"}
"strongest_sections": {"type": "array", "items": {"type": "string"}},
"unique_value_points": {"type": "array", "items": {"type": "string"}},
"reader_value_assessment": {"type": "string"}
}
}
}
@@ -675,37 +776,85 @@ class BlogContentSEOAnalyzer:
raise e
def _create_ai_analysis_prompt(self, context: Dict[str, Any]) -> str:
"""Create AI analysis prompt"""
"""Create AI analysis prompt with research context and outline awareness"""
blog_content = context['blog_content']
keywords_data = context['keywords_data']
non_ai_results = context['non_ai_results']
outline = context.get('outline', [])
competitive_advantage = context.get('competitive_advantage', '')
# Build outline context
outline_text = ""
if outline:
section_names = []
for sec in outline[:8]:
heading = sec.get('heading', '') if isinstance(sec, dict) else getattr(sec, 'heading', '')
subheadings = sec.get('subheadings', []) if isinstance(sec, dict) else getattr(sec, 'subheadings', [])
sub_text = f" (subtopics: {', '.join(subheadings[:4])})" if subheadings else ""
target_words = sec.get('target_words', '') if isinstance(sec, dict) else getattr(sec, 'target_words', '')
word_text = f" [~{target_words} words]" if target_words else ""
section_names.append(f" - {heading}{sub_text}{word_text}")
outline_text = "\n".join(section_names)
# Build research context block
research_block = ""
content_gaps = keywords_data.get('content_gaps', [])
competitive_advantages = keywords_data.get('competitive_advantages', [])
search_queries = keywords_data.get('search_queries', [])
suggested_angles = keywords_data.get('suggested_angles', [])
industry_leaders = keywords_data.get('industry_leaders', [])
if content_gaps:
research_block += f"\nCONTENT GAPS (from competitor analysis): {', '.join(content_gaps[:5])}"
if competitive_advantages:
research_block += f"\nOUR COMPETITIVE ADVANTAGES: {', '.join(competitive_advantages[:3])}"
if competitive_advantage:
research_block += f"\nFOCUSED COMPETITIVE ADVANTAGE: {competitive_advantage}"
if search_queries:
research_block += f"\nORIGINAL SEARCH QUERIES: {', '.join(search_queries[:5])}"
if suggested_angles:
research_block += f"\nPLANNED CONTENT ANGLES: {', '.join(suggested_angles[:3])}"
if industry_leaders:
research_block += f"\nINDUSTRY LEADERS: {', '.join(industry_leaders[:3])}"
prompt = f"""
Analyze this blog content for SEO optimization and user experience. Provide structured insights based on the content and keyword data.
Analyze this blog content for SEO optimization and user experience. Provide structured insights based ONLY on what is actually present in the content and keyword data. Do NOT fabricate data, statistics, competitor names, or case studies that are not in the content.
BLOG CONTENT:
{blog_content[:2000]}...
{blog_content[:3000]}...
KEYWORDS DATA:
Primary Keywords: {keywords_data.get('primary', [])}
Long-tail Keywords: {keywords_data.get('long_tail', [])}
Semantic Keywords: {keywords_data.get('semantic', [])}
Search Intent: {keywords_data.get('search_intent', 'informational')}
Search Intent: {keywords_data.get('search_intent', 'informational')}{research_block}
NON-AI ANALYSIS RESULTS:
Structure Score: {non_ai_results.get('content_structure', {}).get('structure_score', 0)}
Readability Score: {non_ai_results.get('readability_analysis', {}).get('readability_score', 0)}
Content Quality Score: {non_ai_results.get('content_quality', {}).get('content_depth_score', 0)}
MEASURED ANALYSIS RESULTS:
Structure Score: {non_ai_results.get('content_structure', {}).get('structure_score', 0)}/100
Readability Score: {non_ai_results.get('readability_analysis', {}).get('readability_score', 0)}/100
Content Quality Score: {non_ai_results.get('content_quality', {}).get('content_depth_score', 0)}/100
Heading Hierarchy Score: {non_ai_results.get('heading_structure', {}).get('heading_hierarchy_score', 0)}/100
Word Count: {non_ai_results.get('content_quality', {}).get('word_count', 0)}
Sections: {non_ai_results.get('content_structure', {}).get('total_sections', 0)}
Has Introduction: {non_ai_results.get('content_structure', {}).get('has_introduction', False)}
Has Conclusion: {non_ai_results.get('content_structure', {}).get('has_conclusion', False)}{f"""
Please provide:
1. Content Quality Insights: Assess engagement potential, value proposition, content gaps, and improvement suggestions
2. SEO Optimization Insights: Evaluate keyword optimization, content relevance, search intent alignment, and SEO improvements
3. User Experience Insights: Analyze content flow, readability, engagement factors, and UX improvements
4. Competitive Analysis: Identify content differentiation, unique value, competitive advantages, and market positioning
PLANNED OUTLINE STRUCTURE:
{outline_text}""" if outline_text else ""}
{f"""
Focus on actionable insights that can improve the blog's performance and user engagement.
FOCUSED ADVANTAGE: {competitive_advantage}""" if competitive_advantage else ""}
IMPORTANT: SEO metadata (title tag, meta description, Open Graph tags, Twitter cards, JSON-LD schema) will be generated in a separate step. Do NOT recommend adding or improving meta descriptions, title tags, OG tags, or structured data markup — focus only on content-level improvements.
Provide:
1. Content Quality Insights: Assess the value proposition based on actual content. Identify specific content gaps (what TOPICS from the planned outline or competitor analysis are missing; do NOT suggest adding case studies unless the content references specific studies). Suggest improvements grounded in what the content currently covers.
2. Content Depth Indicators: Objectively assess whether the content contains specific data points, examples, or actionable takeaways. These are binary assessments based on what's actually in the text.
3. SEO Optimization Insights: Evaluate keyword optimization based on the provided keyword data. Assess content relevance and search intent alignment relative to the original search queries.
4. User Experience Insights: Analyze content flow and readability. Identify engagement factors present in the text.
5. Content Strengths: Identify the strongest sections of the content by heading name. Note unique value points the content provides. Do NOT invent competitive advantages — only describe what makes THIS content valuable based on the competitive advantages and content gaps listed above.
"""
return prompt
def _compile_blog_seo_results(self, non_ai_results: Dict[str, Any], ai_insights: Dict[str, Any], keywords_data: Dict[str, Any]) -> Dict[str, Any]:
@@ -719,13 +868,28 @@ class BlogContentSEOAnalyzer:
raise ValueError("AI insights are missing")
# Calculate category scores
# Compute ai_depth_score from measurable content_depth_indicators instead of
# hallucinated engagement_score. If depth_indicators are present, score based on
# boolean flags; otherwise default to 50 (neutral).
ai_quality = ai_insights.get('content_quality_insights', {})
depth_indicators = ai_quality.get('content_depth_indicators', {})
if depth_indicators:
depth_flags = [
depth_indicators.get('has_specific_data_points', False),
depth_indicators.get('has_examples_or_illustrations', False),
depth_indicators.get('has_actionable_takeaways', False),
]
depth_score = 40 + (sum(depth_flags) * 20) # 40 baseline + 20 per true flag = 40-100
else:
depth_score = 50
category_scores = {
'structure': non_ai_results.get('content_structure', {}).get('structure_score', 0),
'keywords': self._calculate_keyword_score(non_ai_results.get('keyword_analysis', {})),
'readability': non_ai_results.get('readability_analysis', {}).get('readability_score', 0),
'quality': non_ai_results.get('content_quality', {}).get('content_depth_score', 0),
'headings': non_ai_results.get('heading_structure', {}).get('heading_hierarchy_score', 0),
'ai_insights': ai_insights.get('content_quality_insights', {}).get('engagement_score', 0)
'ai_insights': depth_score
}
# Calculate overall score
@@ -757,7 +921,15 @@ class BlogContentSEOAnalyzer:
def _compile_actionable_recommendations(self, non_ai_results: Dict[str, Any], ai_insights: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Compile actionable recommendations from all sources"""
recommendations = []
# Metadata-related keywords to filter out (handled by metadata generator)
metadata_keywords = ['meta description', 'title tag', 'og tag', 'open graph',
'twitter card', 'json-ld', 'schema markup', 'structured data markup']
def _is_metadata_rec(rec_text: str) -> bool:
rec_lower = rec_text.lower()
return any(kw in rec_lower for kw in metadata_keywords)
# Structure recommendations
structure_recs = non_ai_results.get('content_structure', {}).get('recommendations', [])
for rec in structure_recs:
@@ -767,7 +939,7 @@ class BlogContentSEOAnalyzer:
'recommendation': rec,
'impact': 'Improves content organization and user experience'
})
# Keyword recommendations
keyword_recs = non_ai_results.get('keyword_analysis', {}).get('recommendations', [])
for rec in keyword_recs:
@@ -777,7 +949,7 @@ class BlogContentSEOAnalyzer:
'recommendation': rec,
'impact': 'Improves search engine visibility'
})
# Readability recommendations
readability_recs = non_ai_results.get('readability_analysis', {}).get('recommendations', [])
for rec in readability_recs:
@@ -787,17 +959,40 @@ class BlogContentSEOAnalyzer:
'recommendation': rec,
'impact': 'Improves user engagement and comprehension'
})
# AI insights recommendations
# AI insights recommendations (filter out metadata-related recs)
ai_recs = ai_insights.get('content_quality_insights', {}).get('improvement_suggestions', [])
for rec in ai_recs:
if not _is_metadata_rec(rec):
recommendations.append({
'category': 'Content Quality',
'priority': 'Medium',
'recommendation': rec,
'impact': 'Enhances content value and engagement'
})
# SEO improvement recommendations (filter metadata recs)
seo_recs = ai_insights.get('seo_optimization_insights', {}).get('seo_improvements', [])
for rec in seo_recs:
if not _is_metadata_rec(rec):
recommendations.append({
'category': 'SEO',
'priority': 'Medium',
'recommendation': rec,
'impact': 'Improves search engine optimization'
})
# Content strengths as informational (lower priority)
content_strengths = ai_insights.get('content_strengths', {})
strong_sections = content_strengths.get('strongest_sections', [])
if strong_sections:
recommendations.append({
'category': 'Content Quality',
'priority': 'Medium',
'recommendation': rec,
'impact': 'Enhances content value and engagement'
'category': 'Strengths',
'priority': 'Low',
'recommendation': f"Strongest sections: {', '.join(strong_sections[:3])}. Consider expanding these areas further.",
'impact': 'Leverages existing content strengths'
})
return recommendations
def _create_visualization_data(self, category_scores: Dict[str, int], non_ai_results: Dict[str, Any]) -> Dict[str, Any]:
@@ -851,7 +1046,7 @@ class BlogContentSEOAnalyzer:
'weakest_category': weakest_category[0],
'key_strengths': self._identify_key_strengths(category_scores),
'key_weaknesses': self._identify_key_weaknesses(category_scores),
'ai_summary': ai_insights.get('content_quality_insights', {}).get('value_proposition', '')
'ai_summary': ai_insights.get('content_quality_insights', {}).get('value_proposition', 'Content analysis completed.')
}
def _identify_key_strengths(self, category_scores: Dict[str, int]) -> List[str]:

View File

@@ -84,14 +84,14 @@ class BlogSEOMetadataGenerator:
raise e
def _extract_keywords_from_research(self, research_data: Dict[str, Any]) -> Dict[str, Any]:
"""Extract keywords and context from research data"""
"""Extract keywords and context from research data, including competitor analysis and content gaps."""
try:
keyword_analysis = research_data.get('keyword_analysis', {})
# Handle both 'semantic' and 'semantic_keywords' field names
semantic_keywords = keyword_analysis.get('semantic', []) or keyword_analysis.get('semantic_keywords', [])
return {
result = {
'primary_keywords': keyword_analysis.get('primary', []),
'long_tail_keywords': keyword_analysis.get('long_tail', []),
'semantic_keywords': semantic_keywords,
@@ -100,6 +100,30 @@ class BlogSEOMetadataGenerator:
'target_audience': research_data.get('target_audience', 'general'),
'industry': research_data.get('industry', 'general')
}
# Extract competitor analysis context
competitor_analysis = research_data.get('competitor_analysis', {})
if competitor_analysis:
result['content_gaps'] = competitor_analysis.get('content_gaps', [])
result['industry_leaders'] = competitor_analysis.get('industry_leaders', [])
result['opportunities'] = competitor_analysis.get('opportunities', [])
result['competitive_advantages'] = competitor_analysis.get('competitive_advantages', [])
else:
result['content_gaps'] = []
result['industry_leaders'] = []
result['opportunities'] = []
result['competitive_advantages'] = []
# Extract search queries
search_queries = research_data.get('search_queries', [])
result['search_queries'] = search_queries if isinstance(search_queries, list) else []
# Extract suggested angles
suggested_angles = research_data.get('suggested_angles', [])
result['suggested_angles'] = suggested_angles if isinstance(suggested_angles, list) else []
return result
except Exception as e:
logger.error(f"Failed to extract keywords from research: {e}")
return {
@@ -109,7 +133,13 @@ class BlogSEOMetadataGenerator:
'all_keywords': [],
'search_intent': 'informational',
'target_audience': 'general',
'industry': 'general'
'industry': 'general',
'content_gaps': [],
'industry_leaders': [],
'opportunities': [],
'competitive_advantages': [],
'search_queries': [],
'suggested_angles': []
}
async def _generate_core_metadata(
@@ -194,18 +224,20 @@ class BlogSEOMetadataGenerator:
# Check if we got a valid response
if not ai_response or not isinstance(ai_response, dict):
logger.error("Core metadata generation failed: Invalid response from LLM")
# Return fallback response
primary_keywords = ', '.join(keywords_data.get('primary_keywords', ['content']))
# Return fallback response using content-derived values
primary_kw = keywords_data.get('primary_keywords', ['content'])
primary_kw_first = primary_kw[0] if primary_kw else 'content'
word_count = len(blog_content.split())
slug = re.sub(r'[^a-z0-9]+', '-', blog_title.lower())[:50].strip('-')
return {
'seo_title': blog_title,
'meta_description': f'Learn about {primary_keywords.split(", ")[0] if primary_keywords else "this topic"}.',
'url_slug': blog_title.lower().replace(' ', '-').replace(':', '').replace(',', '')[:50],
'blog_tags': primary_keywords.split(', ') if primary_keywords else ['content'],
'blog_categories': ['Content Marketing', 'Technology'],
'social_hashtags': ['#content', '#marketing', '#technology'],
'meta_description': f'Discover insights about {primary_kw_first}. Comprehensive guide with practical tips and expert analysis.',
'url_slug': slug,
'blog_tags': primary_kw[:5] if isinstance(primary_kw, list) else [primary_kw_first],
'blog_categories': [primary_kw_first.title(), 'Guide'],
'social_hashtags': [f'#{primary_kw_first.replace(" ", "")}', '#guide', '#tips'],
'reading_time': max(1, word_count // 200),
'focus_keyword': primary_keywords.split(', ')[0] if primary_keywords else 'content'
'focus_keyword': primary_kw_first
}
logger.info(f"Core metadata generation completed. Response keys: {list(ai_response.keys())}")
@@ -302,36 +334,41 @@ class BlogSEOMetadataGenerator:
# Check if we got a valid response
if not ai_response or not isinstance(ai_response, dict) or not ai_response.get('open_graph') or not ai_response.get('twitter_card') or not ai_response.get('json_ld_schema'):
logger.error("Social metadata generation failed: Invalid or empty response from LLM")
# Return fallback response
# Return fallback response using content-derived values
primary_kw = keywords_data.get('primary_keywords', ['content'])
primary_kw_first = primary_kw[0] if primary_kw else 'content'
slug = re.sub(r'[^a-z0-9]+', '-', blog_title.lower())[:50].strip('-')
word_count = len(blog_content.split())
current_date = datetime.now().isoformat()
return {
'open_graph': {
'title': blog_title,
'description': f'Learn about {keywords_data.get("primary_keywords", ["this topic"])[0] if keywords_data.get("primary_keywords") else "this topic"}.',
'image': 'https://example.com/image.jpg',
'description': f'Discover insights about {primary_kw_first}. Comprehensive guide with practical tips.',
'image': '',
'type': 'article',
'site_name': 'Your Website',
'url': 'https://example.com/blog'
'site_name': '',
'url': f'https://example.com/blog/{slug}'
},
'twitter_card': {
'card': 'summary_large_image',
'title': blog_title,
'description': f'Learn about {keywords_data.get("primary_keywords", ["this topic"])[0] if keywords_data.get("primary_keywords") else "this topic"}.',
'image': 'https://example.com/image.jpg',
'site': '@yourwebsite',
'creator': '@author'
'description': f'Explore our guide on {primary_kw_first}.',
'image': '',
'site': '',
'creator': ''
},
'json_ld_schema': {
'@context': 'https://schema.org',
'@type': 'Article',
'headline': blog_title,
'description': f'Learn about {keywords_data.get("primary_keywords", ["this topic"])[0] if keywords_data.get("primary_keywords") else "this topic"}.',
'author': {'@type': 'Person', 'name': 'Author Name'},
'publisher': {'@type': 'Organization', 'name': 'Your Website'},
'datePublished': '2025-01-01T00:00:00Z',
'dateModified': '2025-01-01T00:00:00Z',
'mainEntityOfPage': 'https://example.com/blog',
'keywords': keywords_data.get('primary_keywords', ['content']),
'wordCount': len(blog_content.split())
'description': f'Comprehensive guide about {primary_kw_first}.',
'author': {'@type': 'Person', 'name': ''},
'publisher': {'@type': 'Organization', 'name': ''},
'datePublished': current_date,
'dateModified': current_date,
'mainEntityOfPage': f'https://example.com/blog/{slug}',
'keywords': primary_kw[:5] if isinstance(primary_kw, list) else [primary_kw_first],
'wordCount': word_count
}
}
@@ -408,21 +445,53 @@ OUTLINE STRUCTURE:
- Content hierarchy: Well-structured with {len(outline)} main sections
"""
# Extract SEO analysis insights
# Extract SEO analysis insights with weakness-aware guidance
seo_context = ""
if seo_analysis:
overall_score = seo_analysis.get('overall_score', seo_analysis.get('seo_score', 0))
category_scores = seo_analysis.get('category_scores', {})
applied_recs = seo_analysis.get('applied_recommendations', [])
applied_recs = seo_analysis.get('applied_recommendations') or []
# Build weakness-specific guidance for metadata
weakness_guidance = []
kw_score = category_scores.get('keywords', category_scores.get('Keywords', 0))
if kw_score < 70:
weakness_guidance.append("Keyword optimization is weak — ensure title and description prominently feature primary keywords")
read_score = category_scores.get('readability', category_scores.get('Readability', 0))
if read_score < 70:
weakness_guidance.append("Readability needs improvement — use clear, accessible language in the meta description")
struct_score = category_scores.get('structure', category_scores.get('Structure', 0))
if struct_score < 70:
weakness_guidance.append("Content structure needs improvement — the title should clearly signal the content structure")
seo_context = f"""
SEO ANALYSIS RESULTS:
- Overall SEO Score: {overall_score}/100
- Category Scores: Structure {category_scores.get('structure', category_scores.get('Structure', 0))}, Keywords {category_scores.get('keywords', category_scores.get('Keywords', 0))}, Readability {category_scores.get('readability', category_scores.get('Readability', 0))}
- Category Scores: Structure {struct_score}, Keywords {kw_score}, Readability {read_score}
- Applied Recommendations: {len(applied_recs)} SEO optimizations have been applied
- Content Quality: Optimized for search engines with keyword focus
{f"- WEAKNESS GUIDANCE: {'; '.join(weakness_guidance)}" if weakness_guidance else ""}
"""
# Build research context block
research_block = ""
content_gaps = keywords_data.get('content_gaps', [])
competitive_advantages = keywords_data.get('competitive_advantages', [])
search_queries = keywords_data.get('search_queries', [])
suggested_angles = keywords_data.get('suggested_angles', [])
industry_leaders = keywords_data.get('industry_leaders', [])
if content_gaps:
research_block += f"\nCONTENT GAPS (from competitor analysis): {', '.join(content_gaps[:5])}"
if competitive_advantages:
research_block += f"\nOUR KEY DIFFERENTIATORS: {', '.join(competitive_advantages[:3])}"
if search_queries:
research_block += f"\nORIGINAL SEARCH QUERIES: {', '.join(search_queries[:5])}"
if suggested_angles:
research_block += f"\nCONTENT ANGLES: {', '.join(suggested_angles[:3])}"
if industry_leaders:
research_block += f"\nINDUSTRY LEADERS: {', '.join(industry_leaders[:3])}"
# Get more content context (key sections instead of just first 1000 chars)
content_preview = self._extract_content_highlights(blog_content)
@@ -443,6 +512,7 @@ SEMANTIC KEYWORDS: {semantic_keywords}
SEARCH INTENT: {search_intent}
TARGET AUDIENCE: {target_audience}
INDUSTRY: {industry}
{research_block}
{seo_context}
@@ -525,6 +595,18 @@ Generate metadata that is personalized, compelling, and SEO-optimized.
overall_score = seo_analysis.get('overall_score', seo_analysis.get('seo_score', 0))
seo_context = f"\nSEO SCORE: {overall_score}/100 (optimized content)\n"
# Build research context for social metadata
research_block = ""
content_gaps = keywords_data.get('content_gaps', [])
competitive_advantages = keywords_data.get('competitive_advantages', [])
search_queries = keywords_data.get('search_queries', [])
if content_gaps:
research_block += f"\nCONTENT GAPS: {', '.join(content_gaps[:3])}"
if competitive_advantages:
research_block += f"\nDIFFERENTIATORS: {', '.join(competitive_advantages[:3])}"
if search_queries:
research_block += f"\nSEARCH QUERIES: {', '.join(search_queries[:4])}"
content_preview = self._extract_content_highlights(blog_content, 1500)
prompt = f"""
@@ -539,6 +621,7 @@ KEYWORDS: {primary_keywords}
TARGET AUDIENCE: {target_audience}
INDUSTRY: {industry}
CURRENT DATE: {current_date}
{research_block}
=== GENERATION REQUIREMENTS ===
@@ -551,20 +634,20 @@ CURRENT DATE: {current_date}
- url: Generate canonical URL structure
2. TWITTER CARD:
- card: "summary_large_image"
- title: 70 chars max, optimized for Twitter audience
- description: 200 chars max with relevant hashtags inline
- image: Match Open Graph image
- site: @yourwebsite (placeholder, user should update)
- creator: @author (placeholder, user should update)
- card: "summary_large_image"
- title: 70 chars max, optimized for Twitter audience
- description: 200 chars max with relevant hashtags inline
- image: Match Open Graph image
- site: Leave empty string (user will add their Twitter handle)
- creator: Leave empty string (user will add author Twitter handle)
3. JSON-LD SCHEMA (Article):
- @context: "https://schema.org"
- @type: "Article"
- headline: Article title (optimized)
- description: Article description (150-200 chars)
- author: {{"@type": "Person", "name": "Author Name"}} (placeholder)
- publisher: {{"@type": "Organization", "name": "Site Name", "logo": {{"@type": "ImageObject", "url": "logo-url"}}}}
- @context: "https://schema.org"
- @type: "Article"
- headline: Article title (optimized)
- description: Article description (150-200 chars)
- author: {{"@type": "Person", "name": ""}} (leave empty, user will add author name)
- publisher: {{"@type": "Organization", "name": ""}} (leave empty, user will add site name)
- datePublished: {current_date}
- dateModified: {current_date}
- mainEntityOfPage: {{"@type": "WebPage", "@id": "canonical-url"}}
@@ -633,35 +716,109 @@ Make it engaging, personalized for {target_audience}, and optimized for {industr
raise e
def _calculate_optimization_score(self, core_metadata: Dict[str, Any], social_metadata: Dict[str, Any]) -> int:
"""Calculate overall optimization score for the generated metadata"""
"""Calculate metadata quality score based on content relevance and adherence to best practices.
Unlike the old completeness-based score (which just checked field existence),
this assigns quality-weighted points based on how well each field is optimized.
"""
try:
score = 0
# Check core metadata completeness
if core_metadata.get('seo_title'):
score += 15
if core_metadata.get('meta_description'):
score += 15
if core_metadata.get('url_slug'):
score += 10
if core_metadata.get('blog_tags'):
score += 10
if core_metadata.get('blog_categories'):
score += 10
if core_metadata.get('social_hashtags'):
score += 10
if core_metadata.get('focus_keyword'):
score += 10
# Title quality (0-15): Length in 50-60 chars is optimal
seo_title = core_metadata.get('seo_title', '')
if seo_title:
title_len = len(seo_title)
if 50 <= title_len <= 60:
score += 15
elif 40 <= title_len <= 70:
score += 10
elif title_len > 0:
score += 5
# Check social metadata completeness
if social_metadata.get('open_graph'):
# Meta description quality (0-15): Length in 150-160 chars is optimal, has CTA
meta_desc = core_metadata.get('meta_description', '')
if meta_desc:
desc_len = len(meta_desc)
desc_lower = meta_desc.lower()
has_cta = any(phrase in desc_lower for phrase in ['learn', 'discover', 'find', 'get', 'explore', 'how to', 'why', 'tips', 'guide', 'try', 'start'])
if 150 <= desc_len <= 160 and has_cta:
score += 15
elif 120 <= desc_len <= 170:
score += 10 if has_cta else 7
elif desc_len > 0:
score += 4
# URL slug quality (0-10): Short, keyword-rich, no stop words
url_slug = core_metadata.get('url_slug', '')
if url_slug:
slug_parts = url_slug.strip('/').split('/')
slug_words = slug_parts[-1].split('-') if slug_parts else []
if 2 <= len(slug_words) <= 5:
score += 10
elif len(slug_words) > 0:
score += 5
# Tags and categories quality (0-20)
blog_tags = core_metadata.get('blog_tags', [])
blog_categories = core_metadata.get('blog_categories', [])
if blog_tags and len(blog_tags) >= 3:
score += 10
if social_metadata.get('twitter_card'):
elif blog_tags:
score += 5
if social_metadata.get('json_ld_schema'):
if blog_categories and len(blog_categories) >= 1:
score += 10
elif blog_categories:
score += 5
return min(score, 100) # Cap at 100
# Social hashtags (0-10): Relevant and non-spammy
social_hashtags = core_metadata.get('social_hashtags', [])
if social_hashtags and 3 <= len(social_hashtags) <= 8:
score += 10
elif social_hashtags:
score += 5
# Focus keyword (0-10): Present and relevant
focus_keyword = core_metadata.get('focus_keyword', '')
if focus_keyword and seo_title and focus_keyword.lower() in seo_title.lower():
score += 10
elif focus_keyword:
score += 4
# Open Graph quality (0-10): Has title, description, correct type
og = social_metadata.get('open_graph', {})
if og:
og_score = 0
if og.get('title') and len(og.get('title', '')) > 10:
og_score += 4
if og.get('description') and 100 <= len(og.get('description', '')) <= 200:
og_score += 4
if og.get('type') == 'article':
og_score += 2
score += og_score
# Twitter Card quality (0-5)
twitter = social_metadata.get('twitter_card', {})
if twitter:
tw_score = 0
if twitter.get('title') and len(twitter.get('title', '')) > 10:
tw_score += 3
if twitter.get('card') == 'summary_large_image':
tw_score += 2
score += tw_score
# JSON-LD quality (0-5): Has headline, description, datePublished
json_ld = social_metadata.get('json_ld_schema', {})
if json_ld:
jl_score = 0
if json_ld.get('headline'):
jl_score += 2
if json_ld.get('description'):
jl_score += 2
if json_ld.get('datePublished'):
jl_score += 1
score += jl_score
return min(score, 100)
except Exception as e:
logger.error(f"Failed to calculate optimization score: {e}")

View File

@@ -2,6 +2,13 @@
Applies actionable SEO recommendations to existing blog content using the
provider-agnostic `llm_text_gen` dispatcher. Ensures GPT_PROVIDER parity.
Key design principles:
- Make TARGETED edits, not full rewrites
- Preserve existing content structure and factual claims
- Only modify sections that have applicable recommendations
- Never fabricate statistics, case studies, or citations
- Ground changes in research sources when available
"""
import asyncio
@@ -15,7 +22,7 @@ logger = get_service_logger("blog_seo_recommendation_applier")
class BlogSEORecommendationApplier:
"""Apply actionable SEO recommendations to blog content."""
"""Apply actionable SEO recommendations to blog content with targeted edits."""
def __init__(self):
logger.debug("Initialized BlogSEORecommendationApplier")
@@ -35,6 +42,7 @@ class BlogSEORecommendationApplier:
persona = payload.get("persona", {})
tone = payload.get("tone")
audience = payload.get("audience")
competitive_advantage = payload.get("competitive_advantage", "")
if not sections:
return {"success": False, "error": "No sections provided for recommendation application"}
@@ -43,16 +51,21 @@ class BlogSEORecommendationApplier:
logger.warning("apply_recommendations called without recommendations")
return {"success": True, "title": title, "sections": sections, "applied": []}
# Determine which sections actually need changes based on recommendations
sections_to_edit = self._identify_affected_sections(sections, recommendations)
prompt = self._build_prompt(
title=title,
introduction=introduction,
sections=sections,
sections_to_edit=sections_to_edit,
outline=outline,
research=research,
recommendations=recommendations,
persona=persona,
tone=tone,
audience=audience,
competitive_advantage=competitive_advantage,
)
schema = {
@@ -87,14 +100,14 @@ class BlogSEORecommendationApplier:
"required": ["sections"],
}
logger.info("Applying SEO recommendations via llm_text_gen")
logger.info("Applying SEO recommendations via llm_text_gen (targeted edit mode)")
result = await asyncio.to_thread(
llm_text_gen,
prompt,
None,
schema,
user_id, # Pass user_id for subscription checking
user_id,
max_tokens=8192,
)
@@ -106,14 +119,12 @@ class BlogSEORecommendationApplier:
raw_sections = result.get("sections", []) or []
normalized_sections: List[Dict[str, Any]] = []
# Warn if LLM returned different number of sections (may miss intro/conclusion added as new sections)
if len(raw_sections) != len(sections):
logger.warning(
f"LLM returned {len(raw_sections)} sections but {len(sections)} were sent. "
"Extra sections will be ignored; missing sections fall back to original content."
)
# Build lookup table from updated sections using their identifiers
updated_map: Dict[str, Dict[str, Any]] = {}
for updated in raw_sections:
section_id = str(
@@ -156,7 +167,6 @@ class BlogSEORecommendationApplier:
mapped = updated_map.get(fallback_id)
if not mapped and raw_sections:
# Fall back to positional match if identifier lookup failed
candidate = raw_sections[index] if index < len(raw_sections) else {}
heading = (
candidate.get("heading")
@@ -176,7 +186,6 @@ class BlogSEORecommendationApplier:
}
if not mapped:
# Fallback to original content if nothing else available
mapped = {
"id": fallback_id,
"heading": original.get("heading") or original.get("title") or f"Section {index + 1}",
@@ -190,12 +199,11 @@ class BlogSEORecommendationApplier:
logger.info("SEO recommendations applied successfully")
# Extract updated introduction from LLM response if available
updated_introduction = result.get("introduction") or ""
if updated_introduction and updated_introduction != introduction:
logger.info(f"Introduction updated: {len(updated_introduction)} chars")
elif not updated_introduction:
updated_introduction = introduction # fall back to original
updated_introduction = introduction
return {
"success": True,
@@ -205,37 +213,133 @@ class BlogSEORecommendationApplier:
"applied": applied,
}
def _identify_affected_sections(self, sections: List[Dict[str, Any]], recommendations: List[Dict[str, Any]]) -> List[str]:
"""Identify which section IDs are likely affected by the recommendations.
Maps recommendation categories to section headings for targeted editing.
Returns a list of section IDs that should be edited.
"""
affected_ids = set()
for rec in recommendations:
category = (rec.get("category") or "").lower()
rec_text = (rec.get("recommendation") or "").lower()
# Structure recommendations affect first/last sections or all sections
if category == "structure":
if sections:
affected_ids.add(str(sections[0].get("id", "section_1")))
affected_ids.add(str(sections[-1].get("id", f"section_{len(sections)}")))
# "Add more sections" or "too many sections" affects all
if "more section" in rec_text or "combine" in rec_text or "flow" in rec_text:
for s in sections:
affected_ids.add(str(s.get("id", "")))
continue
# Keyword recommendations affect all sections (keywords should be spread)
if category == "keywords":
for s in sections:
affected_ids.add(str(s.get("id", "")))
continue
# Readability affects all sections
if category == "readability":
for s in sections:
affected_ids.add(str(s.get("id", "")))
continue
# Content quality — try to match recommendation to specific section headings
if category in ("content quality", "content", "seo"):
heading_keywords = {
s.get("heading", "").lower(): str(s.get("id", ""))
for s in sections
}
matched = False
for heading_lower, section_id in heading_keywords.items():
rec_words = rec_text.split()
if any(word in heading_lower for word in rec_words if len(word) > 3):
affected_ids.add(section_id)
matched = True
if not matched:
# Affect first and last sections (intro/conclusion) as common targets
if sections:
affected_ids.add(str(sections[0].get("id", "section_1")))
affected_ids.add(str(sections[-1].get("id", f"section_{len(sections)}")))
# Filter out empty IDs and return
return [sid for sid in affected_ids if sid]
def _build_prompt(
self,
*,
title: str,
introduction: str,
sections: List[Dict[str, Any]],
sections_to_edit: List[str],
outline: List[Dict[str, Any]],
research: Dict[str, Any],
recommendations: List[Dict[str, Any]],
persona: Dict[str, Any],
tone: str | None,
audience: str | None,
competitive_advantage: str = "",
) -> str:
"""Construct prompt for applying recommendations."""
"""Construct prompt for applying targeted recommendations."""
sections_str = []
# Build research context block
research_block = ""
keyword_analysis = research.get("keyword_analysis", {}) if research else {}
primary_keywords = ", ".join(keyword_analysis.get("primary", [])[:8]) or "None"
competitor_analysis = research.get("competitor_analysis", {}) if research else {}
search_queries = research.get("search_queries", []) if research else []
suggested_angles = research.get("suggested_angles", []) if research else []
content_gaps = competitor_analysis.get("content_gaps", []) if competitor_analysis else []
competitive_advantages = competitor_analysis.get("competitive_advantages", []) if competitor_analysis else []
research_block += f"\nPRIMARY KEYWORDS: {primary_keywords}"
if content_gaps:
research_block += f"\nCONTENT GAPS (address these in your edits): {', '.join(content_gaps[:5])}"
if competitive_advantages:
research_block += f"\nKEY DIFFERENTIATORS (emphasize these): {', '.join(competitive_advantages[:3])}"
if competitive_advantage:
research_block += f"\nPRIMARY ADVANTAGE: {competitive_advantage}"
if search_queries:
research_block += f"\nTARGET SEARCH QUERIES: {', '.join(search_queries[:5])}"
if suggested_angles:
research_block += f"\nCONTENT ANGLES: {', '.join(suggested_angles[:3])}"
# Build per-section content with edit markers
sections_content = []
for section in sections:
sections_str.append(
f"ID: {section.get('id', 'section')}, Heading: {section.get('heading', 'Untitled')}\n"
f"Current Content:\n{section.get('content', '')}\n"
)
section_id = str(section.get("id", "section"))
heading = section.get("heading", "Untitled")
content = section.get("content", "")
needs_edit = section_id in sections_to_edit
section_text = f"--- SECTION (ID: {section_id}, Heading: \"{heading}\")"
if needs_edit:
section_text += " [NEEDS EDITS based on recommendations]"
else:
section_text += " [KEEP AS-IS - no changes needed]"
section_text += f" ---\n{content}\n"
sections_content.append(section_text)
sections_str = "\n\n".join(sections_content)
outline_str = "\n".join(
[
f"- {item.get('heading', 'Section')} (Target words: {item.get('target_words', 'N/A')})"
for item in outline
]
)
research_summary = research.get("keyword_analysis", {}) if research else {}
primary_keywords = ", ".join(research_summary.get("primary", [])[:10]) or "None"
# Build outline with subheadings and key points
outline_parts = []
for item in outline:
heading = item.get("heading", "Section")
target_words = item.get("target_words", "N/A")
subheadings = item.get("subheadings", [])
key_points = item.get("key_points", [])
line = f"- {heading} (Target: {target_words} words)"
if subheadings:
line += f" | Subheadings: {', '.join(subheadings[:4])}"
if key_points:
line += f" | Key points: {', '.join(key_points[:4])}"
outline_parts.append(line)
outline_str = "\n".join(outline_parts) if outline_parts else "No outline supplied"
recommendations_str = []
for rec in recommendations:
@@ -248,7 +352,7 @@ class BlogSEORecommendationApplier:
persona_str = (
f"Persona: {persona}\n"
if persona
else "Persona: (not provided)\n"
else ""
)
style_guidance = []
@@ -258,44 +362,47 @@ class BlogSEORecommendationApplier:
style_guidance.append(f"Target audience: {audience}")
style_str = "\n".join(style_guidance) if style_guidance else "Maintain current tone and audience alignment."
prompt = f"""
You are an expert SEO content strategist. Update the blog content to apply the actionable recommendations.
intro_text = introduction if introduction else "(No introduction currently — write one ONLY if a recommendation specifically asks for it)"
Current Title: {title}
prompt = f"""You are a careful SEO content editor making TARGETED edits to an existing blog post. Your job is to apply specific SEO recommendations with PRECISION — not to rewrite the entire post.
Current Introduction:
{introduction if introduction else '(No introduction exists — write a compelling one if the recommendations require it)'}
CRITICAL RULES — YOU MUST FOLLOW THESE:
1. PRESERVE existing content. Only make MINIMAL, targeted changes to address specific recommendations. Do NOT rewrite sections that are working well.
2. NEVER fabricate statistics, case studies, expert quotes, research data, or specific numbers unless they are explicitly stated in the research context below.
3. NEVER add content that contradicts or goes beyond what the research sources support.
4. KEEP the same emotional tone and writing style as the original content.
5. Return EXACTLY the same number of sections with EXACTLY the same IDs. Do NOT add, remove, or rename sections.
6. For sections marked [KEEP AS-IS], return the content UNCHANGED — copy it verbatim.
7. For sections marked [NEEDS EDITS], make ONLY the specific changes needed to address the applicable recommendations.
8. Do NOT add introductions, conclusions, or case studies unless a recommendation EXPLICITLY asks for one.
Primary Keywords (for context): {primary_keywords}
{research_block}
Outline Overview:
{outline_str or 'No outline supplied'}
PLANNED OUTLINE STRUCTURE:
{outline_str}
Existing Sections:
{''.join(sections_str)}
CURRENT TITLE: {title}
Actionable Recommendations to Apply:
CURRENT INTRODUCTION:
{intro_text}
CURRENT SECTIONS:
{sections_str}
RECOMMENDATIONS TO APPLY:
{''.join(recommendations_str)}
{persona_str}{style_str}
{persona_str}
{style_str}
Instructions:
1. Carefully apply the recommendations while preserving factual accuracy and research alignment.
2. You MUST return EXACTLY the same number of sections, with EXACTLY the same IDs as provided above. Do NOT add or remove sections.
3. If a recommendation says content is MISSING (e.g. missing introduction or conclusion), incorporate that missing content into the MOST APPROPRIATE existing section:
- Missing introduction → PREPEND introductory content to the FIRST section's existing content.
- Missing conclusion → APPEND concluding content to the LAST section's existing content.
- For other missing content, add it to the section whose heading best matches the recommendation.
4. Additionally, if an introduction is missing or weak, write a compelling introduction in the "introduction" field of your response. If the current introduction is adequate, return it unchanged.
5. Improve clarity, flow, and SEO optimization per the guidance.
6. Return updated sections in the requested JSON format.
7. Provide a short summary of which recommendations were addressed.
INSTRUCTIONS:
- For sections marked [KEEP AS-IS]: Copy the content EXACTLY as provided. Do not change a single word.
- For sections marked [NEEDS EDITS]: Make the MINIMUM changes needed to address the recommendations. If a recommendation says "add transition words", add 2-3 transitions — do not rewrite the paragraph. If it says "use more varied vocabulary", replace 2-3 repetitive words — do not rewrite the section.
- If a recommendation asks for an introduction and none exists, write a brief 2-3 sentence introduction that naturally leads into the first section. Do NOT fabricate hooks or statistics.
- If a recommendation asks for a conclusion, append 2-3 sentences summarizing key takeaways to the LAST section. Do NOT fabricate conclusions that don't follow from the actual content.
- Return ALL sections, including the ones you did NOT change.
- Provide a summary of which recommendations you addressed and what specific changes you made.
"""
return prompt
__all__ = ["BlogSEORecommendationApplier"]
__all__ = ["BlogSEORecommendationApplier"]

View File

@@ -66,19 +66,20 @@ class WixAuthService:
response.raise_for_status()
return response.json()
def get_site_info(self, access_token: str, meta_site_id: Optional[str] = None) -> Dict[str, Any]:
def get_site_info(self, access_token: str) -> Dict[str, Any]:
headers = {
'Authorization': f'Bearer {access_token}',
'Content-Type': 'application/json',
}
if self.client_id:
headers['wix-client-id'] = self.client_id
if meta_site_id:
headers['wix-site-id'] = meta_site_id
response = requests.get(f"{self.base_url}/sites/v1/site", headers=headers)
if response.status_code == 404:
logger.warning("Wix site info not found (404) — user may not have a published site or token lacks sites scope")
return {"_no_site": True, "error": "No Wix site found for this account"}
if response.status_code == 401:
logger.warning("Wix site info request unauthorized (401) — token expired or invalid")
return {"_auth_failed": True, "error": "Token expired or invalid — reconnect required"}
response.raise_for_status()
return response.json()

View File

@@ -3,6 +3,7 @@ import requests
from loguru import logger
from .retry import wix_api_call_with_retry, WixAPIError
from .auth_utils import get_wix_headers
class WixBlogService:
@@ -14,40 +15,7 @@ class WixBlogService:
def headers(self, access_token: str, extra: Optional[Dict[str, str]] = None) -> Dict[str, str]:
"""Build headers with automatic token type detection."""
h: Dict[str, str] = {
'Content-Type': 'application/json',
}
if access_token:
# Normalize token to string if needed
if not isinstance(access_token, str):
from .utils import normalize_token_string
normalized = normalize_token_string(access_token)
if normalized:
access_token = normalized
else:
access_token = str(access_token)
token = access_token.strip()
if token:
if token.startswith('OauthNG.JWS.'):
h['Authorization'] = f'Bearer {token}'
logger.debug("Using Wix OAuth token with Bearer prefix (OauthNG.JWS. format detected)")
elif token.startswith('IST.'):
h['Authorization'] = token
logger.debug("Using Wix API key for authorization (IST. format detected)")
elif token.count('.') == 2:
h['Authorization'] = f'Bearer {token}'
logger.debug("Using OAuth Bearer token for authorization (JWT: 2 dots)")
else:
h['Authorization'] = token
logger.debug("Using token as-is for authorization")
if self.client_id:
h['wix-client-id'] = self.client_id
if extra:
h.update(extra)
return h
return get_wix_headers(access_token, client_id=self.client_id, extra=extra)
def create_draft_post(self, access_token: str, payload: Dict[str, Any], extra_headers: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
"""Create draft post with retry logic and consolidated logging."""
@@ -144,9 +112,9 @@ class WixBlogService:
"""Create a blog tag with retry logic."""
url = f"{self.base_url}/blog/v3/tags"
headers = self.headers(access_token, extra_headers)
payload: Dict[str, Any] = {'label': label, 'fieldsets': ['URL']}
payload: Dict[str, Any] = {'tag': {'label': label}, 'fieldsets': ['URL']}
if language:
payload['language'] = language
payload['tag']['language'] = language
try:
return wix_api_call_with_retry('POST', url, headers, json_payload=payload, max_attempts=3)

View File

@@ -171,6 +171,16 @@ def validate_ricos_content(ricos_content: Dict[str, Any]) -> Dict[str, Any]:
return ricos_content
_UUID_RE = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
def _looks_like_uuid(value: str) -> bool:
try:
uuid.UUID(value)
return True
except (ValueError, AttributeError):
return bool(_UUID_RE.match(value))
def validate_payload_no_none(obj, path=""):
"""Recursively validate that no None values exist in the payload"""
if obj is None:
@@ -224,6 +234,7 @@ def create_blog_post(
"""
# ===== PRE-FLIGHT VALIDATION =====
errors = []
warnings = []
if not member_id:
errors.append("memberId is required for third-party apps creating blog posts")
@@ -279,6 +290,18 @@ def create_blog_post(
except Exception:
pass
# Add wix-site-id to headers for all API calls (categories, tags, draft post)
resolved_site_id = site_id or meta_site_id or os.getenv('WIX_SITE_ID')
if resolved_site_id:
headers['wix-site-id'] = resolved_site_id
logger.info(f"Using wix-site-id: {resolved_site_id[:8]}... (source: {'param' if site_id else 'token' if meta_site_id else 'env'})")
else:
token_str = str(access_token)
if token_str.startswith('IST.'):
logger.error("IST. API key requires WIX_SITE_ID environment variable or site_id parameter.")
else:
logger.warning("No wix-site-id found — API calls may fail if token requires it")
# Quick permission test (only log failures)
try:
test_headers = get_wix_headers(access_token)
@@ -296,14 +319,34 @@ def create_blog_post(
# Convert markdown to Ricos
# PRIMARY: Use Wix Ricos Documents API for best formatting support (tables, complex markdown, etc.)
# FALLBACK: Use custom parser if Wix API fails
# FALLBACK: Use custom parser if Wix API fails (no length limit, handles tables natively)
has_table = bool(re.search(r'^\|.*\|', content, re.MULTILINE))
# Pre-check: Wix Ricos API has a 10,000 character limit for HTML input.
# Estimate HTML length from markdown (~1.4x expansion) to avoid silent truncation.
# If HTML would exceed limit, skip Wix API and use custom parser.
use_wix_api = True
MAX_HTML_LIMIT = 9800
estimated_html_len = len(content) * 1.4
if estimated_html_len > MAX_HTML_LIMIT:
logger.warning(f"Content too long for Wix Ricos API (est. HTML: {estimated_html_len:.0f} > {MAX_HTML_LIMIT}) — using custom parser")
use_wix_api = False
ricos_content = None
try:
logger.info("Converting markdown via Wix Ricos Documents API...")
ricos_content = convert_via_wix_api(content, access_token, base_url)
logger.info(f"Wix API conversion succeeded: {len(ricos_content.get('nodes', []))} nodes")
except Exception as e:
logger.warning(f"Wix API conversion failed, falling back to custom parser: {e}")
if use_wix_api:
try:
logger.info("Converting markdown via Wix Ricos Documents API...")
ricos_content = convert_via_wix_api(content, access_token, base_url)
logger.info(f"Wix API conversion succeeded: {len(ricos_content.get('nodes', []))} nodes")
except Exception as e:
logger.warning(f"Wix API conversion failed, falling back to custom parser: {e}")
# If markdown had tables and Wix API didn't produce TABLE nodes, fall back to custom parser
if has_table and ricos_content:
node_types = [n.get('type', '') for n in ricos_content.get('nodes', [])]
if 'TABLE' not in node_types:
logger.info("Markdown had tables but Wix API produced no TABLE nodes — using custom parser for table support")
ricos_content = None
if not ricos_content or not isinstance(ricos_content, dict) or 'nodes' not in ricos_content:
logger.info("Using custom markdown parser for Ricos conversion")
@@ -414,44 +457,50 @@ def create_blog_post(
logger.info(f"Cover image imported: {media_id[:16]}...")
else:
logger.warning(f"Cover image import returned no valid media_id (type={type(media_id)}). Continuing without cover image.")
warnings.append("Cover image could not be imported — post published without cover image.")
except Exception as e:
logger.warning(f"Cover image import failed (non-fatal): {e}. Continuing without cover image.")
warnings.append(f"Cover image import failed: {str(e)[:100]}")
# Handle categories - can be either IDs (list of strings) or names (for lookup)
category_ids_to_use = None
if category_ids:
# Check if these are IDs (UUIDs) or names
if isinstance(category_ids, list) and len(category_ids) > 0:
# Assume IDs if first item looks like UUID (has hyphens and is long)
# Use proper UUID detection instead of fragile heuristic
first_item = str(category_ids[0])
if '-' in first_item and len(first_item) > 30:
if _looks_like_uuid(first_item):
category_ids_to_use = category_ids
elif lookup_categories_func:
# These are names, need to lookup/create
extra_headers = {}
if 'wix-site-id' in headers:
extra_headers['wix-site-id'] = headers['wix-site-id']
if resolved_site_id:
extra_headers['wix-site-id'] = resolved_site_id
category_ids_to_use = lookup_categories_func(
access_token, category_ids, extra_headers if extra_headers else None
)
if not category_ids_to_use:
warnings.append(f"Categories could not be created ({len(category_ids)} requested) — OAuth app may lack BLOG.CREATE-DRAFT scope.")
# Handle tags - can be either IDs (list of strings) or names (for lookup)
tag_ids_to_use = None
if tag_ids:
# Check if these are IDs (UUIDs) or names
if isinstance(tag_ids, list) and len(tag_ids) > 0:
# Assume IDs if first item looks like UUID (has hyphens and is long)
# Use proper UUID detection instead of fragile heuristic
first_item = str(tag_ids[0])
if '-' in first_item and len(first_item) > 30:
if _looks_like_uuid(first_item):
tag_ids_to_use = tag_ids
elif lookup_tags_func:
# These are names, need to lookup/create
extra_headers = {}
if 'wix-site-id' in headers:
extra_headers['wix-site-id'] = headers['wix-site-id']
if resolved_site_id:
extra_headers['wix-site-id'] = resolved_site_id
tag_ids_to_use = lookup_tags_func(
access_token, tag_ids, extra_headers if extra_headers else None
)
if not tag_ids_to_use:
warnings.append(f"Tags could not be created ({len(tag_ids)} requested) — OAuth app may lack BLOG scope for tag management.")
# Add categories if we have IDs (must be non-empty list of strings)
# CRITICAL: Wix API rejects empty arrays or arrays with None/empty strings
@@ -491,24 +540,12 @@ def create_blog_post(
logger.debug("No SEO metadata provided to create_blog_post")
try:
# Extract wix-site-id from token, parameter, or env var
extra_headers = {}
wix_site_id = site_id or os.getenv('WIX_SITE_ID')
if not wix_site_id:
from .utils import extract_meta_from_token
meta_info = extract_meta_from_token(access_token)
wix_site_id = meta_info.get('metaSiteId')
# Use wix-site-id already resolved earlier
extra_headers_final = {}
wix_site_id = resolved_site_id
if wix_site_id:
extra_headers['wix-site-id'] = wix_site_id
logger.info(f"Using wix-site-id: {wix_site_id[:8]}... (source: {'param' if site_id else 'env' if os.getenv('WIX_SITE_ID') else 'token'})")
else:
token_str = str(access_token)
if token_str.startswith('IST.'):
logger.error("❌ IST. API key requires WIX_SITE_ID environment variable or site_id parameter. "
"The token's tenant.id is the account ID, not the site ID. "
"Please set WIX_SITE_ID in your .env file to your Wix site's metaSiteId.")
else:
logger.warning("No wix-site-id found — API calls may fail if token requires it")
extra_headers_final['wix-site-id'] = wix_site_id
logger.info(f"Using wix-site-id for draft post: {wix_site_id[:8]}...")
except Exception as e:
logger.debug(f"Could not extract wix-site-id from token: {e}")
@@ -564,13 +601,17 @@ def create_blog_post(
logger.info(f"📤 Publishing to Wix: title='{blog_data['draftPost'].get('title', '')}', "
f"nodes={len(rc.get('nodes', []))}")
result = blog_service.create_draft_post(access_token, blog_data, extra_headers or None)
result = blog_service.create_draft_post(access_token, blog_data, extra_headers_final or None)
draft_post = result.get('draftPost', {})
post_id = draft_post.get('id', 'N/A')
wix_logger.log_operation_result("Create Draft Post", True, result)
logger.success(f"✅ Wix: Blog post created - ID: {post_id}")
if warnings:
result['_warnings'] = warnings
logger.info(f"Publish completed with {len(warnings)} warnings: {'; '.join(warnings)}")
return result
except TypeError as e:
import traceback

View File

@@ -192,6 +192,120 @@ def _make_horizontal_rule_node() -> Dict[str, Any]:
}
def _parse_markdown_table(lines: List[str], start_idx: int) -> tuple:
"""
Parse a markdown table starting at start_idx.
Returns (table_rows, alignments, next_idx) where table_rows is a list of lists of cell text,
and alignments is a list of column alignments ('left', 'center', 'right', None).
Markdown tables look like:
| Header 1 | Header 2 |
|----------|----------|
| Cell 1 | Cell 2 |
Alignment is detected from the separator row:
|:--------|:--------:|--------:|
"""
rows = []
alignments = None
i = start_idx
while i < len(lines):
line = lines[i].strip()
if not line or '|' not in line:
break
cells = [cell.strip() for cell in line.strip('|').split('|')]
# Detect separator row (contains only dashes, colons, pipes, spaces)
if i > start_idx and all(
set(cell.strip()) <= set('-:| ') for cell in cells
):
alignments = []
for cell in cells:
cell = cell.strip()
if cell.startswith(':') and cell.endswith(':'):
alignments.append('center')
elif cell.endswith(':'):
alignments.append('right')
elif cell.startswith(':'):
alignments.append('left')
else:
alignments.append(None)
i += 1
continue
rows.append(cells)
i += 1
return rows, alignments or [None] * (len(rows[0]) if rows else 1), i
def _make_table_node(header_row: List[str], body_rows: List[List[str]], alignments: List) -> Dict[str, Any]:
    """Create a Ricos TABLE node with header and body rows, with formatting.

    Args:
        header_row: Cell texts of the first (header) row.
        body_rows: Cell texts of the remaining rows.
        alignments: Per-column alignment ('left', 'center', 'right' or None);
            may be shorter than the number of columns.

    Returns:
        A dict of type 'TABLE' whose child nodes are TABLE_ROW nodes, each
        holding one TABLE_CELL per column. Header cells are rendered bold and
        get a thicker top border.
    """
    all_rows = [header_row] + body_rows
    num_cols = max(len(row) for row in all_rows) if all_rows else 1

    table_rows = []
    for row_idx, row_cells in enumerate(all_rows):
        # Pad short rows with empty cells so every row has exactly `num_cols`
        # cells and agrees with tableData['cols'].
        # NOTE(review): presumably Wix expects rectangular tables — confirm.
        padded_cells = list(row_cells) + [''] * (num_cols - len(row_cells))

        cell_nodes = []
        for col_idx, cell_text in enumerate(padded_cells):
            text_nodes = parse_markdown_inline(cell_text)

            # Bold header row cells
            if row_idx == 0 and text_nodes:
                for node in text_nodes:
                    if node.get('type') == 'TEXT':
                        decs = node['textData'].get('decorations', [])
                        if not any(d.get('type') == 'BOLD' for d in decs if isinstance(d, dict)):
                            # Assign a new list rather than mutating the one
                            # returned by the inline parser.
                            node['textData']['decorations'] = list(decs) + [{'type': 'BOLD'}]

            paragraph_node = {
                'id': str(uuid.uuid4()),
                'type': 'PARAGRAPH',
                # When the inline parser yields nothing, emit a single TEXT
                # node (a space for an empty cell).
                'nodes': text_nodes if text_nodes else [{
                    'id': str(uuid.uuid4()),
                    'type': 'TEXT',
                    'nodes': [],
                    'textData': {'text': cell_text or ' ', 'decorations': []}
                }],
            }

            cell_style = {'verticalAlign': 'top'}
            if row_idx == 0:
                cell_style['borderWidth'] = {'top': 2, 'bottom': 1, 'left': 1, 'right': 1}

            # Apply column alignment
            if alignments and col_idx < len(alignments) and alignments[col_idx]:
                cell_style['textAlign'] = alignments[col_idx]

            cell_nodes.append({
                'id': str(uuid.uuid4()),
                'type': 'TABLE_CELL',
                'nodes': [paragraph_node],
                'tableCellData': {'style': cell_style},
            })

        table_rows.append({
            'id': str(uuid.uuid4()),
            'type': 'TABLE_ROW',
            'nodes': cell_nodes,
        })

    return {
        'id': str(uuid.uuid4()),
        'type': 'TABLE',
        'nodes': table_rows,
        'tableData': {
            'cols': num_cols,
            'rows': len(table_rows),
            'headerRow': 0 if header_row else -1,
        },
    }
def convert_content_to_ricos(content: str, images: List[str] = None) -> Dict[str, Any]:
"""
Convert markdown content into valid Ricos JSON format.
@@ -205,6 +319,7 @@ def convert_content_to_ricos(content: str, images: List[str] = None) -> Dict[str
- Code blocks (```language ... ```)
- Inline images (![alt](url))
- Horizontal rules (---, ***, ___)
- Tables (| Header | Header |)
"""
if not content:
content = "This is a post from ALwrity."
@@ -245,6 +360,16 @@ def convert_content_to_ricos(content: str, images: List[str] = None) -> Dict[str
i += 1
continue
# Markdown tables (lines starting with |)
if stripped.startswith('|') and i + 1 < len(lines) and '|' in lines[i + 1]:
table_rows, alignments, next_idx = _parse_markdown_table(lines, i)
if table_rows and len(table_rows) >= 1:
header_row = table_rows[0]
body_rows = table_rows[1:] if len(table_rows) > 1 else []
nodes.append(_make_table_node(header_row, body_rows, alignments))
i = next_idx
continue
# Headings
if stripped.startswith('#'):
level = len(stripped) - len(stripped.lstrip('#'))
@@ -280,12 +405,11 @@ def convert_content_to_ricos(content: str, images: List[str] = None) -> Dict[str
})
continue
# Unordered lists
# Unordered lists (including task lists)
if (stripped.startswith('- ') or stripped.startswith('* ') or
(stripped.startswith('-') and len(stripped) > 1 and stripped[1] != '-') or
(stripped.startswith('*') and len(stripped) > 1 and stripped[1] != '*')):
list_items = []
list_marker = '- ' if stripped.startswith('-') else '* '
while i < len(lines):
current_line = lines[i].strip()
@@ -323,7 +447,14 @@ def convert_content_to_ricos(content: str, images: List[str] = None) -> Dict[str
list_node_items = []
for item_text in list_items:
text_nodes = parse_markdown_inline(item_text)
# Detect task list items: "- [ ] task" or "- [x] task"
task_match = re.match(r'^\[([ xX])\]\s*(.*)', item_text)
if task_match:
checked = task_match.group(1).lower() == 'x'
prefix = '' if checked else ''
text_nodes = parse_markdown_inline(prefix + task_match.group(2))
else:
text_nodes = parse_markdown_inline(item_text)
paragraph_node = {
'id': str(uuid.uuid4()),
'type': 'PARAGRAPH',
@@ -414,6 +545,7 @@ def convert_content_to_ricos(content: str, images: List[str] = None) -> Dict[str
next_line.startswith('>') or
next_line.startswith('![') or
next_line.startswith('```') or
next_line.startswith('|') or
re.match(r'^(---+|\*\*\*|___+)$', next_line) or
re.match(r'^\d+\.\s+', next_line)):
break

View File

@@ -75,7 +75,10 @@ class WixLogger:
logger.debug(f" Payload: {', '.join(parts)}")
if error_body and status_code >= 400:
error_msg = error_body.get('message', 'Unknown error')
if isinstance(error_body, dict):
error_msg = error_body.get('message', 'Unknown error')
else:
error_msg = str(error_body)
logger.error(f" Error: {error_msg}")
if status_code == 500:
logger.error(" ⚠️ Internal server error - check Wix API status")

View File

@@ -1,17 +1,35 @@
from typing import Any, Dict, Optional
import requests
from urllib.parse import urlparse
from loguru import logger
from .retry import wix_api_call_with_retry, WixAPIError
def _is_valid_image_url(url: str) -> bool:
"""Check if a URL looks like a valid, publicly accessible image URL for Wix import."""
if not url or not isinstance(url, str):
return False
url = url.strip()
if url.startswith('data:'):
return False
parsed = urlparse(url)
if parsed.scheme not in ('http', 'https'):
return False
host = parsed.hostname or ''
if host in ('localhost', '127.0.0.1', 'example.com') or host.endswith('.example.com'):
return False
return True
class WixMediaService:
"""Service for Wix Media Manager operations with retry logic and error handling."""
def __init__(self, base_url: str):
self.base_url = base_url
def import_image(self, access_token: str, image_url: str, display_name: str) -> Optional[Dict[str, Any]]:
def import_image(self, access_token: str, image_url: str, display_name: str,
client_id: Optional[str] = None, site_id: Optional[str] = None) -> Optional[Dict[str, Any]]:
"""
Import external image to Wix Media Manager.
@@ -22,6 +40,8 @@ class WixMediaService:
access_token: Valid access token
image_url: URL of the image to import
display_name: Display name for the image
client_id: Optional Wix client ID for wix-client-id header
site_id: Optional Wix metaSiteId for wix-site-id header
Returns:
Media result dict with 'file' key, or None on failure
@@ -29,10 +49,23 @@ class WixMediaService:
Raises:
WixAPIError: On non-retryable failure or after retries exhausted
"""
if not _is_valid_image_url(image_url):
logger.warning(f"Skipping image import — URL not valid for Wix: {image_url[:80]}...")
return None
logger.info(f"Importing image to Wix: url={image_url[:80]}..., display_name={display_name}")
headers = {
'Authorization': f'Bearer {access_token}',
'Content-Type': 'application/json',
}
if client_id:
headers['wix-client-id'] = client_id
if not site_id:
from .utils import extract_meta_from_token
meta_info = extract_meta_from_token(access_token)
site_id = meta_info.get('metaSiteId')
if site_id:
headers['wix-site-id'] = site_id
payload = {
'url': image_url,
'mediaType': 'IMAGE',

View File

@@ -26,10 +26,6 @@ def build_seo_data(seo_metadata: Dict[str, Any], default_title: str = None) -> O
Wix seoData object with settings.keywords and tags array, or None if empty
"""
seo_data = {
'settings': {
'keywords': [],
'preventAutoRedirect': False # Required by Wix API schema
},
'tags': []
}
@@ -77,11 +73,7 @@ def build_seo_data(seo_metadata: Dict[str, Any], default_title: str = None) -> O
# Keep main keyword + next 4 most important
keywords_list = keywords_list[:5]
seo_data['settings']['keywords'] = keywords_list
# Validate keywords list is not empty (or ensure at least one keyword exists)
if not seo_data['settings']['keywords']:
logger.warning("No keywords found in SEO metadata, adding empty keywords array")
seo_data['settings'] = {'keywords': keywords_list}
# Build tags array (meta tags, Open Graph, etc.)
tags_list = []

View File

@@ -6,6 +6,7 @@ from sqlalchemy.orm import Session
from models.daily_workflow_models import DailyWorkflowPlan, DailyWorkflowTask
from models.agent_activity_models import AgentAlert
from models.content_planning import CalendarEvent, ContentStrategy
from services.agent_activity_service import AgentActivityService, build_agent_event_payload
from services.llm_providers.main_text_generation import llm_text_gen
from services.database import get_all_user_ids, get_session_for_user
@@ -17,6 +18,82 @@ PILLAR_IDS = ["plan", "generate", "publish", "analyze", "engage", "remarket"]
MIN_TASK_EVIDENCE_LINKS = 1
PLAN_CONTEXT_THRESHOLD = 0.65
# Calendar → Workflow mapping
# Pillar id assigned to every workflow task generated from a calendar event.
CALENDAR_CONTENT_PILLAR = "generate"
# Lower-cased platform name → in-app route. Consulted before the content-type
# map when resolving a calendar event's action URL.
_PLATFORM_ACTION_URL = {
    "linkedin": "/linkedin-writer",
    "facebook": "/facebook-writer",
    "twitter": "/twitter-writer",
    "instagram": "/instagram-writer",
    "youtube": "/youtube-writer",
    "tiktok": "/tiktok-writer",
}
# Lower-cased content type → in-app route; used when the event's platform has
# no entry in _PLATFORM_ACTION_URL.
_CONTENT_ACTION_URL = {
    "blog_post": "/blog-writer",
    "linkedin_post": "/linkedin-writer",
    "facebook_post": "/facebook-writer",
    "seo_page": "/seo-dashboard",
    "video": "/video-writer",
}
# Lower-cased content type → estimated effort for the generated task
# (presumably minutes — TODO confirm unit). Unlisted types fall back to 30.
_CONTENT_ESTIMATED_TIME = {
    "blog_post": 45, "linkedin_post": 20, "facebook_post": 15,
    "twitter_post": 10, "instagram_post": 15, "seo_page": 30, "video": 60,
}
def _resolve_calendar_action_url(content_type: str, platform: str) -> Optional[str]:
    """Pick the in-app route for a calendar event.

    The platform map is tried first, then the content-type map. Keys are
    compared after trimming whitespace and lower-casing; ``None`` or empty
    values are treated as an empty key.

    Returns:
        The matching route, or None (after logging a warning) when neither
        map has an entry.
    """
    lookups = (
        (_PLATFORM_ACTION_URL, platform),
        (_CONTENT_ACTION_URL, content_type),
    )
    for route_map, raw_value in lookups:
        key = (raw_value or "").strip().lower()
        if key in route_map:
            return route_map[key]

    logger.warning("No action_url mapping for calendar event content_type={!r} platform={!r}", content_type, platform)
    return None
def _resolve_calendar_estimated_time(content_type: str) -> int:
    """Look up the estimated time for a content type, defaulting to 30.

    The content type is trimmed and lower-cased before the lookup; ``None``
    or empty input resolves to the default.
    """
    normalized_type = (content_type or "").strip().lower()
    if normalized_type in _CONTENT_ESTIMATED_TIME:
        return _CONTENT_ESTIMATED_TIME[normalized_type]
    return 30
def _generate_calendar_event_plan(date: str, grounding: Dict[str, Any]) -> Dict[str, Any]:
    """Turn today's calendar events from *grounding* into workflow tasks.

    Reads ``grounding["calendar_events_today"]`` (a list of event dicts) and
    builds one high-priority "navigate" task per event. Events for which no
    action URL can be resolved are skipped.

    Returns:
        ``{"date": date, "tasks": [...]}``; the task list is empty when there
        are no events or none could be mapped.
    """
    collected = []
    todays_events = grounding.get("calendar_events_today", [])
    if todays_events:
        for entry in todays_events:
            target_url = _resolve_calendar_action_url(
                entry.get("content_type", ""), entry.get("platform", "")
            )
            # No route for this event: nothing to navigate to.
            if target_url is None:
                continue
            collected.append({
                "pillarId": CALENDAR_CONTENT_PILLAR,
                # Titles are capped at 255 characters.
                "title": (entry.get("title") or "Untitled").strip()[:255],
                "description": (entry.get("description") or "").strip(),
                "priority": "high",
                "estimatedTime": _resolve_calendar_estimated_time(entry.get("content_type", "")),
                "actionType": "navigate",
                "actionUrl": target_url,
                "enabled": True,
                "dependencies": [],
                # Provenance of the task: the raw, untrimmed event fields.
                "metadata": {
                    "source": "calendar_event",
                    "source_event_id": entry.get("id"),
                    "calendar_title": entry.get("title"),
                    "content_type": entry.get("content_type"),
                    "platform": entry.get("platform"),
                },
            })
    return {"date": date, "tasks": collected}
def _today_date_str() -> str:
return datetime.now(timezone.utc).date().isoformat()
@@ -47,70 +124,6 @@ def _proposal_order_key(proposal: Any) -> tuple:
)
def _fallback_tasks(date: str) -> List[Dict[str, Any]]:
return [
{
"pillarId": "plan",
"title": "Review todays plan",
"description": "Confirm priorities and adjust the content calendar for today.",
"priority": "high",
"estimatedTime": 15,
"actionType": "navigate",
"actionUrl": "/content-planning-dashboard",
"enabled": True,
},
{
"pillarId": "generate",
"title": "Generate one core content asset",
"description": "Create a draft aligned with your current strategy and voice.",
"priority": "high",
"estimatedTime": 45,
"actionType": "navigate",
"actionUrl": "/blog-writer",
"enabled": True,
},
{
"pillarId": "publish",
"title": "Publish or schedule todays content",
"description": "Publish or schedule content across the selected channel(s).",
"priority": "medium",
"estimatedTime": 20,
"actionType": "navigate",
"actionUrl": "/content-planning-dashboard",
"enabled": True,
},
{
"pillarId": "analyze",
"title": "Check semantic health and performance",
"description": "Review semantic health metrics and key performance indicators.",
"priority": "medium",
"estimatedTime": 15,
"actionType": "navigate",
"actionUrl": "/seo-dashboard",
"enabled": True,
},
{
"pillarId": "engage",
"title": "Engage on one channel",
"description": "Respond to comments and share one post to keep momentum.",
"priority": "medium",
"estimatedTime": 15,
"actionType": "navigate",
"actionUrl": "/linkedin-writer",
"enabled": True,
},
{
"pillarId": "remarket",
"title": "Repurpose and remarket content",
"description": "Create one repurposed snippet and distribute it to increase reach.",
"priority": "low",
"estimatedTime": 20,
"actionType": "navigate",
"actionUrl": "/facebook-writer",
"enabled": True,
},
]
def _is_coverage_guardrail_enabled(grounding: Dict[str, Any]) -> bool:
workflow_config = grounding.get("workflow_config", {}) if isinstance(grounding, dict) else {}
@@ -315,9 +328,6 @@ def _ensure_pillar_coverage(
return sanitized_tasks
covered_pillars = {task["pillarId"] for task in sanitized_tasks}
fallback_by_pillar = {
task["pillarId"]: task for task in (_sanitize_task(t) for t in _fallback_tasks(date)) if task
}
for pillar_id in PILLAR_IDS:
if pillar_id in covered_pillars:
@@ -327,15 +337,6 @@ def _ensure_pillar_coverage(
if generated:
sanitized_tasks.append(generated)
covered_pillars.add(pillar_id)
continue
controlled_fallback = fallback_by_pillar.get(pillar_id)
if controlled_fallback:
metadata = controlled_fallback.get("metadata") if isinstance(controlled_fallback.get("metadata"), dict) else {}
metadata["source"] = "controlled_fallback"
controlled_fallback["metadata"] = metadata
sanitized_tasks.append(controlled_fallback)
covered_pillars.add(pillar_id)
return sanitized_tasks
@@ -367,6 +368,28 @@ def build_grounding_context(db: Session, user_id: str, date: str) -> Dict[str, A
if "workflow_config" not in onboarding_context:
onboarding_context["workflow_config"] = {}
# 3. Fetch calendar events for today
calendar_events_today = []
try:
from datetime import datetime as dt_func, timedelta
today_start = dt_func.strptime(date, "%Y-%m-%d").replace(hour=0, minute=0, second=0)
today_end = today_start + timedelta(days=1)
calendar_events_today = (
db.query(CalendarEvent)
.join(ContentStrategy, CalendarEvent.strategy_id == ContentStrategy.id)
.filter(
ContentStrategy.user_id == user_id,
CalendarEvent.scheduled_date >= today_start,
CalendarEvent.scheduled_date < today_end,
CalendarEvent.status.in_(["draft", "scheduled"]),
)
.all()
)
except Exception as e:
logger.warning(f"Failed to fetch calendar events for grounding context: {e}")
return {
"recent_agent_alerts": [
{
@@ -379,7 +402,19 @@ def build_grounding_context(db: Session, user_id: str, date: str) -> Dict[str, A
for a in unread_agent_alerts
],
"onboarding_data": onboarding_context,
"workflow_config": onboarding_context.get("workflow_config", {})
"workflow_config": onboarding_context.get("workflow_config", {}),
"calendar_events_today": [
{
"id": event.id,
"title": event.title,
"description": event.description,
"content_type": event.content_type,
"platform": event.platform,
"status": event.status,
"scheduled_date": event.scheduled_date.isoformat() if event.scheduled_date else None,
}
for event in calendar_events_today
],
}
@@ -406,7 +441,7 @@ async def generate_agent_enhanced_plan(
orchestrator = await orchestration_service.get_or_create_orchestrator(user_id)
except Exception as e:
logger.error(f"Failed to get orchestrator: {e}")
return {"date": date, "tasks": _fallback_tasks(date)}
return {"date": date, "tasks": []}
# 2. Parallel "Committee" Proposal Gathering
logger.info(f"Gathering daily task proposals from agent committee for user {user_id}")
@@ -689,21 +724,21 @@ async def generate_agent_enhanced_plan(
try:
result = json.loads(raw)
except Exception:
result = {"date": date, "tasks": _fallback_tasks(date)}
result = {"date": date, "tasks": []}
except Exception as e:
activity.log_event(
event_type="warning",
severity="warning",
message=str(e)[:2000],
payload=build_agent_event_payload(phase="generation", step="llm_failed_fallback", tool_name="llm_text_gen", progress_percent=70, output_summary="LLM generation failed, using fallback tasks", decision_reason="Exception during workflow generation", safe_debug=False, metadata={"fallback": True}),
payload=build_agent_event_payload(phase="generation", step="llm_failed", tool_name="llm_text_gen", progress_percent=70, output_summary="LLM generation failed, returning empty tasks", decision_reason="Exception during workflow generation", safe_debug=False, metadata={"error": str(e)[:200]}),
run_id=run.id,
agent_type="TodayWorkflowGenerator",
)
result = {"date": date, "tasks": _fallback_tasks(date)}
result = {"date": date, "tasks": []}
tasks = result.get("tasks") if isinstance(result, dict) else None
if not isinstance(tasks, list) or not tasks:
tasks = _fallback_tasks(date)
if not isinstance(tasks, list):
tasks = []
result = {
"date": date,
"tasks": _ensure_pillar_coverage(tasks, user_id, date, grounding),
@@ -744,23 +779,38 @@ async def get_or_create_daily_workflow_plan(
return existing, False
grounding = build_grounding_context(db, user_id, date_str)
plan_data = await generate_agent_enhanced_plan(db, user_id, date_str, grounding=grounding)
# Step 1: Calendar events → generate pillar (SSOT for content creation)
calendar_plan = _generate_calendar_event_plan(date_str, grounding)
calendar_task_titles = {t.get("title") for t in calendar_plan.get("tasks", []) if t.get("title")}
# Step 2: Agent committee → proposals for plan + analyze + engage + publish + remarket
agent_plan_data = await generate_agent_enhanced_plan(db, user_id, date_str, grounding=grounding, strict_contextuality=False)
# Filter agent proposals: keep only non-generate pillars, dedup by title
committee_pillars = {"plan", "analyze", "engage", "publish", "remarket"}
filtered_agent_tasks = [
t for t in agent_plan_data.get("tasks", [])
if t.get("pillarId") in committee_pillars
and t.get("title") not in calendar_task_titles
]
# Step 3: Merge — calendar wins for generate, agents fill other pillars
all_tasks = calendar_plan.get("tasks", []) + filtered_agent_tasks
calendar_source = bool(calendar_plan.get("tasks"))
# Step 4: Pillar coverage — LLM backfill for any pillar still uncovered
all_tasks = _ensure_pillar_coverage(all_tasks, user_id, date_str, grounding)
# Step 5: Validation
plan_data = {**agent_plan_data, "tasks": all_tasks}
validation = validate_plan_contextuality(plan_data, grounding)
if not validation.get("is_contextual"):
logger.info("Plan contextuality below threshold for user {}. Running strict regeneration.", user_id)
regenerated_plan = await generate_agent_enhanced_plan(
db,
user_id,
date_str,
grounding=grounding,
strict_contextuality=True,
)
regenerated_validation = validate_plan_contextuality(regenerated_plan, grounding)
plan_data = regenerated_plan
validation = regenerated_validation
plan_data["quality_status"] = "contextual" if validation.get("is_contextual") else "low_context"
plan_data["quality_status"] = (
"calendar_driven" if calendar_source
else "contextual" if validation.get("is_contextual")
else "low_context"
)
plan_data["contextuality_validation"] = validation
tasks = plan_data.get("tasks", [])
@@ -769,9 +819,9 @@ async def get_or_create_daily_workflow_plan(
user_id=user_id,
date=date_str,
source=creation_source,
generation_mode=_derive_generation_mode(plan_data),
generation_mode="calendar_driven" if calendar_source else _derive_generation_mode(plan_data),
committee_agent_count=_count_committee_agents(tasks),
fallback_used=_plan_uses_fallback(tasks),
fallback_used=False,
plan_json=plan_data,
created_at=datetime.utcnow(),
updated_at=datetime.utcnow(),
@@ -824,15 +874,17 @@ def _derive_generation_mode(plan_data: Dict[str, Any]) -> str:
metadata = metadata if isinstance(metadata, dict) else {}
source_agent = str(metadata.get("source_agent") or "").strip()
source = str(metadata.get("source") or "").strip()
if source == "calendar_event":
return "calendar_driven"
if source_agent:
source_modes.add("agent_committee")
elif source in {"controlled_fallback", "llm_pillar_backfill"}:
elif source in {"llm_pillar_backfill"}:
source_modes.add(source)
if "calendar_driven" in source_modes:
return "calendar_driven"
if "agent_committee" in source_modes:
return "agent_committee"
if "controlled_fallback" in source_modes:
return "controlled_fallback"
if "llm_pillar_backfill" in source_modes:
return "llm_pillar_backfill"
return "llm_generation"
@@ -929,4 +981,28 @@ def update_task_status(
db.add(task)
db.commit()
db.refresh(task)
# If a calendar-sourced task is completed, mark the calendar event as published
if status == "completed" and task.metadata_json:
source = task.metadata_json.get("source")
source_event_id = task.metadata_json.get("source_event_id")
if source == "calendar_event" and source_event_id:
try:
cal_event = (
db.query(CalendarEvent)
.join(ContentStrategy, CalendarEvent.strategy_id == ContentStrategy.id)
.filter(
CalendarEvent.id == source_event_id,
ContentStrategy.user_id == user_id,
)
.first()
)
if cal_event and cal_event.status != "published":
cal_event.status = "published"
cal_event.updated_at = datetime.utcnow()
db.add(cal_event)
db.commit()
except Exception as e:
logger.warning(f"Failed to update calendar event {source_event_id} on task completion: {e}")
return task

View File

@@ -91,6 +91,17 @@ PLATFORM_SPECS: List[PlatformSpec] = [
formats=["mp4"],
description="Square video format for LinkedIn",
),
PlatformSpec(
platform=Platform.LINKEDIN,
name="LinkedIn Video (Portrait)",
aspect_ratio="9:16",
width=1080,
height=1920,
max_duration=600.0, # 10 minutes
max_file_size_mb=5000.0, # 5GB
formats=["mp4"],
description="Portrait video format for LinkedIn mobile feed",
),
PlatformSpec(
platform=Platform.FACEBOOK,
name="Facebook Video",

View File

@@ -148,10 +148,8 @@ class WixService:
token_str = normalize_token_string(access_token)
if not token_str:
return {"_no_site": True, "error": "Invalid access token format"}
meta = extract_meta_from_token(token_str)
meta_site_id = meta.get("metaSiteId")
try:
return self.auth_service.get_site_info(token_str, meta_site_id=meta_site_id)
return self.auth_service.get_site_info(token_str)
except requests.RequestException as e:
logger.warning(f"Failed to get site info: {e}")
return {"_no_site": True, "error": str(e)}
@@ -181,26 +179,34 @@ class WixService:
def _normalize_token_string(self, access_token: Any) -> Optional[str]:
return normalize_token_string(access_token)
def check_blog_permissions(self, access_token: str) -> Dict[str, Any]:
def check_blog_permissions(self, access_token: str, site_id: Optional[str] = None) -> Dict[str, Any]:
"""
Check if the app has required blog permissions
Args:
access_token: Valid access token
site_id: Optional Wix metaSiteId for multi-site token context
Returns:
Permission status
"""
extra_headers = {}
if not site_id:
meta_info = extract_meta_from_token(access_token)
site_id = meta_info.get('metaSiteId')
if site_id:
extra_headers['wix-site-id'] = site_id
headers = {
'Authorization': f'Bearer {access_token}',
'Content-Type': 'application/json',
'wix-client-id': self.client_id or ''
}
headers.update(extra_headers)
try:
# Try to list blog categories to check permissions
response = requests.get(
f"{self.base_url}/blog/v1/categories",
f"{self.base_url}/blog/v3/categories",
headers=headers
)
@@ -215,13 +221,23 @@ class WixService:
'has_permissions': False,
'can_create_posts': False,
'can_publish': False,
'error': 'Insufficient permissions'
'error': 'Insufficient permissions — OAuth app lacks blog scopes'
}
elif response.status_code == 404:
return {
'has_permissions': False,
'error': 'Blog feature not available or site ID not recognized'
}
elif response.status_code == 401:
return {
'has_permissions': False,
'error': 'Token expired or invalid'
}
else:
response.raise_for_status()
except requests.RequestException as e:
logger.error(f"Failed to check blog permissions: {e}")
logger.warning(f"Failed to check blog permissions: {e}")
return {
'has_permissions': False,
'error': str(e)
@@ -243,7 +259,8 @@ class WixService:
result = self.media_service.import_image(
access_token,
image_url,
display_name or f'Imported Image {datetime.now().strftime("%Y%m%d_%H%M%S")}'
display_name or f'Imported Image {datetime.now().strftime("%Y%m%d_%H%M%S")}',
client_id=self.client_id,
)
if result and isinstance(result, dict) and 'file' in result:
media_id = result['file'].get('id')
@@ -431,8 +448,8 @@ class WixService:
return category_ids
except requests.RequestException as e:
logger.error(f"Failed to lookup/create categories: {e}")
except Exception as e:
logger.warning(f"Failed to lookup/create categories (will skip): {e}")
return []
def lookup_or_create_tags(self, access_token: str, tag_names: List[str],
@@ -497,8 +514,8 @@ class WixService:
return tag_ids
except requests.RequestException as e:
logger.error(f"Failed to lookup/create tags: {e}")
except Exception as e:
logger.warning(f"Failed to lookup/create tags (will skip): {e}")
return []
def publish_draft_post(self, access_token: str, draft_post_id: str) -> Dict[str, Any]: