feat: image generation overhaul (model-aware text, dim clamping, \.30 pricing), event-driven dashboard cache invalidation, SEO insights (AI visibility, GSC, keyword gap), YouTube OAuth/publish, blog writer & content planning improvements, scheduler monitoring updates
This commit is contained in:
194
backend/services/blog_writer/outline/keyword_curator.py
Normal file
194
backend/services/blog_writer/outline/keyword_curator.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
Keyword Curator - Smart keyword selection engine for SEO-optimized outline generation.
|
||||
|
||||
Instead of dumping all discovered keywords into the LLM prompt (which causes
|
||||
keyword stuffing and dilutes topical focus), this module selects a highly
|
||||
curated subset based on SEO best practices and assigns each keyword a
|
||||
specific structural role in the outline.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
|
||||
class KeywordCurator:
    """
    Curates a strict, minimal keyword set for outline generation.

    Selection Rules (SEO Best Practice):
        1. Primary (H1 Focus)      → top 2 — brand name + core topic
        2. Secondary (H2 Focus)    → top 2 — feature/benefit anchors
        3. Long-tail (H3 Focus)    → top 2 — informational intent phrases
        4. Semantic (Body Context) → top 4 — prevent topical drift
        5. Trending (Mention)      → top 2 — brief contextual mentions
        6. Content Gap (Edge)      → top 1 — competitive differentiator
    """

    # How many keywords to select from each category
    SLOTS: Dict[str, int] = {
        "primary": 2,
        "secondary": 2,
        "long_tail": 2,
        "semantic": 4,
        "trending": 2,
        "content_gap": 1,
    }

    # Internal slot name -> field name used in the raw keyword_analysis dict.
    # Single source of truth shared by curate() and _source_key().
    _SOURCE_KEYS: Dict[str, str] = {
        "primary": "primary",
        "secondary": "secondary",
        "long_tail": "long_tail",
        "semantic": "semantic_keywords",
        "trending": "trending_terms",
        "content_gap": "content_gaps",
    }

    # Non-keyword analysis fields copied through to the curated payload.
    _PASSTHROUGH_FIELDS = ("search_intent", "difficulty", "analysis_insights")

    # (curated key, heading prefix, guidance line) for the sections that are
    # rendered as a comma-separated keyword list.
    _LIST_SECTIONS = (
        (
            "secondary",
            "### H2 sections must anchor on (one per major section): ",
            " → Each secondary keyword should map to a distinct H2 section.",
        ),
        (
            "long_tail",
            "### H3 / Subsection anchors for informational intent: ",
            " → Use these as deeper-dive subsections under the relevant H2.",
        ),
        (
            "semantic",
            "### Body-level semantic signals (use naturally, max 1-2 mentions each): ",
            " → These prevent topical drift. Weave into paragraph text, not headings.",
        ),
        (
            "trending",
            "### Trending context (mention subtly if relevant): ",
            " → Optional. Only include if it strengthens timeliness/narrative.",
        ),
    )

    def curate(
        self,
        keyword_analysis: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Apply selection rules and return a structured, minimal keyword payload.

        Args:
            keyword_analysis: Raw keyword_analysis dict from research
                (keys: primary, secondary, long_tail,
                semantic_keywords, trending_terms, content_gaps, ...).
                ``None`` or a non-dict value is treated as an empty analysis.

        Returns:
            Dict with one list per slot in ``SLOTS``, a flat
            ``locked_keywords`` list, a ``stats`` dict
            (``total_raw``, ``total_curated``, ``reduction_pct``) and any of
            ``search_intent`` / ``difficulty`` / ``analysis_insights`` that
            were present in the input.
        """
        # Tolerate missing research output instead of raising AttributeError.
        analysis: Dict[str, Any] = (
            keyword_analysis if isinstance(keyword_analysis, dict) else {}
        )

        # --- Select from keyword lists (insertion order follows SLOTS) ---
        curated: Dict[str, Any] = {}
        for slot in self.SLOTS:
            curated[slot] = self._pick(
                analysis, self._source_key(slot), slot_key=slot
            )

        # --- Build a flat "locked" list for quick reference ---
        locked: List[str] = []
        for slot in self.SLOTS:
            locked.extend(curated[slot])

        # --- Track counts for transparency ---
        total_raw = 0
        total_curated = 0
        for slot in self.SLOTS:
            raw_list = analysis.get(self._source_key(slot), [])
            if isinstance(raw_list, list):
                total_raw += len(raw_list)
            total_curated += len(curated[slot])

        # With nothing to curate there is no reduction; reporting 100% here
        # would mislead both the prompt footer and any monitoring on it.
        if total_raw:
            reduction_pct = round((1 - total_curated / total_raw) * 100, 1)
        else:
            reduction_pct = 0.0

        curated["locked_keywords"] = locked
        curated["stats"] = {
            "total_raw": total_raw,
            "total_curated": total_curated,
            "reduction_pct": reduction_pct,
        }

        # --- Preserve non-keyword analysis fields ---
        for field in self._PASSTHROUGH_FIELDS:
            if field in analysis:
                curated[field] = analysis[field]

        return curated

    def format_for_prompt(self, curated: Dict[str, Any]) -> str:
        """
        Format the curated keyword payload into a strict structural prompt section.

        Args:
            curated: Payload shaped like the return value of ``curate``.
                Missing or empty groups are simply omitted from the output
                (except the H1 group, which gets a fallback instruction).

        Returns:
            A string ready to be injected into the outline prompt.
        """
        lines: List[str] = ["## KEYWORD PLACEMENT DIRECTIVES\n"]

        # H1 — primary
        primary = curated.get("primary") or []
        if primary:
            h1_text = " | ".join(map(str, primary))
            lines.append(f"### H1 (must contain, in order of priority): {h1_text}")
            lines.append(" → Anchor the title and main heading on these terms.")
        else:
            lines.append("### H1: No primary keywords provided — derive from topic context.")

        # H2 secondary / H3 long-tail / body-level semantic / trending
        for key, heading, guidance in self._LIST_SECTIONS:
            keywords = curated.get(key) or []
            if keywords:
                lines.append(heading + ", ".join(map(str, keywords)))
                lines.append(guidance)

        # Content gap — competitive edge (only the first entry is surfaced)
        content_gap = curated.get("content_gap") or []
        if content_gap:
            lines.append(f"### Competitive advantage signal (must weave into narrative): {content_gap[0]}")
            lines.append(" → This is your primary differentiation hook. Surface it prominently in the unique value section.")

        lines.append("")
        lines.append("GUIDELINE: Treat these as the primary keyword anchors. You may include closely related")
        lines.append("intent-matching variations where natural, but avoid inserting every raw research keyword.")
        lines.append("Quality over density — each keyword earns its place by serving a clear structural purpose.")

        stats = curated.get("stats", {})
        if stats:
            lines.append(
                f"\n[From {stats.get('total_raw', '?')} raw research keywords "
                f"→ curated to {stats.get('total_curated', '?')} locked keywords "
                f"({stats.get('reduction_pct', '?')}% reduction)]"
            )

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _source_key(slot_key: str) -> str:
        """Map internal slot key to the actual field name in keyword_analysis.

        Unknown slot keys are returned unchanged.
        """
        return KeywordCurator._SOURCE_KEYS.get(slot_key, slot_key)

    def _pick(
        self,
        data: Dict[str, Any],
        source_key: str,
        slot_key: Optional[str] = None,
    ) -> List[str]:
        """
        Pick up to N usable keywords from a keyword list, preserving order.

        Entries that are not strings, are blank after stripping, or repeat an
        earlier entry (compared case-insensitively) are skipped so they do
        not consume one of the few available slots.

        Args:
            data: The raw keyword_analysis dict.
            source_key: The actual key in the dict (e.g. 'semantic_keywords').
            slot_key: The internal slot name for looking up the limit.
                Falls back to source_key if not provided. Slots missing
                from ``SLOTS`` use a limit of 5.

        Returns:
            List of at most N stripped, unique strings; empty when the
            source value is missing or is not a list.
        """
        limit = self.SLOTS.get(slot_key or source_key, 5)
        raw: Any = data.get(source_key, []) if isinstance(data, dict) else []
        if not isinstance(raw, list):
            return []

        picked: List[str] = []
        seen = set()
        for item in raw:
            if len(picked) >= limit:
                break
            if not isinstance(item, str):
                continue
            keyword = item.strip()
            fingerprint = keyword.casefold()
            if not keyword or fingerprint in seen:
                continue
            seen.add(fingerprint)
            picked.append(keyword)
        return picked
|
||||
Reference in New Issue
Block a user