Files
ALwrity/backend/services/blog_writer/outline/keyword_curator.py

195 lines
7.9 KiB
Python

"""
Keyword Curator - Smart keyword selection engine for SEO-optimized outline generation.
Instead of dumping all discovered keywords into the LLM prompt (which causes
keyword stuffing and dilutes topical focus), this module selects a highly
curated subset based on SEO best practices and assigns each keyword a
specific structural role in the outline.
"""
from typing import Dict, Any, List, Optional
class KeywordCurator:
    """
    Curates a strict, minimal keyword set for outline generation.

    Selection Rules (SEO Best Practice):
        1. Primary (H1 Focus) → top 2 — brand name + core topic
        2. Secondary (H2 Focus) → top 2 — feature/benefit anchors
        3. Long-tail (H3 Focus) → top 2 — informational intent phrases
        4. Semantic (Body Context) → top 4 — prevent topical drift
        5. Trending (Mention) → top 2 — brief contextual mentions
        6. Content Gap (Edge) → top 1 — competitive differentiator

    "Top N" means the first N usable entries of the raw list, in their
    original order. An entry is usable when it is a non-blank string that
    has not already been selected for the same category (compared
    case-insensitively).
    """

    # How many keywords to select from each category
    SLOTS: Dict[str, int] = {
        "primary": 2,
        "secondary": 2,
        "long_tail": 2,
        "semantic": 4,
        "trending": 2,
        "content_gap": 1,
    }

    # Limit used by _pick when a slot key is not listed in SLOTS.
    _DEFAULT_LIMIT = 5

    # Internal slot name -> field name used in the raw keyword_analysis dict.
    _SOURCE_KEYS: Dict[str, str] = {
        "primary": "primary",
        "secondary": "secondary",
        "long_tail": "long_tail",
        "semantic": "semantic_keywords",
        "trending": "trending_terms",
        "content_gap": "content_gaps",
    }

    # Non-keyword analysis fields copied through to the curated payload as-is.
    _PASSTHROUGH_FIELDS = ("search_intent", "difficulty", "analysis_insights")

    def curate(
        self,
        keyword_analysis: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Apply selection rules and return a structured, minimal keyword payload.

        Args:
            keyword_analysis: Raw keyword_analysis dict from research
                (keys: primary, secondary, long_tail,
                semantic_keywords, trending_terms, content_gaps, ...).
                ``None`` or any non-dict value is treated as an empty
                analysis instead of raising.

        Returns:
            Dict with one list per slot in ``SLOTS`` (keyed by slot name),
            ``locked_keywords`` (all selected keywords, flattened in slot
            order), ``stats`` (``total_raw``, ``total_curated``,
            ``reduction_pct``), plus the ``search_intent``, ``difficulty``
            and ``analysis_insights`` fields when present in the input.
        """
        if not isinstance(keyword_analysis, dict):
            keyword_analysis = {}

        curated: Dict[str, Any] = {}
        locked: List[str] = []
        total_raw = 0

        # --- Select from keyword lists (single pass, driven by SLOTS) ---
        for slot_key in self.SLOTS:
            raw_key = self._source_key(slot_key)
            picked = self._pick(keyword_analysis, raw_key, slot_key=slot_key)
            curated[slot_key] = picked
            locked.extend(picked)

            # Only list-valued fields count towards the raw total; anything
            # else is ignored by _pick as well.
            raw_list = keyword_analysis.get(raw_key, [])
            if isinstance(raw_list, list):
                total_raw += len(raw_list)

        # --- Flat "locked" list for quick reference ---
        curated["locked_keywords"] = locked

        # --- Track counts for transparency ---
        total_curated = len(locked)
        if total_raw:
            reduction_pct = round((1 - total_curated / total_raw) * 100, 1)
        else:
            # Nothing was discovered, so nothing was reduced. Reporting a
            # 100% reduction here would be misleading in the prompt footer.
            reduction_pct = 0.0
        curated["stats"] = {
            "total_raw": total_raw,
            "total_curated": total_curated,
            "reduction_pct": reduction_pct,
        }

        # --- Preserve non-keyword analysis fields ---
        for field in self._PASSTHROUGH_FIELDS:
            if field in keyword_analysis:
                curated[field] = keyword_analysis[field]

        return curated

    def format_for_prompt(self, curated: Dict[str, Any]) -> str:
        """
        Format the curated keyword payload into a strict structural prompt section.

        Args:
            curated: Payload in the shape returned by ``curate``. Missing or
                empty groups are skipped; only the H1 section emits a
                fallback line when no primary keywords are available.

        Returns:
            A newline-joined string ready to be injected into the outline prompt.
        """
        lines: List[str] = []
        lines.append("## KEYWORD PLACEMENT DIRECTIVES\n")

        # H1 — primary
        primary = curated.get("primary", [])
        if primary:
            h1_text = " | ".join(primary)
            lines.append(f"### H1 (must contain, in order of priority): {h1_text}")
            lines.append(" → Anchor the title and main heading on these terms.")
        else:
            lines.append("### H1: No primary keywords provided — derive from topic context.")

        # H2 — secondary
        secondary = curated.get("secondary", [])
        if secondary:
            lines.append(f"### H2 sections must anchor on (one per major section): {', '.join(secondary)}")
            lines.append(" → Each secondary keyword should map to a distinct H2 section.")

        # H3 — long-tail
        long_tail = curated.get("long_tail", [])
        if long_tail:
            lines.append(f"### H3 / Subsection anchors for informational intent: {', '.join(long_tail)}")
            lines.append(" → Use these as deeper-dive subsections under the relevant H2.")

        # Body-level — semantic
        semantic = curated.get("semantic", [])
        if semantic:
            lines.append(f"### Body-level semantic signals (use naturally, max 1-2 mentions each): {', '.join(semantic)}")
            lines.append(" → These prevent topical drift. Weave into paragraph text, not headings.")

        # Trending — brief
        trending = curated.get("trending", [])
        if trending:
            lines.append(f"### Trending context (mention subtly if relevant): {', '.join(trending)}")
            lines.append(" → Optional. Only include if it strengthens timeliness/narrative.")

        # Content gap — competitive edge (only the first entry is surfaced)
        content_gap = curated.get("content_gap", [])
        if content_gap:
            lines.append(f"### Competitive advantage signal (must weave into narrative): {content_gap[0]}")
            lines.append(" → This is your primary differentiation hook. Surface it prominently in the unique value section.")

        lines.append("")
        lines.append("GUIDELINE: Treat these as the primary keyword anchors. You may include closely related")
        lines.append("intent-matching variations where natural, but avoid inserting every raw research keyword.")
        lines.append("Quality over density — each keyword earns its place by serving a clear structural purpose.")

        # Footer with curation statistics, emitted only when stats are present
        stats = curated.get("stats", {})
        if stats:
            lines.append(
                f"\n[From {stats.get('total_raw', '?')} raw research keywords "
                f"→ curated to {stats.get('total_curated', '?')} locked keywords "
                f"({stats.get('reduction_pct', '?')}% reduction)]"
            )

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _source_key(slot_key: str) -> str:
        """
        Map internal slot key to the actual field name in keyword_analysis.

        Unknown slot keys are returned unchanged.
        """
        return KeywordCurator._SOURCE_KEYS.get(slot_key, slot_key)

    def _pick(
        self,
        data: Dict[str, Any],
        source_key: str,
        slot_key: Optional[str] = None,
    ) -> List[str]:
        """
        Pick up to N usable keywords from a keyword list.

        Entries that are not strings, or that are blank after stripping
        whitespace, are skipped so the result can always be joined into
        prompt text. Case-insensitive duplicates are skipped so a repeated
        term does not use up one of the few slots in its category.

        Args:
            data: The raw keyword_analysis dict.
            source_key: The actual key in the dict (e.g. 'semantic_keywords').
            slot_key: The internal slot name for looking up the limit.
                Falls back to source_key if not provided.

        Returns:
            List of at most N stripped strings, in their original order.
            Empty when the field is missing or is not a list.
        """
        limit_key = slot_key or source_key
        limit = self.SLOTS.get(limit_key, self._DEFAULT_LIMIT)

        raw: Any = data.get(source_key, [])
        if not isinstance(raw, list):
            return []

        picked: List[str] = []
        seen = set()
        for item in raw:
            if len(picked) >= limit:
                break
            if not isinstance(item, str):
                continue
            keyword = item.strip()
            if not keyword:
                continue
            fingerprint = keyword.casefold()
            if fingerprint in seen:
                continue
            seen.add(fingerprint)
            picked.append(keyword)
        return picked