221 lines
9.5 KiB
Python
221 lines
9.5 KiB
Python
"""
|
|
Keyword Curator - Smart keyword selection engine for SEO-optimized outline generation.
|
|
|
|
Instead of dumping all discovered keywords into the LLM prompt (which causes
|
|
keyword stuffing and dilutes topical focus), this module selects a highly
|
|
curated subset based on SEO best practices and assigns each keyword a
|
|
specific structural role in the outline.
|
|
"""
|
|
|
|
from typing import Dict, Any, List, Optional
|
|
|
|
|
|
class KeywordCurator:
    """
    Curates a strict, minimal keyword set for outline generation.

    Selection Rules (SEO Best Practice):
        1. Primary     (H1 Focus)     → top 2 — brand name + core topic
        2. Secondary   (H2 Focus)     → top 2 — feature/benefit anchors
        3. Long-tail   (H3 Focus)     → top 2 — informational intent phrases
        4. Semantic    (Body Context) → top 4 — prevent topical drift
        5. Trending    (Mention)      → top 2 — brief contextual mentions
        6. Content Gap (Edge)         → top 1 — competitive differentiator
    """

    # How many keywords to select from each category. The iteration order of
    # this dict is also the order of the groups in the curated payload and in
    # the flat ``locked_keywords`` list.
    SLOTS: Dict[str, int] = {
        "primary": 2,
        "secondary": 2,
        "long_tail": 2,
        "semantic": 4,
        "trending": 2,
        "content_gap": 1,
    }

    def curate(
        self,
        keyword_analysis: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Apply selection rules and return a structured, minimal keyword payload.

        Args:
            keyword_analysis: Raw keyword_analysis dict from research
                (keys: primary, secondary, long_tail, semantic_keywords,
                trending_terms, content_gaps, ...). ``None`` is treated as
                an empty dict.

        Returns:
            Dict containing:
              * one list per slot in ``SLOTS`` (keyed by slot name),
              * ``locked_keywords``: all selected keywords flattened in slot
                order (duplicates across slots are kept),
              * ``stats``: ``total_raw``, ``total_curated`` and
                ``reduction_pct`` (0.0 when there were no raw keywords),
              * ``search_intent`` / ``difficulty`` / ``analysis_insights``
                copied through when present in the input.
        """
        # A missing analysis degrades to "no keywords" instead of crashing.
        keyword_analysis = keyword_analysis or {}
        curated: Dict[str, Any] = {}

        # --- Select from keyword lists ---
        # SLOTS + _source_key are the single source of truth for which raw
        # field feeds which slot (e.g. "semantic" <- "semantic_keywords").
        for slot_key in self.SLOTS:
            curated[slot_key] = self._pick(
                keyword_analysis,
                self._source_key(slot_key),
                slot_key=slot_key,
            )

        # --- Build a flat "locked" list for quick reference ---
        curated["locked_keywords"] = [
            keyword for slot_key in self.SLOTS for keyword in curated[slot_key]
        ]

        # --- Track counts for transparency ---
        total_raw = 0
        total_curated = 0
        for slot_key in self.SLOTS:
            raw_list = keyword_analysis.get(self._source_key(slot_key), [])
            # Non-list values are ignored by _pick, so they do not count as raw.
            if isinstance(raw_list, list):
                total_raw += len(raw_list)
            total_curated += len(curated[slot_key])

        # With nothing to curate there is no reduction; reporting 100% here
        # would be misleading in the prompt footer.
        if total_raw:
            reduction_pct = round((1 - total_curated / total_raw) * 100, 1)
        else:
            reduction_pct = 0.0

        curated["stats"] = {
            "total_raw": total_raw,
            "total_curated": total_curated,
            "reduction_pct": reduction_pct,
        }

        # --- Preserve non-keyword analysis fields ---
        for field in ("search_intent", "difficulty", "analysis_insights"):
            if field in keyword_analysis:
                curated[field] = keyword_analysis[field]

        return curated

    def format_for_prompt(self, curated: Dict[str, Any]) -> str:
        """
        Format the curated keyword payload into a strict structural prompt section.

        Args:
            curated: Payload in the shape produced by ``curate``. Missing or
                empty groups are skipped, except ``primary``, which emits a
                fallback H1 line instead.

        Returns:
            A newline-joined string ready to be injected into the outline prompt.
        """
        lines: List[str] = []
        lines.append("## KEYWORD PLACEMENT DIRECTIVES\n")

        # H1 — primary
        primary = curated.get("primary", [])
        if primary:
            h1_text = " | ".join(primary)
            lines.append(f"### H1 (must contain, in order of priority): {h1_text}")
            lines.append(" → Anchor the title and main heading on these terms.")
        else:
            lines.append("### H1: No primary keywords provided — derive from topic context.")

        # H2 — secondary
        secondary = curated.get("secondary", [])
        if secondary:
            lines.append(f"### H2 sections must anchor on (one per major section): {', '.join(secondary)}")
            lines.append(" → Each secondary keyword should map to a distinct H2 section.")

        # H3 — long-tail
        long_tail = curated.get("long_tail", [])
        if long_tail:
            lines.append(f"### H3 / Subsection anchors for informational intent: {', '.join(long_tail)}")
            lines.append(" → Use these as deeper-dive subsections under the relevant H2.")

        # Body-level — semantic
        semantic = curated.get("semantic", [])
        if semantic:
            lines.append(f"### Body-level semantic signals (use naturally, max 1-2 mentions each): {', '.join(semantic)}")
            lines.append(" → These prevent topical drift. Weave into paragraph text, not headings.")

        # Trending — brief
        trending = curated.get("trending", [])
        if trending:
            lines.append(f"### Trending context (mention subtly if relevant): {', '.join(trending)}")
            lines.append(" → Optional. Only include if it strengthens timeliness/narrative.")

        # Content gap — competitive edge (only the first entry is surfaced)
        content_gap = curated.get("content_gap", [])
        if content_gap:
            lines.append(f"### Competitive advantage signal (must weave into narrative): {content_gap[0]}")
            lines.append(" → This is your primary differentiation hook. Surface it prominently in the unique value section.")

        lines.append("")
        lines.append("### SUGGESTED SECTION → KEYWORD MAPPING")
        lines.append("Map each outline section's keyword focus according to its narrative role:")
        lines.append("- Hook / Introduction → lead with primary and trending keywords for timeliness & relevance")
        lines.append("- Problem / Pain Point → anchor on secondary and long-tail keywords (informational intent)")
        lines.append("- Solution / How-To → weave in primary and secondary keywords for solution-oriented search")
        lines.append("- Comparison / Analysis → embed semantic keywords to prevent topical drift into tangents")
        lines.append("- Case Studies / Evidence → surface content gap keywords as differentiation proof points")
        lines.append("- Future / Trends → leverage trending and content gap keywords for forward-looking authority")
        lines.append("")
        lines.append("GUIDELINE: Treat these as the primary keyword anchors. You may include closely related")
        lines.append("intent-matching variations where natural, but avoid inserting every raw research keyword.")
        lines.append("Quality over density — each keyword earns its place by serving a clear structural purpose.")

        # Footer summarising how much the raw research set was cut down.
        stats = curated.get("stats", {})
        if stats:
            lines.append(
                f"\n[From {stats.get('total_raw', '?')} raw research keywords "
                f"→ curated to {stats.get('total_curated', '?')} locked keywords "
                f"({stats.get('reduction_pct', '?')}% reduction)]"
            )

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _source_key(slot_key: str) -> str:
        """Map internal slot key to the actual field name in keyword_analysis.

        Unknown slot keys are returned unchanged.
        """
        mapping = {
            "primary": "primary",
            "secondary": "secondary",
            "long_tail": "long_tail",
            "semantic": "semantic_keywords",
            "trending": "trending_terms",
            "content_gap": "content_gaps",
        }
        return mapping.get(slot_key, slot_key)

    def _pick(
        self,
        data: Dict[str, Any],
        source_key: str,
        slot_key: Optional[str] = None,
    ) -> List[str]:
        """
        Pick up to N items from a keyword list with diversity sampling.

        When the raw list is more than twice the limit, selects evenly-spaced
        entries (always including the first and last) to capture semantic
        diversity rather than just the first N entries. Shorter lists are
        simply truncated to the first N entries.

        Args:
            data: The raw keyword_analysis dict.
            source_key: The actual key in the dict (e.g. 'semantic_keywords').
            slot_key: The internal slot name for looking up the limit in
                ``SLOTS``. Falls back to source_key if not provided; a key
                absent from ``SLOTS`` gets a limit of 5.

        Returns:
            A new list of at most N items in their original relative order.
            Returns an empty list when the source value is missing or is not
            a list. The input list is never returned by reference.
        """
        # A negative limit is meaningless; treat it as "select nothing".
        limit = max(self.SLOTS.get(slot_key or source_key, 5), 0)

        raw: Any = data.get(source_key, [])
        if not isinstance(raw, list):
            return []

        # Not "significantly larger" than the limit: keep the leading entries.
        # Slicing always yields a copy, so callers cannot mutate the research
        # data through the curated payload.
        if len(raw) <= limit * 2 or limit < 2:
            return raw[:limit]

        # Evenly spaced sample anchored on both ends of the list.
        last = len(raw) - 1
        step = last / (limit - 1)
        indices = {0, last}
        indices.update(round(i * step) for i in range(1, limit - 1))
        return [raw[i] for i in sorted(indices)][:limit]
|