# ALwrity/backend/services/blog_writer/outline/keyword_curator.py

"""
Keyword Curator - Smart keyword selection engine for SEO-optimized outline generation.

Instead of dumping all discovered keywords into the LLM prompt (which causes
keyword stuffing and dilutes topical focus), this module selects a highly
curated subset based on SEO best practices and assigns each keyword a
specific structural role in the outline.
"""

from typing import Dict, Any, List, Optional


class KeywordCurator:
    """
    Curates a strict, minimal keyword set for outline generation.

    Selection Rules (SEO Best Practice):
    1. Primary (H1 Focus)   → top 2 — brand name + core topic
    2. Secondary (H2 Focus) → top 2 — feature/benefit anchors
    3. Long-tail (H3 Focus) → top 2 — informational intent phrases
    4. Semantic (Body Context) → top 4 — prevent topical drift
    5. Trending (Mention)   → top 2 — brief contextual mentions
    6. Content Gap (Edge)   → top 1 — competitive differentiator
    """

    # How many keywords to select from each category
    SLOTS: Dict[str, int] = {
        "primary": 2,
        "secondary": 2,
        "long_tail": 2,
        "semantic": 4,
        "trending": 2,
        "content_gap": 1,
    }

    # Slot name -> field name in the raw keyword_analysis dict. Kept in one
    # place so curate() and _source_key() cannot drift apart. The order here
    # is the order of the groups in the curated payload.
    _SOURCE_KEYS: Dict[str, str] = {
        "primary": "primary",
        "secondary": "secondary",
        "long_tail": "long_tail",
        "semantic": "semantic_keywords",
        "trending": "trending_terms",
        "content_gap": "content_gaps",
    }

    # Non-keyword analysis fields copied through to the payload unchanged.
    _PASSTHROUGH_FIELDS = ("search_intent", "difficulty", "analysis_insights")

    def curate(
        self,
        keyword_analysis: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """
        Apply selection rules and return a structured, minimal keyword payload.

        Args:
            keyword_analysis: Raw keyword_analysis dict from research
                             (keys: primary, secondary, long_tail,
                              semantic_keywords, trending_terms, content_gaps, ...).
                             ``None`` or a non-dict value is treated as an
                             empty analysis instead of raising.

        Returns:
            Dict with one list per slot (``primary``, ``secondary``,
            ``long_tail``, ``semantic``, ``trending``, ``content_gap``), a flat
            ``locked_keywords`` list, a ``stats`` dict (``total_raw``,
            ``total_curated``, ``reduction_pct``) and any of ``search_intent``,
            ``difficulty``, ``analysis_insights`` present in the input.
        """
        analysis: Dict[str, Any] = (
            keyword_analysis if isinstance(keyword_analysis, dict) else {}
        )

        # --- Select from keyword lists (one group per slot, in slot order) ---
        curated: Dict[str, Any] = {
            slot: self._pick(analysis, source, slot_key=slot)
            for slot, source in self._SOURCE_KEYS.items()
        }

        # --- Build a flat "locked" list for quick reference ---
        locked: List[str] = [
            keyword for slot in self._SOURCE_KEYS for keyword in curated[slot]
        ]
        curated["locked_keywords"] = locked

        # --- Track counts for transparency ---
        total_raw = 0
        for source in self._SOURCE_KEYS.values():
            raw_list = analysis.get(source, [])
            if isinstance(raw_list, list):
                total_raw += len(raw_list)
        total_curated = len(locked)
        # With no raw keywords nothing was reduced; report 0% rather than the
        # misleading 100% a clamped divisor would produce.
        reduction_pct = (
            round((1 - total_curated / total_raw) * 100, 1) if total_raw else 0.0
        )
        curated["stats"] = {
            "total_raw": total_raw,
            "total_curated": total_curated,
            "reduction_pct": reduction_pct,
        }

        # --- Preserve non-keyword analysis fields ---
        for field in self._PASSTHROUGH_FIELDS:
            if field in analysis:
                curated[field] = analysis[field]

        return curated

    def format_for_prompt(self, curated: Dict[str, Any]) -> str:
        """
        Format the curated keyword payload into a strict structural prompt section.

        Missing, ``None`` or empty groups are skipped (the H1 line falls back
        to a "derive from topic context" instruction), so a partially filled
        payload never raises.

        Args:
            curated: Payload shaped like the return value of ``curate``.

        Returns:
            A string ready to be injected into the outline prompt.
        """
        payload: Dict[str, Any] = curated if isinstance(curated, dict) else {}
        lines: List[str] = ["## KEYWORD PLACEMENT DIRECTIVES\n"]

        # H1 — primary
        primary = self._terms(payload, "primary")
        if primary:
            h1_text = " | ".join(primary)
            lines.append(f"### H1 (must contain, in order of priority): {h1_text}")
            lines.append("   → Anchor the title and main heading on these terms.")
        else:
            lines.append("### H1: No primary keywords provided — derive from topic context.")

        # H2 — secondary
        secondary = self._terms(payload, "secondary")
        if secondary:
            lines.append(f"### H2 sections must anchor on (one per major section): {', '.join(secondary)}")
            lines.append("   → Each secondary keyword should map to a distinct H2 section.")

        # H3 — long-tail
        long_tail = self._terms(payload, "long_tail")
        if long_tail:
            lines.append(f"### H3 / Subsection anchors for informational intent: {', '.join(long_tail)}")
            lines.append("   → Use these as deeper-dive subsections under the relevant H2.")

        # Body-level — semantic
        semantic = self._terms(payload, "semantic")
        if semantic:
            lines.append(f"### Body-level semantic signals (use naturally, max 1-2 mentions each): {', '.join(semantic)}")
            lines.append("   → These prevent topical drift. Weave into paragraph text, not headings.")

        # Trending — brief
        trending = self._terms(payload, "trending")
        if trending:
            lines.append(f"### Trending context (mention subtly if relevant): {', '.join(trending)}")
            lines.append("   → Optional. Only include if it strengthens timeliness/narrative.")

        # Content gap — competitive edge (only the first entry is surfaced)
        content_gap = self._terms(payload, "content_gap")
        if content_gap:
            lines.append(f"### Competitive advantage signal (must weave into narrative): {content_gap[0]}")
            lines.append("   → This is your primary differentiation hook. Surface it prominently in the unique value section.")

        lines.extend(
            [
                "",
                "### SUGGESTED SECTION → KEYWORD MAPPING",
                "Map each outline section's keyword focus according to its narrative role:",
                "- Hook / Introduction → lead with primary and trending keywords for timeliness & relevance",
                "- Problem / Pain Point → anchor on secondary and long-tail keywords (informational intent)",
                "- Solution / How-To → weave in primary and secondary keywords for solution-oriented search",
                "- Comparison / Analysis → embed semantic keywords to prevent topical drift into tangents",
                "- Case Studies / Evidence → surface content gap keywords as differentiation proof points",
                "- Future / Trends → leverage trending and content gap keywords for forward-looking authority",
                "",
                "GUIDELINE: Treat these as the primary keyword anchors. You may include closely related",
                "intent-matching variations where natural, but avoid inserting every raw research keyword.",
                "Quality over density — each keyword earns its place by serving a clear structural purpose.",
            ]
        )

        stats = payload.get("stats")
        if isinstance(stats, dict) and stats:
            lines.append(
                f"\n[From {stats.get('total_raw', '?')} raw research keywords "
                f"→ curated to {stats.get('total_curated', '?')} locked keywords "
                f"({stats.get('reduction_pct', '?')}% reduction)]"
            )

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _source_key(slot_key: str) -> str:
        """Map internal slot key to the actual field name in keyword_analysis.

        Unknown slot keys are returned unchanged.
        """
        return KeywordCurator._SOURCE_KEYS.get(slot_key, slot_key)

    @staticmethod
    def _terms(curated: Dict[str, Any], key: str) -> List[str]:
        """Return the keyword group stored under ``key`` as a list of strings.

        A bare string is treated as a one-item group so it is never indexed or
        joined character by character; ``None``, missing keys and other
        non-sequence values yield an empty list.
        """
        value = curated.get(key)
        if isinstance(value, str):
            return [value] if value else []
        if isinstance(value, (list, tuple)):
            return [str(term) for term in value if term is not None]
        return []

    def _pick(
        self,
        data: Dict[str, Any],
        source_key: str,
        slot_key: Optional[str] = None,
    ) -> List[str]:
        """
        Pick up to N items from a keyword list with diversity sampling.

        Entries that are not non-blank strings are discarded first. Then:
        - at most N entries: all of them are returned;
        - at most 2N entries: the first N are returned;
        - more than 2N entries: N evenly-spaced entries (always including the
          first and the last) are returned, to capture semantic diversity
          rather than just the first N entries.

        The result is always a new list, so callers can mutate it without
        touching the research data.

        Args:
            data: The raw keyword_analysis dict.
            source_key: The actual key in the dict (e.g. 'semantic_keywords').
            slot_key: The internal slot name for looking up the limit.
                      Falls back to source_key if not provided.
        Returns:
            List of at most N strings; empty if the field is missing or is
            not a list.
        """
        limit = self.SLOTS.get(slot_key or source_key, 5)
        raw: Any = data.get(source_key, [])
        if not isinstance(raw, list) or limit <= 0:
            return []

        # Honour the declared List[str] contract: drop None, numbers, dicts
        # and blank strings that would otherwise break prompt formatting.
        keywords: List[str] = [
            item for item in raw if isinstance(item, str) and item.strip()
        ]

        if len(keywords) <= limit * 2:
            return keywords[:limit]
        if limit == 1:
            return [keywords[0]]

        # len(keywords) > 2 * limit, so the step exceeds 2 and the rounded
        # interior indices are strictly increasing and never collide with the
        # first or last position.
        last = len(keywords) - 1
        step = last / (limit - 1)
        indices = [0] + [round(i * step) for i in range(1, limit - 1)] + [last]
        return [keywords[i] for i in indices]