# ALwrity/backend/services/intelligence/agents/content_gap_radar_agent.py

"""
Content Gap Radar Agent

Scores and prioritizes content opportunities by combining SIF semantic gap analysis,
SERP ranking presence (Google CSE), competitor content deep-dive (Exa), and trend
momentum into a single ROI score per topic.

Phase 3 of the Content Gap Radar feature.
"""

import traceback
from typing import List, Dict, Any, Optional
from loguru import logger

from services.intelligence.agents.specialized import SIFBaseAgent
from services.intelligence.agents.specialized.strategy_architect import StrategyArchitectAgent
from services.intelligence.agents.trend_surfer_agent import TrendSurferAgent
from services.intelligence.agents.core_agent_framework import TaskProposal
from services.intelligence.txtai_service import TxtaiIntelligenceService
from services.seo_tools.serp_gap_service import SerpGapService
from services.seo_tools.competitor_content_service import CompetitorContentService


class ContentGapRadarAgent(SIFBaseAgent):
    """
    Agent that scores and prioritizes content opportunities by combining
    SIF semantic gap analysis, SERP ranking presence, Exa competitor content,
    and trend momentum into a single ROI score.
    """

    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, **kwargs: Any) -> None:
        """
        Initialise the agent and the collaborators its pipeline relies on.

        Args:
            intelligence_service: Intelligence service passed to the base agent
                and to the nested StrategyArchitectAgent.
            user_id: Identifier of the user this agent instance works for.
            **kwargs: Extra options forwarded unchanged to SIFBaseAgent.
        """
        # The base class is registered under the "content_gap_radar" agent type.
        super().__init__(intelligence_service, user_id, agent_type="content_gap_radar", **kwargs)
        # Stored on the instance; analyze() passes it on to TrendSurferAgent.
        self.user_id = user_id
        # Queried by analyze() for per-topic SERP gap data.
        self.serp_service = SerpGapService()
        # Queried by analyze() for the competitor content deep-dive.
        self.competitor_content_service = CompetitorContentService()
        # Supplies the SIF semantic gaps (find_semantic_gaps) used to derive topics.
        self.strategy_architect = StrategyArchitectAgent(intelligence_service, user_id)

    async def analyze(
        self,
        competitor_domains: List[str],
        competitor_indices: Optional[List[Any]] = None,
        topics: Optional[List[str]] = None,
        bypass_cache: bool = False,
    ) -> Dict[str, Any]:
        """
        Run the content gap radar end to end and return ROI-ranked topics.

        Stages:
            1. Resolve the topic list (caller-supplied, else the first 12 SIF gaps).
            2. Fetch SERP gap data for every topic.
            3. Fetch competitor content for the first 6 topics only.
            4. Fetch trend signals.
            5. Score every topic and order the results by ROI, highest first.
            6. Tally the priorities into a summary.

        Args:
            competitor_domains: Known competitor domains.
            competitor_indices: SIF index positions for competitor docs.
            topics: Optional explicit topic list (derived from SIF if omitted).
            bypass_cache: Forwarded to the SERP and competitor-content services.

        Returns:
            ``{"gaps": [...], "summary": {...}}``. Both are empty when there is
            nothing to analyze. If any stage raises, the same empty shape is
            returned with an additional ``"error"`` message instead of raising.
        """
        self._log_agent_operation(
            "Running content gap radar",
            competitor_count=len(competitor_domains),
            topics_provided=bool(topics),
        )

        tag = f"[{self.__class__.__name__}]"
        index_positions = competitor_indices or []

        try:
            semantic_gaps = []

            # Stage 1: without explicit topics, the SIF gaps supply them.
            if not topics:
                semantic_gaps = await self.strategy_architect.find_semantic_gaps(
                    index_positions
                )
                topics = [gap["topic"] for gap in semantic_gaps[:12]]
                logger.info(f"{tag} Derived {len(topics)} topics from SIF gaps")

            if not topics:
                logger.info(f"{tag} No topics to analyze")
                return {"gaps": [], "summary": {}}

            # Caller-supplied topics skip the fetch above, so SIF data is pulled
            # here. Scoring has a default for topics without SIF data, which is
            # why a failure at this point is only logged.
            if not semantic_gaps:
                try:
                    semantic_gaps = await self.strategy_architect.find_semantic_gaps(
                        index_positions
                    )
                except Exception as sif_error:
                    logger.warning(
                        f"{tag} SIF gap fetch failed (non-fatal): {sif_error}"
                    )
                    semantic_gaps = []

            # Per-topic lookup tables used by the scorer.
            sif_by_topic = {gap["topic"]: gap for gap in semantic_gaps}

            # Stage 2: SERP gap analysis.
            serp_payload = await self.serp_service.analyze_topic_gaps(
                topics, competitor_domains, bypass_cache=bypass_cache
            )
            serp_by_topic = {
                row["topic"]: row for row in serp_payload.get("gaps", [])
            }

            # Stage 3: Exa deep-dive, limited to the first 6 topics (paid API).
            exa_payload = await self.competitor_content_service.deep_dive(
                topics[:6], competitor_domains, bypass_cache=bypass_cache
            )
            exa_by_topic = {
                row["topic"]: row for row in exa_payload.get("results", [])
            }

            # Stage 4: trend momentum signals.
            surfer = TrendSurferAgent(self.intelligence, self.user_id)
            signals = await surfer.surf_trends()

            # Stage 5: score every topic, best ROI first.
            ranked = sorted(
                (
                    self._score_topic(
                        topic=name,
                        sif_map=sif_by_topic,
                        serp_map=serp_by_topic,
                        exa_map=exa_by_topic,
                        trend_signals=signals,
                    )
                    for name in topics
                ),
                key=lambda item: item["roi_score"],
                reverse=True,
            )

            # Stage 6: count how many topics landed in each priority bucket.
            tally = {"high": 0, "medium": 0, "low": 0}
            for item in ranked:
                level = item["priority"]
                if level in tally:
                    tally[level] += 1

            logger.info(
                f"{tag} Scored {len(ranked)} gaps: "
                f"{tally['high']} high, {tally['medium']} medium, {tally['low']} low"
            )

            return {
                "gaps": ranked,
                "summary": {
                    "total_topics_analyzed": len(topics),
                    "high_priority": tally["high"],
                    "medium_priority": tally["medium"],
                    "low_priority": tally["low"],
                },
            }

        except Exception as e:
            logger.error(f"{tag} Content gap radar failed: {e}")
            logger.error(f"{tag} Full traceback: {traceback.format_exc()}")
            return {"gaps": [], "summary": {}, "error": str(e)}

    async def propose_daily_tasks(self, context: Dict[str, Any]) -> List[TaskProposal]:
        """
        Propose high-ROI content tasks from gap radar analysis.
        Integrates with Today's Workflow agent committee polling.

        Args:
            context: Workflow context. Competitor domains are read from
                ``context["onboarding_data"]["competitor_focus"]["top_competitor_domains"]``;
                any level may be missing or null.

        Returns:
            Up to three TaskProposal objects for the highest-ROI gaps whose
            priority is "high" or "medium". Empty when no competitor domains
            are available or the analysis fails.
        """
        proposals: List[TaskProposal] = []

        # `or {}` / `or []` also covers keys that exist but hold null, which a
        # plain `.get(key, default)` would pass through as None and then crash on.
        onboarding = (context or {}).get("onboarding_data") or {}
        competitor_focus = onboarding.get("competitor_focus") or {}
        competitor_domains = competitor_focus.get("top_competitor_domains") or []

        if not competitor_domains:
            logger.info(f"[{self.__class__.__name__}] No competitor domains in context, skipping")
            return proposals

        try:
            result = await self.analyze(
                competitor_domains=competitor_domains,
                competitor_indices=[],
            )
        except Exception as e:
            logger.error(f"[{self.__class__.__name__}] propose_daily_tasks failed: {e}")
            return proposals

        gaps = result.get("gaps") or []
        scored = [g for g in gaps if g["priority"] in ("high", "medium")]
        scored.sort(key=lambda x: x["roi_score"], reverse=True)

        for gap in scored[:3]:
            pillar_id = self._action_to_pillar(gap["recommended_action"])
            # "generate" tasks open the blog writer; everything else goes to the radar view.
            action_url = (
                "/blog-writer"
                if pillar_id == "generate"
                else "/seo-dashboard#content-gap-radar"
            )
            proposals.append(TaskProposal(
                title=f"Write about: {gap['topic']}",
                description=gap["recommended_action"],
                pillar_id=pillar_id,
                priority=gap["priority"],
                estimated_time=60 if pillar_id == "generate" else 30,
                source_agent="ContentGapRadarAgent",
                reasoning=(
                    f"Content gap with {gap['scoring']['gap_size']:.0%} gap size, "
                    f"{gap['scoring']['volume']:.0%} volume, "
                    f"{gap['scoring']['trend']:.0%} trend momentum, "
                    f"ROI {gap['roi_score']:.0%}"
                ),
                action_type="navigate",
                action_url=action_url,
                context_data={"gap": gap},
            ))

        return proposals

    @staticmethod
    def _action_to_pillar(recommended_action: str) -> str:
        action_lower = recommended_action.lower()
        if "optimize" in action_lower:
            return "analyze"
        return "generate"

    def _score_topic(
        self,
        topic: str,
        sif_map: Dict[str, Any],
        serp_map: Dict[str, Any],
        exa_map: Dict[str, Any],
        trend_signals: List[Any],
    ) -> Dict[str, Any]:
        """Score a single topic with the ROI formula."""
        # gap_size: from SIF coverage_delta
        sif = sif_map.get(topic, {})
        gap_size = sif.get("coverage_delta", 0.5)

        # volume: from SERP gap — competitors ranking for this topic
        serp = serp_map.get(topic, {})
        comp_count = serp.get("competitor_count", 0)
        total_domains = serp.get("total_domains_checked", 1)
        volume = min(comp_count / max(total_domains, 1), 1.0)

        # trend: match topic against TrendSurfer signals
        trend_score = self._match_trend_score(topic, trend_signals)

        # intent: classify topic commercial value
        intent = self._classify_intent(topic)

        # competition: Exa content depth as penalty
        exa = exa_map.get(topic, {})
        content_count = exa.get("total_results", 0)
        competition = min(content_count / 10.0, 1.0)

        # ROI = (gap_size × volume × trend × intent) × (1 - 0.3 × competition)
        base_roi = gap_size * volume * trend_score * intent
        roi = base_roi * (1 - 0.3 * competition)

        # Priority thresholds
        if roi >= 0.6:
            priority = "high"
        elif roi >= 0.3:
            priority = "medium"
        else:
            priority = "low"

        # Recommended action based on scoring profile
        action = self._recommend_action(gap_size, competition, intent)

        return {
            "topic": topic,
            "roi_score": round(roi, 3),
            "priority": priority,
            "recommended_action": action,
            "scoring": {
                "gap_size": round(gap_size, 3),
                "volume": round(volume, 3),
                "trend": round(trend_score, 3),
                "intent": round(intent, 3),
                "competition": round(competition, 3),
            },
            "sif_gap": sif if sif else None,
            "serp_evidence": {
                "competitors_found": serp.get("competitors_found", []),
                "competitor_count": comp_count,
                "domains_with_content": serp.get("domains_with_content", []),
            } if serp else None,
            "competitor_content": exa if exa else None,
        }

    def _match_trend_score(self, topic: str, signals: List[Dict[str, Any]]) -> float:
        if not signals:
            return 0.5

        topic_lower = topic.lower()
        topic_words = set(topic_lower.split())

        best_score = 0.0
        for signal in signals:
            impact = signal.get("impact_score", 0.5)
            text_fields = " ".join(filter(None, [
                signal.get("topic", ""),
                signal.get("headline", ""),
                signal.get("suggested_angle", ""),
            ]))
            text_lower = text_fields.lower()

            if topic_lower in text_lower:
                best_score = max(best_score, impact)

            text_words = set(text_lower.split())
            overlap = len(topic_words & text_words)
            if overlap > 0:
                word_score = (overlap / max(len(topic_words), 1)) * impact
                best_score = max(best_score, word_score)

        return max(best_score, 0.5)

    def _classify_intent(self, topic: str) -> float:
        """
        Classify topic intent using LLM with keyword fallback.
        Returns intent score 0.0-1.0.
        """
        topic_lower = topic.lower()

        # Keyword-based heuristics
        commercial_words = [
            "best", "top", "review", "vs", "comparison", "alternative",
            "vs.", "versus", "pricing", "cost", "price", "cheap",
            "affordable", "discount", "coupon", "deal", "buy",
        ]
        transactional_words = [
            "buy", "purchase", "order", "subscribe", "sign up",
            "download", "get started", "free trial", "demo",
        ]

        has_commercial = any(w in topic_lower for w in commercial_words)
        has_transactional = any(w in topic_lower for w in transactional_words)

        if has_transactional:
            return 0.9
        if has_commercial:
            return 0.7
        return 0.4  # Informational default

    def _recommend_action(
        self, gap_size: float, competition: float, intent: float
    ) -> str:
        """Generate a recommended action based on scoring profile."""
        if gap_size > 0.7 and competition < 0.3:
            return "Create comprehensive pillar page — large gap, low competition"
        elif gap_size > 0.5 and intent > 0.6:
            return "Create high-conversion content — significant gap, strong intent"
        elif competition > 0.7:
            return "Create differentiated content — high competition requires unique angle"
        elif gap_size < 0.3:
            return "Optimize existing content — incremental gap, update current pages"
        else:
            return "Create targeted blog post — moderate opportunity"

    async def generate_content_brief(
        self,
        topic: str,
        recommended_action: str,
        scoring: Optional[Dict[str, float]] = None,
        serp_evidence: Optional[Dict[str, Any]] = None,
        sif_gap: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Generate a structured content brief from a gap item.

        Asks the LLM for title options, outline sections, target keywords and
        a writing angle. If the LLM call fails or its reply contains no JSON
        object, a template-based brief is used instead. Keys missing from the
        LLM's JSON are filled in from that template.

        Args:
            topic: Topic the brief is written for.
            recommended_action: Action text included in the prompt and echoed back.
            scoring: Optional signal values ("gap_size", "volume", "trend",
                "intent", "competition"); missing or non-numeric values count as 0.5.
            serp_evidence: Optional SERP evidence; up to three
                "competitors_found" entries are quoted in the prompt.
            sif_gap: Optional SIF gap entry ("coverage_delta", "confidence").

        Returns:
            Dict with "topic", "recommended_action", "brief" and the original
            "scoring" argument.
        """
        import json as _json

        def _number(source: Optional[Dict[str, Any]], key: str, default: float) -> float:
            # A present-but-null or non-numeric value must not break the
            # percentage formatting used in the prompt below.
            value = (source or {}).get(key)
            return value if isinstance(value, (int, float)) else default

        gap_size = _number(scoring, "gap_size", 0.5)
        volume = _number(scoring, "volume", 0.5)
        trend = _number(scoring, "trend", 0.5)
        intent = _number(scoring, "intent", 0.5)
        competition = _number(scoring, "competition", 0.5)
        word_count = 800 if competition > 0.7 else (1200 if gap_size > 0.5 else 600)

        # Prompt context is assembled defensively: it runs before the LLM
        # try/except, so an exception here would bypass the template fallback.
        serp_context = ""
        ranking = (serp_evidence or {}).get("competitors_found") or []
        snippets = []
        for entry in ranking[:3]:
            if isinstance(entry, dict):
                title = entry.get("title") or ""
                snippet = str(entry.get("snippet") or "")[:100]
                snippets.append(f"- {title}: {snippet}")
            elif entry:
                # Non-dict entries (e.g. bare strings) are listed as-is.
                snippets.append(f"- {entry}")
        if snippets:
            serp_context = "Competitor content already ranking:\n" + "\n".join(snippets)

        sif_context = ""
        if sif_gap:
            sif_context = (
                f"SIF coverage delta: {_number(sif_gap, 'coverage_delta', 0):.2%}, "
                f"confidence: {_number(sif_gap, 'confidence', 0):.2%}"
            )

        prompt = f"""You are a senior content strategist. Create a detailed content brief for the topic below.

TOPIC: {topic}
RECOMMENDED ACTION: {recommended_action}
{serp_context}
{sif_context}

Scoring profile:
- Gap size: {gap_size:.0%}
- Search volume: {volume:.0%}
- Trend momentum: {trend:.0%}
- Intent score: {intent:.0%}
- Competition level: {competition:.0%}
- Target word count: {word_count}

Return a JSON object with these exact keys:
{{
  "titles": ["Title option 1", "Title option 2", "Title option 3"],
  "outline": [
    {{"heading": "Section heading", "key_points": ["point 1", "point 2", "point 3"]}}
  ],
  "keywords": ["keyword1", "keyword2", "keyword3", "keyword4", "keyword5"],
  "angle": "A single paragraph describing the strategic writing angle",
  "word_count": {word_count}
}}

Generate 4-6 outline sections. Only return valid JSON, no other text."""

        def _template_brief() -> Dict[str, Any]:
            # Multi-word topics add their last word as a keyword; single-word
            # (or empty) topics add two generic keywords instead.
            words = topic.split()
            if len(words) > 1:
                keywords = [topic, words[-1]]
            else:
                keywords = [topic, "guide", "strategy"]
            return {
                "titles": [
                    f"The Ultimate Guide to {topic}",
                    f"{topic}: Strategies That Actually Work",
                    f"Why {topic} Matters More Than Ever",
                ],
                "outline": [
                    {"heading": f"Introduction to {topic}", "key_points": ["Context and importance", "What this guide covers"]},
                    {"heading": "Why This Matters", "key_points": ["Current landscape", "Key challenges and opportunities"]},
                    {"heading": "Key Strategies", "key_points": ["Strategy 1 with examples", "Strategy 2 with implementation tips", "Strategy 3 for advanced practitioners"]},
                    {"heading": "Common Pitfalls to Avoid", "key_points": ["Mistake 1 and how to avoid it", "Mistake 2 and how to avoid it"]},
                    {"heading": "Measuring Success", "key_points": ["Key metrics to track", "Tools and methods for measurement"]},
                    {"heading": "Conclusion & Next Steps", "key_points": ["Summary of key takeaways", "Actionable next steps"]},
                ],
                "keywords": keywords,
                "angle": f"Create comprehensive, actionable content about {topic} that fills the gap identified in competitor analysis. Focus on providing unique insights and practical implementation guidance.",
                "word_count": word_count,
            }

        try:
            response = await self._generate_llm_response(prompt)
            # The reply may wrap the JSON in extra text; keep the outermost {...}.
            start = response.find("{")
            end = response.rfind("}") + 1
            if start >= 0 and end > start:
                brief = _json.loads(response[start:end])
            else:
                raise ValueError("No JSON found in LLM response")
            if not isinstance(brief, dict):
                raise ValueError("LLM response JSON is not an object")
            # Guarantee the documented keys even when the LLM omits some.
            for key, fallback in _template_brief().items():
                brief.setdefault(key, fallback)
        except Exception as e:
            logger.warning(
                f"[{self.__class__.__name__}] LLM brief generation failed, using template: {e}"
            )
            brief = _template_brief()

        return {
            "topic": topic,
            "recommended_action": recommended_action,
            "brief": brief,
            "scoring": scoring,
        }