ALwrity/backend/services/seo_tools/ai_visibility_insights_service.py

"""
AI Visibility Insights Service

Detects Google AI Overview impact signals from GSC search analytics data.

Core heuristic:
  - AIO Impacted keywords: high impressions + high position (top 3) + very low CTR
    → content likely being shown/cited in Google AI Overviews without clicks
  - AIO Opportunity keywords: strong CTR + moderate position
    → content already performing well, potential for AIO citation with optimization

All thresholds are configurable for flexibility.
"""

from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from loguru import logger

from services.gsc_service import GSCService


@dataclass
class AIOThresholds:
    """Configurable thresholds for AI Overview detection."""

    # AIO Impacted detection
    impacted_min_impressions: int = 500
    impacted_max_position: float = 4.0
    impacted_max_ctr: float = 2.0

    # AIO Opportunity detection
    opportunity_min_impressions: int = 300
    opportunity_min_position: float = 4.0
    opportunity_max_position: float = 10.0
    opportunity_min_ctr: float = 5.0


@dataclass
class AIOVisibilityResult:
    """Structured result from AI Overview analysis."""

    summary: Dict[str, Any] = field(default_factory=dict)
    impacted_keywords: List[Dict[str, Any]] = field(default_factory=list)
    opportunity_keywords: List[Dict[str, Any]] = field(default_factory=list)
    recommendations: List[str] = field(default_factory=list)
    error: Optional[str] = None


class AIVisibilityInsightsService:
    """Analyze GSC data for AI Overview impact signals."""

    def __init__(self, gsc_service: GSCService):
        self.gsc_service = gsc_service

    def analyze(
        self,
        user_id: str,
        site_url: str,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        thresholds: Optional[AIOThresholds] = None,
    ) -> AIOVisibilityResult:
        """
        Analyze GSC data for AI Overview insights.

        Args:
            user_id: Clerk user ID
            site_url: Verified GSC site URL (e.g., "https://example.com/")
            start_date: ISO date string; defaults to 30 days ago
            end_date: ISO date string; defaults to today
            thresholds: Custom thresholds; uses defaults if omitted

        Returns:
            AIOVisibilityResult with summary, keyword lists, and recommendations
        """
        t = thresholds or AIOThresholds()
        result = AIOVisibilityResult()

        try:
            # Set date defaults
            if not end_date:
                end_date = datetime.now().strftime("%Y-%m-%d")
            if not start_date:
                start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")

            logger.info(
                f"AIVisibility: analyzing {site_url} for user {user_id} "
                f"({start_date} to {end_date})"
            )

            # Fetch GSC search analytics
            analytics = self.gsc_service.get_search_analytics(
                user_id=user_id,
                site_url=site_url,
                start_date=start_date,
                end_date=end_date,
            )

            # Validate response
            error = analytics.get("error")
            if error:
                result.error = error
                return result

            query_data = analytics.get("query_data", {})
            rows = query_data.get("rows", [])
            if not rows:
                result.error = "No query data returned from GSC"
                return result

            # Parse and classify each keyword
            total_keywords = 0
            total_impressions = 0
            total_clicks = 0
            aio_impressions = 0
            aio_estimated_clicks = 0
            impact_count = 0
            opportunity_count = 0

            impacted_list = []
            opportunity_list = []

            for row in rows:
                keys = row.get("keys", [])
                keyword = keys[0] if keys else "(not set)"
                impressions = row.get("impressions", 0)
                clicks = row.get("clicks", 0)
                ctr_decimal = row.get("ctr", 0)
                ctr_pct = round(ctr_decimal * 100, 2)
                position = round(row.get("position", 0), 1)

                total_keywords += 1
                total_impressions += impressions
                total_clicks += clicks

                entry = {
                    "keyword": keyword,
                    "impressions": impressions,
                    "clicks": clicks,
                    "ctr": ctr_pct,
                    "position": position,
                }

                # AIO Impacted: high impressions, top position, very low CTR
                if (
                    impressions >= t.impacted_min_impressions
                    and position <= t.impacted_max_position
                    and ctr_pct <= t.impacted_max_ctr
                ):
                    # Estimate what clicks WOULD be at a healthy top-3 CTR (~8%)
                    target_ctr = 8.0
                    expected_clicks = int(impressions * target_ctr / 100)
                    traffic_loss = max(0, expected_clicks - clicks)

                    entry["estimated_traffic_loss"] = traffic_loss
                    entry["target_ctr"] = target_ctr
                    entry["aio_impacted"] = True
                    impacted_list.append(entry)
                    aio_impressions += impressions
                    aio_estimated_clicks += traffic_loss
                    impact_count += 1

                # AIO Opportunity: good CTR, position 4-10 — strong enough to target AIO citation
                if (
                    impressions >= t.opportunity_min_impressions
                    and t.opportunity_min_position <= position <= t.opportunity_max_position
                    and ctr_pct >= t.opportunity_min_ctr
                ):
                    entry["aio_opportunity"] = True
                    entry["recommendation"] = self._suggest_aio_format(keyword, position, ctr_pct)
                    opportunity_list.append(entry)
                    opportunity_count += 1

            # Sort by impact/opportunity
            impacted_list.sort(key=lambda x: x.get("estimated_traffic_loss", 0), reverse=True)
            opportunity_list.sort(key=lambda x: x["impressions"], reverse=True)

            # Compute summary
            avg_ctr = round((total_clicks / total_impressions * 100) if total_impressions else 0, 2)
            avg_position = (
                round(
                    sum(r.get("position", 0) for r in rows) / len(rows), 1
                )
                if rows
                else 0
            )

            result.summary = {
                "total_keywords_analyzed": total_keywords,
                "total_impressions": total_impressions,
                "total_clicks": total_clicks,
                "average_ctr": avg_ctr,
                "average_position": avg_position,
                "aio_impacted_keywords": impact_count,
                "aio_opportunity_keywords": opportunity_count,
                "aio_zero_click_impressions": aio_impressions,
                "aio_estimated_traffic_loss": aio_estimated_clicks,
                "date_range": {"start": start_date, "end": end_date},
                "thresholds_used": {
                    "impacted": {
                        "min_impressions": t.impacted_min_impressions,
                        "max_position": t.impacted_max_position,
                        "max_ctr": t.impacted_max_ctr,
                    },
                    "opportunity": {
                        "min_impressions": t.opportunity_min_impressions,
                        "min_position": t.opportunity_min_position,
                        "max_position": t.opportunity_max_position,
                        "min_ctr": t.opportunity_min_ctr,
                    },
                },
            }

            # Build recommendations
            result.recommendations = self._build_recommendations(
                impacted_list, opportunity_list, result.summary
            )

            result.impacted_keywords = impacted_list[:20]
            result.opportunity_keywords = opportunity_list[:20]

            logger.info(
                f"AIVisibility: analysis complete for {site_url} — "
                f"{impact_count} impacted, {opportunity_count} opportunities"
            )

        except Exception as e:
            logger.error(f"AIVisibility: analysis error for {user_id}: {e}")
            result.error = str(e)

        return result

    @staticmethod
    def _suggest_aio_format(keyword: str, position: float, ctr: float) -> str:
        """Suggest content format for AIO optimization based on keyword pattern."""
        kw_lower = keyword.lower()

        if any(w in kw_lower for w in ["how", "steps", "guide", "tutorial", "way to"]):
            return "Create a step-by-step guide with clear numbered lists for AIO citation"
        if any(w in kw_lower for w in ["what", "define", "meaning", "explain", "overview"]):
            return "Add a concise definition/summary block at the top of the article"
        if any(w in kw_lower for w in ["vs", "versus", "difference", "comparison", "or"]):
            return "Use a structured comparison table — AI crawlers favor tabular data"
        if any(w in kw_lower for w in ["best", "top", "recommended", "review"]):
            return "Format as a ranked list with bullet-point pros/cons for AI snippet extraction"
        if any(w in kw_lower for w in ["why", "reason", "cause", "benefit"]):
            return "Include a bullet-point summary of key reasons/benefits for AIO extraction"
        if any(w in kw_lower for w in ["price", "cost", "pricing", "cheap", "affordable"]):
            return "Add a pricing/comparison table — highly structured data for AI citation"
        if any(w in kw_lower for w in ["example", "sample", "template", "checklist"]):
            return "Provide actionable examples or a downloadable template checklist"

        if position <= 3 and ctr < 3:
            return "Optimize content with FAQ schema and concise summary paragraphs to reclaim AIO clicks"
        if position <= 5:
            return "Add structured data markup (FAQ, HowTo) and a TL;DR box for AI Overview targeting"
        return "Improve content depth with data-backed insights and structured formatting for AI snippet eligibility"

    @staticmethod
    def _build_recommendations(
        impacted: List[Dict[str, Any]],
        opportunities: List[Dict[str, Any]],
        summary: Dict[str, Any],
    ) -> List[str]:
        """Generate AI Overview optimization recommendations."""
        recs = []
        impacted_count = summary.get("aio_impacted_keywords", 0)
        opportunity_count = summary.get("aio_opportunity_keywords", 0)
        traffic_loss = summary.get("aio_estimated_traffic_loss", 0)

        if impacted_count > 0:
            recs.append(
                f"⚠️ {impacted_count} keyword(s) show AI Overview impact signals "
                f"(estimated {traffic_loss} lost clicks). "
                "Add concise, structured summary blocks early in your content to reclaim visibility."
            )
        if opportunity_count > 0:
            recs.append(
                f"✅ {opportunity_count} keyword(s) are strong AIO optimization candidates. "
                "Apply FAQ schema, HowTo schema, and clear bullet-point summaries."
            )
        if impacted_count == 0 and opportunity_count == 0:
            recs.append(
                "No clear AI Overview signals detected. "
                "Consider expanding your keyword coverage in conversational/intent-based queries."
            )

        recs.append(
            "General AIO best practices: "
            "1) Use FAQ schema for question-based queries, "
            "2) Add <table> elements for comparative data, "
            "3) Keep key takeaways in the first 100 words, "
            "4) Use descriptive headings (H2/H3) that mirror natural language queries."
        )

        return recs