# ALwrity/backend/services/seo_tools/competitor_content_service.py

"""
Competitor Content Service for ALwrity

Fetches full competitor content for gap topics using Exa with include_domains.
Phase 2 of the Content Gap Radar feature.

Usage:
    service = CompetitorContentService()
    result = await service.deep_dive(
        topics=["AI content strategy"],
        competitor_domains=["example.com"]
    )
"""

import asyncio
import copy
import hashlib
import json
import os
import time
from typing import Dict, List, Optional, Any
from urllib.parse import urlparse

from loguru import logger


class CompetitorContentService:
    """
    Fetches competitor content for gap topics using Exa neural search.

    Uses Exa's `include_domains` to scope searches to known competitor domains,
    returning full text, highlights, and summaries for deeper competitive analysis.

    Successful results are kept in a per-instance, in-memory cache for
    `CACHE_TTL` seconds (default 86400, i.e. 24h; overridable through the
    COMPETITOR_CONTENT_CACHE_TTL environment variable) to reduce API costs.
    Responses that contain a per-topic error are never cached, so a transient
    failure is retried on the next call instead of being replayed.

    Designed to be consumed by the future ContentGapRadarAgent.
    """

    # Cache lifetime in seconds, read once when the class is defined.
    CACHE_TTL = int(os.getenv("COMPETITOR_CONTENT_CACHE_TTL", "86400"))

    def __init__(self):
        """Read EXA_API_KEY from the environment and start with an empty cache."""
        self.api_key = os.getenv("EXA_API_KEY")
        if not self.api_key:
            logger.warning(
                "EXA_API_KEY not configured; CompetitorContentService disabled"
            )
        self._exa = None
        self._cache: Dict[str, Dict[str, Any]] = {}

    @property
    def exa(self):
        """Lazy-init Exa SDK to allow env injection after import.

        Returns None when no API key is configured.
        """
        if self._exa is None and self.api_key:
            from exa_py import Exa
            self._exa = Exa(self.api_key)
        return self._exa

    def _cache_key(
        self,
        topics: List[str],
        domains: List[str],
        max_results: Optional[int] = None,
    ) -> str:
        """Build an order-insensitive cache key for a request.

        Args:
            topics: Topic phrases of the request.
            domains: Competitor domains of the request.
            max_results: Per-topic result limit. It is part of the key so a
                request with a different limit is not served a payload that
                was fetched under another limit.

        Returns:
            Hex digest identifying the (topics, domains, max_results) triple.
        """
        raw = json.dumps(
            {"t": sorted(topics), "d": sorted(domains), "n": max_results},
            sort_keys=True,
        )
        # The digest is only a dictionary key; SHA-256 is used because MD5 can
        # be unavailable on FIPS-restricted Python builds.
        return hashlib.sha256(raw.encode("utf-8")).hexdigest()

    def _get_cached(self, key: str) -> Optional[Dict[str, Any]]:
        """Return a private copy of the cached payload for `key`, or None.

        An entry older than `CACHE_TTL` is evicted and treated as a miss.
        """
        entry = self._cache.get(key)
        if not entry:
            return None
        if (time.time() - entry["ts"]) >= self.CACHE_TTL:
            # Expired: drop it so stale payloads do not pile up in memory.
            self._cache.pop(key, None)
            return None
        # Hand out a copy so callers cannot mutate the stored payload.
        return copy.deepcopy(entry["data"])

    def _set_cache(self, key: str, data: Dict[str, Any]):
        """Store a private copy of `data` under `key` and purge expired entries."""
        now = time.time()
        stale_keys = [
            k for k, entry in self._cache.items()
            if (now - entry["ts"]) >= self.CACHE_TTL
        ]
        for stale in stale_keys:
            del self._cache[stale]
        # Copy on the way in: the caller keeps (and may mutate) `data`.
        self._cache[key] = {"data": copy.deepcopy(data), "ts": now}

    async def deep_dive(
        self,
        topics: List[str],
        competitor_domains: List[str],
        max_total_results: int = 10,
        concurrency: int = 3,
        bypass_cache: bool = False,
    ) -> Dict[str, Any]:
        """
        Fetch competitor content for a list of gap topics.

        For each topic, searches Exa scoped to competitor domains and returns
        full text, highlights, and publishing metadata.

        Args:
            topics: Topic phrases to research (e.g. from SERP gap analysis)
            competitor_domains: Known competitor domains to scope search
            max_total_results: Max results requested per topic
            concurrency: Max concurrent Exa API calls; values below 1 are
                treated as 1
            bypass_cache: Force fresh API calls, ignoring cache

        Returns:
            Dict with keys:
                results: List of per-topic competitor content results
                total_topics_analyzed: int
                topics_with_content: int
                cached: bool
                error: str, present only when EXA_API_KEY is not configured
        """
        if not topics or not competitor_domains:
            return {
                "results": [],
                "total_topics_analyzed": 0,
                "topics_with_content": 0,
                "cached": False,
            }

        ck = self._cache_key(topics, competitor_domains, max_total_results)
        if not bypass_cache:
            cached = self._get_cached(ck)
            if cached is not None:
                logger.info("Returning cached competitor content results")
                return {**cached, "cached": True}

        if not self.api_key or not self.exa:
            return {
                "results": [],
                "total_topics_analyzed": len(topics),
                "topics_with_content": 0,
                "cached": False,
                "error": "EXA_API_KEY not configured",
            }

        # Semaphore(0) would block every acquire forever and a negative value
        # raises ValueError, so always allow at least one in-flight search.
        semaphore = asyncio.Semaphore(max(1, concurrency))
        loop = asyncio.get_running_loop()

        async def search_topic(topic: str) -> Dict[str, Any]:
            async with semaphore:
                return await self._search_single_topic(
                    topic, competitor_domains, max_total_results, loop
                )

        # gather() preserves input order, so results[i] belongs to topics[i].
        results = await asyncio.gather(
            *(search_topic(topic) for topic in topics)
        )

        output = {
            "results": results,
            "total_topics_analyzed": len(topics),
            "topics_with_content": sum(
                1 for r in results if r.get("total_results", 0) > 0
            ),
            "cached": False,
        }
        # Only cache fully successful responses; a failed topic should be
        # retried on the next call rather than replayed until the TTL expires.
        if not any("error" in r for r in results):
            self._set_cache(ck, output)
        return output

    async def _search_single_topic(
        self,
        topic: str,
        competitor_domains: List[str],
        max_results: int,
        loop: asyncio.AbstractEventLoop,
    ) -> Dict[str, Any]:
        """
        Search Exa for a single topic, scoped to competitor domains.

        The blocking SDK call runs in the loop's default executor. Any
        exception raised by the search is logged and reported in the returned
        dict instead of being propagated.

        Args:
            topic: Topic phrase used as the search query.
            competitor_domains: Domains passed to Exa as `include_domains`.
            max_results: Number of results requested from Exa.
            loop: Event loop whose default executor runs the SDK call.

        Returns:
            Dict with keys `topic`, `competitor_content` (one entry per unique
            URL), `total_results`, `domains_found` (sorted), and `error` when
            the search failed.
        """
        search_kwargs = {
            "type": "auto",
            "num_results": max_results,
            "include_domains": competitor_domains,
            "text": {"max_characters": 2000},
            "highlights": {"num_sentences": 3, "highlights_per_url": 3},
            "summary": {"query": f"Key details about {topic}"},
        }

        try:
            client = self.exa
            response = await loop.run_in_executor(
                None,
                lambda: client.search_and_contents(topic, **search_kwargs),
            )

            content = []
            seen_urls = set()
            for result in getattr(response, "results", []) or []:
                url = getattr(result, "url", "")
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)
                # `or <default>` also covers attributes that exist but are
                # None, which a getattr() default alone would let through.
                content.append({
                    "domain": self._extract_domain(url),
                    "title": getattr(result, "title", None) or "Untitled",
                    "url": url,
                    "highlights": getattr(result, "highlights", None) or [],
                    "summary": getattr(result, "summary", None) or "",
                    "text": getattr(result, "text", None) or "",
                    "published_date": getattr(result, "published_date", None),
                    "author": getattr(result, "author", None),
                })

            return {
                "topic": topic,
                "competitor_content": content,
                "total_results": len(content),
                # Sorted so the output is deterministic across runs.
                "domains_found": sorted(
                    {c["domain"] for c in content if c["domain"]}
                ),
            }

        except Exception as e:
            # Best-effort per topic: one failing search must not abort the
            # other topics gathered alongside it.
            logger.warning(f"Exa search failed for topic '{topic}': {e}")
            return {
                "topic": topic,
                "competitor_content": [],
                "total_results": 0,
                "domains_found": [],
                "error": str(e),
            }

    @staticmethod
    def _extract_domain(url: str) -> str:
        """Return the lower-cased network location (host[:port]) of `url`.

        Falls back to the lower-cased input when the URL cannot be parsed.
        """
        try:
            return urlparse(url).netloc.lower()
        except ValueError:
            # urlparse raises ValueError for malformed URLs (e.g. bad IPv6).
            return url.lower()