feat: ContentGuardianAgent, onboarding UX, Team Activity action wiring, docs, agent help modal

ContentGuardianAgent consolidation:
- Merge 3 duplicate classes into single source in specialized/content_guardian.py
- Watchdog audit_committee() with heuristic scoring, coverage gaps, overlaps, alerts
- Remove misleading rejection_rate() helper; use acceptance_rate directly
- Integrate audit + alerts + trend signals into today_workflow_service.py

Team Activity page:
- QualityAuditPanel: health ring, per-agent critiques, coverage gaps, overlaps
- TrendSignalsPanel: opportunity cards with urgency/impact/coverage bars
- AlertBanner: persistent dismiss via POST /alerts/{id}/mark-read
- AgentHelpModal: dialog showing all 8 agents with descriptions, tools, schedule
- QualityAuditPanel action buttons: Fill gap -> /content-planning, Resolve overlap, View CTA on alerts/issues
- TrendSignalsPanel action buttons: Create content from this trend -> /blog-writer with trend context state

Onboarding system:
- Step 4 validation: no auto-pass via basic_ready; requires persona data or explicit progression
- Step 5 validation: logs warning on auto-pass without integration data
- OnboardingCompletionService: single DB session, transactional task creation, upsert pattern
- Business-without-website: nullable website_url on SIFIndexingTask and MarketTrendsTask
- DeepCompetitorAnalysisExecutor: 5-min timeout, 10-competitor cap, asyncio.wait_for
- Persona generation: async with 30s timeout, falls back to scheduler
- OnboardingProgressService.reset_onboarding(): resets session + pauses all DB tasks
- OnboardingControlService.reset_onboarding(): also cancels APScheduler jobs
- FinalStep TaskSchedulingPanel: shows scheduled/failed tasks after completion, 8s auto-redirect
- onboarding_completed agent activity event logged to feed

Documentation:
- docs-site/features/onboarding/: overview, steps, scheduler-tasks, technical-reference (4 pages)
- docs-site/mkdocs.yml: added Onboarding System nav section
- docs-site/features/sif-agents/: overview, agent-directory, committee-system, content-guardian (4 pages)
- docs-site/features/team-activity/: overview, quality-audit, trend-signals, alert-system (4 pages)
- docs-site/features/todays-workflow/: updated overview, technical-architecture, workflow-guide, api-reference
2026-06-01 12:24:31 +05:30
parent 9b472f1c18
commit 923fa671fe
90 changed files with 8914 additions and 2731 deletions
--- a/backend/services/seo_tools/__init__.py
+++ b/backend/services/seo_tools/__init__.py
@@ -9,6 +9,8 @@ from .on_page_seo_service import OnPageSEOService
 from .technical_seo_service import TechnicalSEOService
 from .enterprise_seo_service import EnterpriseSEOService
 from .content_strategy_service import ContentStrategyService
+from .serp_gap_service import SerpGapService
+from .competitor_content_service import CompetitorContentService

 __all__ = [
    'MetaDescriptionService',
@@ -20,4 +22,6 @@ __all__ = [
    'TechnicalSEOService',
    'EnterpriseSEOService',
    'ContentStrategyService',
+    'SerpGapService',
+    'CompetitorContentService',
 ]
--- a/backend/services/seo_tools/competitor_content_service.py
+++ b/backend/services/seo_tools/competitor_content_service.py
@@ -0,0 +1,271 @@
+"""
+Competitor Content Service for ALwrity
+
+Fetches full competitor content for gap topics using Exa with include_domains.
+Phase 2 of the Content Gap Radar feature.
+
+Usage:
+    service = CompetitorContentService()
+    result = await service.deep_dive(
+        topics=["AI content strategy"],
+        competitor_domains=["example.com"]
+    )
+"""
+
+import os
+import asyncio
+import copy
+import hashlib
+import json
+import time
+from typing import Dict, List, Optional, Any
+from urllib.parse import urlparse
+from loguru import logger
+
+
+class CompetitorContentService:
+    """
+    Fetches competitor content for gap topics using Exa neural search.
+
+    Uses Exa's `include_domains` to scope searches to known competitor domains,
+    returning full text, highlights, and summaries for deeper competitive analysis.
+    Results are cached in memory, per instance, for CACHE_TTL seconds (24h by
+    default) to reduce API costs.
+    Designed to be consumed by the future ContentGapRadarAgent.
+    """
+
+    # Cache lifetime in seconds; override with COMPETITOR_CONTENT_CACHE_TTL.
+    CACHE_TTL = int(os.getenv("COMPETITOR_CONTENT_CACHE_TTL", "86400"))
+
+    def __init__(self):
+        """Read EXA_API_KEY from the environment and start with an empty cache."""
+        self.api_key = os.getenv("EXA_API_KEY")
+        if not self.api_key:
+            logger.warning(
+                "EXA_API_KEY not configured; CompetitorContentService disabled"
+            )
+        self._exa = None
+        # cache key -> {"data": result dict, "ts": time.time() when stored}
+        self._cache: Dict[str, Dict[str, Any]] = {}
+
+    @property
+    def exa(self):
+        """Exa client, created on first access; None when no API key is set."""
+        if self._exa is None and self.api_key:
+            # Imported lazily so the module loads even without exa_py installed.
+            from exa_py import Exa
+            self._exa = Exa(self.api_key)
+        return self._exa
+
+    def _cache_key(
+        self,
+        topics: List[str],
+        domains: List[str],
+        max_results: Optional[int] = None,
+    ) -> str:
+        """
+        Build a deterministic cache key for a request.
+
+        Topics and domains are sorted, so their order does not matter.
+        `max_results` is part of the key so that a request with a larger limit
+        is never answered from an entry fetched with a smaller one.
+        """
+        raw = json.dumps(
+            {"t": sorted(topics), "d": sorted(domains), "n": max_results},
+            sort_keys=True,
+        )
+        # SHA-256, not MD5: MD5 may be disabled on FIPS-restricted builds.
+        return hashlib.sha256(raw.encode()).hexdigest()
+
+    def _get_cached(self, key: str) -> Optional[Dict[str, Any]]:
+        """
+        Return a private copy of the cached result for `key`, or None.
+
+        An expired entry is evicted on access instead of being kept forever.
+        """
+        entry = self._cache.get(key)
+        if entry is None:
+            return None
+        if (time.time() - entry["ts"]) >= self.CACHE_TTL:
+            self._cache.pop(key, None)
+            return None
+        # Deep copy so callers cannot mutate the cached lists/dicts.
+        return copy.deepcopy(entry["data"])
+
+    def _set_cache(self, key: str, data: Dict[str, Any]):
+        """Store a private copy of `data` under `key` and prune expired entries."""
+        now = time.time()
+        expired = [
+            k for k, v in self._cache.items()
+            if (now - v["ts"]) >= self.CACHE_TTL
+        ]
+        for k in expired:
+            del self._cache[k]
+        self._cache[key] = {"data": copy.deepcopy(data), "ts": now}
+
+    async def deep_dive(
+        self,
+        topics: List[str],
+        competitor_domains: List[str],
+        max_total_results: int = 10,
+        concurrency: int = 3,
+        bypass_cache: bool = False,
+    ) -> Dict[str, Any]:
+        """
+        Fetch competitor content for a list of gap topics.
+
+        For each topic, searches Exa scoped to competitor domains and returns
+        full text, highlights, and publishing metadata.
+
+        Args:
+            topics: Topic phrases to research (e.g. from SERP gap analysis)
+            competitor_domains: Known competitor domains to scope search
+            max_total_results: Max results per topic total (Exa API limit varies)
+            concurrency: Max concurrent Exa API calls (values below 1 are
+                treated as 1)
+            bypass_cache: Force fresh API calls, ignoring cache
+
+        Returns:
+            Dict with keys:
+                results: List of per-topic competitor content results
+                total_topics_analyzed: int
+                topics_with_content: int
+                cached: bool
+                error: str, only present when EXA_API_KEY is not configured
+        """
+        if not topics or not competitor_domains:
+            return {
+                "results": [],
+                "total_topics_analyzed": 0,
+                "topics_with_content": 0,
+                "cached": False,
+            }
+
+        ck = self._cache_key(topics, competitor_domains, max_total_results)
+        if not bypass_cache:
+            cached = self._get_cached(ck)
+            if cached:
+                logger.info("Returning cached competitor content results")
+                return {**cached, "cached": True}
+
+        if not self.api_key or not self.exa:
+            return {
+                "results": [],
+                "total_topics_analyzed": len(topics),
+                "topics_with_content": 0,
+                "cached": False,
+                "error": "EXA_API_KEY not configured",
+            }
+
+        # Semaphore(0) would block every task forever and a negative value
+        # raises ValueError, so always allow at least one call in flight.
+        semaphore = asyncio.Semaphore(max(1, concurrency))
+        loop = asyncio.get_running_loop()
+
+        async def search_topic(topic: str) -> Dict[str, Any]:
+            async with semaphore:
+                return await self._search_single_topic(
+                    topic, competitor_domains, max_total_results, loop
+                )
+
+        results = await asyncio.gather(*(search_topic(t) for t in topics))
+
+        output = {
+            "results": results,
+            "total_topics_analyzed": len(topics),
+            "topics_with_content": sum(
+                1 for r in results if r.get("total_results", 0) > 0
+            ),
+            "cached": False,
+        }
+        # A run in which every topic failed (outage, bad key, quota) carries no
+        # data; caching it would hide competitor content for a full TTL.
+        if not all("error" in r for r in results):
+            self._set_cache(ck, output)
+        return output
+
+    async def _search_single_topic(
+        self,
+        topic: str,
+        competitor_domains: List[str],
+        max_results: int,
+        loop: asyncio.AbstractEventLoop,
+    ) -> Dict[str, Any]:
+        """
+        Search Exa for a single topic, scoped to competitor domains.
+
+        The blocking SDK call runs in the loop's default executor. Any
+        exception is logged and reported through an "error" key in the
+        returned dict rather than raised, so one failing topic does not
+        abort the whole batch.
+        """
+        search_kwargs = {
+            "type": "auto",
+            "num_results": max_results,
+            "include_domains": competitor_domains,
+            "text": {"max_characters": 2000},
+            "highlights": {"num_sentences": 3, "highlights_per_url": 3},
+            "summary": {"query": f"Key details about {topic}"},
+        }
+
+        try:
+            response = await loop.run_in_executor(
+                None,
+                lambda: self.exa.search_and_contents(topic, **search_kwargs),
+            )
+
+            content = []
+            seen_urls = set()
+            for result in getattr(response, "results", None) or []:
+                url = getattr(result, "url", "")
+                if not url or url in seen_urls:
+                    continue
+                seen_urls.add(url)
+                # The attributes may exist but be None, so apply the
+                # fallbacks with `or` instead of relying on getattr defaults.
+                content.append({
+                    "domain": self._extract_domain(url),
+                    "title": getattr(result, "title", None) or "Untitled",
+                    "url": url,
+                    "highlights": getattr(result, "highlights", None) or [],
+                    "summary": getattr(result, "summary", None) or "",
+                    "text": getattr(result, "text", None) or "",
+                    "published_date": getattr(result, "published_date", None),
+                    "author": getattr(result, "author", None),
+                })
+
+            return {
+                "topic": topic,
+                "competitor_content": content,
+                "total_results": len(content),
+                "domains_found": sorted(
+                    {c["domain"] for c in content if c["domain"]}
+                ),
+            }
+
+        except Exception as e:
+            logger.warning(f"Exa search failed for topic '{topic}': {e}")
+            return {
+                "topic": topic,
+                "competitor_content": [],
+                "total_results": 0,
+                "domains_found": [],
+                "error": str(e),
+            }
+
+    @staticmethod
+    def _extract_domain(url: str) -> str:
+        """
+        Extract the lower-cased host (netloc) from a URL.
+
+        Scheme-less input such as "example.com/page" is re-parsed as a
+        network-path reference so the host is still found. If the URL cannot
+        be parsed, the lower-cased input is returned.
+        """
+        try:
+            parsed = urlparse(url)
+            if not parsed.netloc and not parsed.scheme:
+                parsed = urlparse(f"//{url}")
+            return parsed.netloc.lower()
+        except ValueError:
+            return url.lower()
--- a/backend/services/seo_tools/serp_gap_service.py
+++ b/backend/services/seo_tools/serp_gap_service.py
@@ -0,0 +1,219 @@
+"""
+SERP Gap Service for ALwrity
+
+Detects which competitors rank for target topics using Google Custom Search.
+Phase 1 of the Content Gap Radar feature.
+
+Usage:
+    service = SerpGapService()
+    result = await service.analyze_topic_gaps(
+        topics=["AI content strategy", "topic clustering"],
+        competitor_domains=["example.com", "competitor.org"]
+    )
+"""
+
+import asyncio
+import copy
+import hashlib
+import json
+import os
+import time
+from typing import Dict, List, Optional, Any
+from loguru import logger
+from services.research.google_search_service import GoogleSearchService
+
+
+class SerpGapService:
+    """
+    SERP Gap Analysis Service.
+
+    Uses Google Custom Search `site:` queries to detect competitor ranking presence
+    for specific topics. Results are cached for 24h to stay within free-tier quotas
+    (100 queries/day). Designed to be consumed by a future ContentGapRadarAgent
+    that scores and prioritizes gaps.
+    """
+
+    CACHE_TTL = int(os.getenv("SERP_GAP_CACHE_TTL", "86400"))  # 24 hours default
+
+    def __init__(self, google_search_service: Optional[GoogleSearchService] = None):
+        """
+        Args:
+            google_search_service: Search client to use; a new
+                GoogleSearchService is created when omitted.
+        """
+        self.gcs = google_search_service or GoogleSearchService()
+        # cache key -> {"data": result dict, "ts": time.time() when stored}
+        self._cache: Dict[str, Dict[str, Any]] = {}
+        logger.info("SerpGapService initialized")
+
+    def _cache_key(
+        self,
+        topics: List[str],
+        domains: List[str],
+        max_results: Optional[int] = None,
+    ) -> str:
+        """
+        Deterministic cache key from sorted topics + domains and the result limit.
+
+        `max_results` is part of the key so that a request with a larger limit
+        is never answered from an entry fetched with a smaller one.
+        """
+        raw = json.dumps(
+            {"t": sorted(topics), "d": sorted(domains), "n": max_results},
+            sort_keys=True,
+        )
+        # SHA-256, not MD5: MD5 may be disabled on FIPS-restricted builds.
+        return hashlib.sha256(raw.encode()).hexdigest()
+
+    def _get_cached(self, key: str) -> Optional[Dict[str, Any]]:
+        """
+        Return a private copy of the cached result for `key`, or None.
+
+        An expired entry is evicted on access instead of being kept forever.
+        """
+        entry = self._cache.get(key)
+        if entry is None:
+            return None
+        if (time.time() - entry["ts"]) >= self.CACHE_TTL:
+            self._cache.pop(key, None)
+            return None
+        # Deep copy so callers cannot mutate the cached lists/dicts.
+        return copy.deepcopy(entry["data"])
+
+    def _set_cache(self, key: str, data: Dict[str, Any]):
+        """Store a private copy of `data` under `key` and prune expired entries."""
+        now = time.time()
+        expired = [
+            k for k, v in self._cache.items()
+            if (now - v["ts"]) >= self.CACHE_TTL
+        ]
+        for k in expired:
+            del self._cache[k]
+        self._cache[key] = {"data": copy.deepcopy(data), "ts": now}
+
+    async def analyze_topic_gaps(
+        self,
+        topics: List[str],
+        competitor_domains: List[str],
+        max_results_per_site: int = 5,
+        concurrency: int = 3,
+        bypass_cache: bool = False,
+    ) -> Dict[str, Any]:
+        """
+        Analyze SERP gaps for a list of topics across known competitors.
+
+        For each topic, queries Google with `site:competitor_domain topic` for
+        each known competitor to detect ranking presence.
+
+        Args:
+            topics: Topic phrases to check (e.g. from find_semantic_gaps())
+            competitor_domains: Known competitor domains (e.g. ["example.com"])
+            max_results_per_site: Max Google CSE results per site: query (max 10)
+            concurrency: Max topics analyzed concurrently; each topic issues
+                its site: queries one at a time (values below 1 are treated
+                as 1)
+            bypass_cache: Force fresh API calls, ignoring cache
+
+        Returns:
+            Dict with keys:
+                gaps: List of per-topic SERP gap results
+                total_topics_analyzed: int
+                total_competitors: int
+                cached: bool
+        """
+        if not topics or not competitor_domains:
+            return {
+                "gaps": [],
+                "total_topics_analyzed": 0,
+                "total_competitors": 0,
+                "cached": False,
+            }
+
+        ck = self._cache_key(topics, competitor_domains, max_results_per_site)
+        if not bypass_cache:
+            cached = self._get_cached(ck)
+            if cached:
+                logger.info("Returning cached SERP gap results")
+                return {**cached, "cached": True}
+
+        # Semaphore(0) would block every task forever and a negative value
+        # raises ValueError, so always allow at least one task in flight.
+        semaphore = asyncio.Semaphore(max(1, concurrency))
+
+        async def analyze_topic(topic: str) -> Dict[str, Any]:
+            async with semaphore:
+                return await self._analyze_single_topic(
+                    topic, competitor_domains, max_results_per_site
+                )
+
+        results = await asyncio.gather(*(analyze_topic(t) for t in topics))
+
+        output = {
+            "gaps": results,
+            "total_topics_analyzed": len(topics),
+            "total_competitors": len(competitor_domains),
+            "cached": False,
+        }
+        # A run in which every query failed (outage, exhausted quota) carries no
+        # data; caching it would report "no competitors rank" for a full TTL.
+        total_queries = len(topics) * len(competitor_domains)
+        if sum(r.get("failed_queries", 0) for r in results) < total_queries:
+            self._set_cache(ck, output)
+        return output
+
+    async def _analyze_single_topic(
+        self,
+        topic: str,
+        competitor_domains: List[str],
+        max_results: int,
+    ) -> Dict[str, Any]:
+        """
+        Check SERP presence for a single topic across all competitor domains.
+
+        Removes the dateRestrict and sort=date defaults from Google CSE so we
+        see all-time competitor content (not just last month).
+
+        A failing query is logged and counted in `failed_queries`; the
+        remaining domains are still checked. Results are de-duplicated by URL,
+        keeping the first occurrence.
+        """
+        unique_competitors = []
+        seen_urls = set()
+        failed_queries = 0
+
+        for domain in competitor_domains:
+            query = f"site:{domain} {topic}"
+            try:
+                raw_results = await self.gcs.perform_search(
+                    query,
+                    max_results,
+                    dateRestrict=None,  # Don't limit to last month
+                    sort=None,  # Use relevance sorting, not date
+                )
+                for result in raw_results:
+                    url = result.get("link", "")
+                    if url in seen_urls:
+                        continue
+                    seen_urls.add(url)
+                    unique_competitors.append({
+                        "domain": domain,
+                        "title": result.get("title", ""),
+                        "url": url,
+                        "snippet": result.get("snippet", ""),
+                    })
+            except Exception as e:
+                logger.warning(
+                    f"GCS query failed for site:{domain} topic='{topic}': {e}"
+                )
+                failed_queries += 1
+
+        return {
+            "topic": topic,
+            "competitors_found": unique_competitors,
+            "competitor_count": len(unique_competitors),
+            "domains_with_content": sorted(
+                {e["domain"] for e in unique_competitors}
+            ),
+            "failed_queries": failed_queries,
+            "total_domains_checked": len(competitor_domains),
+        }