Files
ALwrity/backend/services/seo_tools/competitor_content_service.py
ajaysi 923fa671fe feat: ContentGuardianAgent, onboarding UX, Team Activity action wiring, docs, agent help modal
ContentGuardianAgent consolidation:
- Merge 3 duplicate classes into single source in specialized/content_guardian.py
- Watchdog audit_committee() with heuristic scoring, coverage gaps, overlaps, alerts
- Remove misleading rejection_rate() helper; use acceptance_rate directly
- Integrate audit + alerts + trend signals into today_workflow_service.py

Team Activity page:
- QualityAuditPanel: health ring, per-agent critiques, coverage gaps, overlaps
- TrendSignalsPanel: opportunity cards with urgency/impact/coverage bars
- AlertBanner: persistent dismiss via POST /alerts/{id}/mark-read
- AgentHelpModal: dialog showing all 8 agents with descriptions, tools, schedule
- QualityAuditPanel action buttons: Fill gap -> /content-planning, Resolve overlap, View CTA on alerts/issues
- TrendSignalsPanel action buttons: Create content from this trend -> /blog-writer with trend context state

Onboarding system:
- Step 4 validation: no auto-pass via basic_ready; requires persona data or explicit progression
- Step 5 validation: logs warning on auto-pass without integration data
- OnboardingCompletionService: single DB session, transactional task creation, upsert pattern
- Business-without-website: nullable website_url on SIFIndexingTask and MarketTrendsTask
- DeepCompetitorAnalysisExecutor: 5-min timeout, 10-competitor cap, asyncio.wait_for
- Persona generation: async with 30s timeout, falls back to scheduler
- OnboardingProgressService.reset_onboarding(): resets session + pauses all DB tasks
- OnboardingControlService.reset_onboarding(): also cancels APScheduler jobs
- FinalStep TaskSchedulingPanel: shows scheduled/failed tasks after completion, 8s auto-redirect
- onboarding_completed agent activity event logged to feed

Documentation:
- docs-site/features/onboarding/: overview, steps, scheduler-tasks, technical-reference (4 pages)
- docs-site/mkdocs.yml: added Onboarding System nav section
- docs-site/features/sif-agents/: overview, agent-directory, committee-system, content-guardian (4 pages)
- docs-site/features/team-activity/: overview, quality-audit, trend-signals, alert-system (4 pages)
- docs-site/features/todays-workflow/: updated overview, technical-architecture, workflow-guide, api-reference
2026-06-01 12:24:31 +05:30

215 lines
7.1 KiB
Python

"""
Competitor Content Service for ALwrity

Fetches full competitor content for gap topics using Exa with include_domains.
Phase 2 of the Content Gap Radar feature.

Usage:
    service = CompetitorContentService()
    result = await service.deep_dive(
        topics=["AI content strategy"],
        competitor_domains=["example.com"]
    )
"""
import os
import asyncio
import hashlib
import json
import time
from typing import Dict, List, Optional, Any
from loguru import logger
class CompetitorContentService:
    """
    Fetches competitor content for gap topics using Exa neural search.

    Uses Exa's `include_domains` to scope searches to known competitor domains,
    returning full text, highlights, and summaries for deeper competitive analysis.

    Successful results are cached in memory (per instance) for ``CACHE_TTL``
    seconds (24h by default, overridable through the
    ``COMPETITOR_CONTENT_CACHE_TTL`` environment variable) to reduce API costs.

    Designed to be consumed by the future ContentGapRadarAgent.
    """

    # Cache lifetime in seconds; read once from the environment at import time.
    CACHE_TTL = int(os.getenv("COMPETITOR_CONTENT_CACHE_TTL", "86400"))

    def __init__(self):
        """Read the Exa API key from the environment and set up an empty cache."""
        self.api_key = os.getenv("EXA_API_KEY")
        if not self.api_key:
            logger.warning(
                "EXA_API_KEY not configured; CompetitorContentService disabled"
            )
        # The SDK client is created on first access of the ``exa`` property.
        self._exa = None
        # cache key -> {"data": <deep_dive output>, "ts": <time.time() at store>}
        self._cache: Dict[str, Dict[str, Any]] = {}

    @property
    def exa(self):
        """Lazy-init Exa SDK to allow env injection after import.

        Returns:
            The Exa client, or ``None`` when no API key is configured.
        """
        if self._exa is None and self.api_key:
            from exa_py import Exa

            self._exa = Exa(self.api_key)
        return self._exa

    def _cache_key(
        self,
        topics: List[str],
        domains: List[str],
        max_results: Optional[int] = None,
    ) -> str:
        """Build an order-insensitive cache key for a request.

        Args:
            topics: Topic phrases of the request.
            domains: Competitor domains of the request.
            max_results: Per-topic result limit. When given it becomes part of
                the key so requests with different limits do not share an
                entry; when omitted the key matches the historical
                two-argument form.

        Returns:
            Hex MD5 digest of the canonical JSON form of the request.
        """
        payload: Dict[str, Any] = {"t": sorted(topics), "d": sorted(domains)}
        if max_results is not None:
            payload["n"] = max_results
        raw = json.dumps(payload, sort_keys=True)
        # MD5 is used only as a compact fingerprint, not for security.
        return hashlib.md5(raw.encode()).hexdigest()

    def _get_cached(self, key: str) -> Optional[Dict[str, Any]]:
        """Return the cached payload for ``key`` if present and still fresh.

        Expired entries are removed so the cache does not grow without bound
        with stale data.
        """
        entry = self._cache.get(key)
        if not entry:
            return None
        if (time.time() - entry["ts"]) < self.CACHE_TTL:
            return entry["data"]
        self._cache.pop(key, None)
        return None

    def _set_cache(self, key: str, data: Dict[str, Any]):
        """Store ``data`` under ``key`` together with the current timestamp."""
        self._cache[key] = {"data": data, "ts": time.time()}

    async def deep_dive(
        self,
        topics: List[str],
        competitor_domains: List[str],
        max_total_results: int = 10,
        concurrency: int = 3,
        bypass_cache: bool = False,
    ) -> Dict[str, Any]:
        """
        Fetch competitor content for a list of gap topics.

        For each topic, searches Exa scoped to competitor domains and returns
        full text, highlights, and publishing metadata.

        Args:
            topics: Topic phrases to research (e.g. from SERP gap analysis)
            competitor_domains: Known competitor domains to scope search
            max_total_results: Max results per topic total (Exa API limit varies)
            concurrency: Max concurrent Exa API calls; values below 1 are
                treated as 1
            bypass_cache: Force fresh API calls, ignoring cache

        Returns:
            Dict with keys:
                results: List of per-topic competitor content results
                total_topics_analyzed: int
                topics_with_content: int
                cached: bool
                error: str, present only when the service is not configured
        """
        if not topics or not competitor_domains:
            return {
                "results": [],
                "total_topics_analyzed": 0,
                "topics_with_content": 0,
                "cached": False,
            }

        # The result limit is part of the key: a request for more results per
        # topic must not be answered from a smaller cached payload.
        ck = self._cache_key(topics, competitor_domains, max_total_results)
        if not bypass_cache:
            cached = self._get_cached(ck)
            if cached is not None:
                logger.info("Returning cached competitor content results")
                return {**cached, "cached": True}

        if not self.api_key or not self.exa:
            return {
                "results": [],
                "total_topics_analyzed": len(topics),
                "topics_with_content": 0,
                "cached": False,
                "error": "EXA_API_KEY not configured",
            }

        # Semaphore(0) would block every search forever and a negative value
        # raises ValueError, so clamp to at least one concurrent call.
        semaphore = asyncio.Semaphore(max(1, concurrency))
        loop = asyncio.get_running_loop()

        async def search_topic(topic: str) -> Dict[str, Any]:
            async with semaphore:
                return await self._search_single_topic(
                    topic, competitor_domains, max_total_results, loop
                )

        # gather() preserves input order, so results line up with ``topics``.
        results = await asyncio.gather(*(search_topic(topic) for topic in topics))

        output = {
            "results": results,
            "total_topics_analyzed": len(topics),
            "topics_with_content": sum(
                1 for r in results if r.get("total_results", 0) > 0
            ),
            "cached": False,
        }

        # Do not pin failed lookups for the whole TTL: only cache runs in which
        # every topic search completed without an error.
        if not any("error" in r for r in results):
            self._set_cache(ck, output)
        return output

    async def _search_single_topic(
        self,
        topic: str,
        competitor_domains: List[str],
        max_results: int,
        loop: asyncio.AbstractEventLoop,
    ) -> Dict[str, Any]:
        """
        Search Exa for a single topic, scoped to competitor domains.

        Args:
            topic: Topic phrase used as the search query.
            competitor_domains: Domains passed to Exa as ``include_domains``.
            max_results: Value passed to Exa as ``num_results``.
            loop: Event loop whose default executor runs the SDK call.

        Returns:
            Dict with ``topic``, ``competitor_content`` (one entry per unique
            URL), ``total_results`` and ``domains_found`` (sorted). On failure
            the lists are empty and an ``error`` message is included; the
            exception is logged, not raised.
        """
        query = topic
        search_kwargs = {
            "type": "auto",
            "num_results": max_results,
            "include_domains": competitor_domains,
            "text": {"max_characters": 2000},
            "highlights": {"num_sentences": 3, "highlights_per_url": 3},
            "summary": {"query": f"Key details about {topic}"},
        }
        try:
            # The SDK call is run in the default executor so it does not block
            # the event loop while waiting on the API.
            response = await loop.run_in_executor(
                None,
                lambda: self.exa.search_and_contents(query, **search_kwargs),
            )

            content = []
            seen_urls = set()
            for result in getattr(response, "results", []) or []:
                url = getattr(result, "url", "")
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)
                # ``or`` fallbacks: an attribute may exist but be None, in
                # which case getattr's default alone would not apply.
                content.append({
                    "domain": self._extract_domain(url),
                    "title": getattr(result, "title", None) or "Untitled",
                    "url": url,
                    "highlights": getattr(result, "highlights", None) or [],
                    "summary": getattr(result, "summary", None) or "",
                    "text": getattr(result, "text", None) or "",
                    "published_date": getattr(result, "published_date", None),
                    "author": getattr(result, "author", None),
                })

            return {
                "topic": topic,
                "competitor_content": content,
                "total_results": len(content),
                # Sorted so the output is deterministic across runs.
                "domains_found": sorted(
                    {c["domain"] for c in content if c["domain"]}
                ),
            }
        except Exception as e:
            # Best effort per topic: one failing search must not abort the
            # whole batch, so report the error in the result instead.
            logger.warning(f"Exa search failed for topic '{topic}': {e}")
            return {
                "topic": topic,
                "competitor_content": [],
                "total_results": 0,
                "domains_found": [],
                "error": str(e),
            }

    @staticmethod
    def _extract_domain(url: str) -> str:
        """Extract domain from URL.

        Returns the lower-cased network location of ``url``; falls back to the
        lower-cased input if parsing raises.
        """
        try:
            from urllib.parse import urlparse

            return urlparse(url).netloc.lower()
        except Exception:
            return url.lower()