Files
ALwrity/backend/services/research/deep_competitor_analysis.py

604 lines
25 KiB
Python

from __future__ import annotations
import asyncio
import json
import re
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse
from services.component_logic.web_crawler_logic import WebCrawlerLogic
from services.llm_providers.main_text_generation import llm_text_gen
from services.ai_service_manager import AIServiceManager, AIServiceType
from services.seo_tools.sitemap_service import SitemapService
from services.seo.advertools_service import AdvertoolsService
from utils.logger_utils import get_service_logger
logger = get_service_logger("deep_competitor_analysis")
class DeepCompetitorAnalysisService:
def __init__(self):
self.crawler = WebCrawlerLogic()
self.advertools = AdvertoolsService()
async def run(
self,
*,
user_id: str,
website_analysis: Dict[str, Any],
competitors: List[Dict[str, Any]],
max_competitors: int = 25,
crawl_concurrency: int = 4
) -> Dict[str, Any]:
baseline = self._build_baseline(website_analysis)
normalized_competitors = self._normalize_competitors(competitors, max_competitors=max_competitors)
crawl_results = await self._crawl_competitors(
normalized_competitors,
crawl_concurrency=crawl_concurrency
)
per_competitor_outputs: List[Dict[str, Any]] = []
for competitor_input, crawl_result in crawl_results:
extraction = self._build_extraction_artifact(competitor_input, crawl_result)
ai_analysis = await self._analyze_competitor_with_ai(
user_id=user_id,
baseline=baseline,
competitor_input=competitor_input,
extraction=extraction
)
per_competitor_outputs.append({
"input": competitor_input,
"extraction": extraction,
"ai_analysis": ai_analysis
})
aggregation = await self._aggregate_with_ai(
user_id=user_id,
baseline=baseline,
competitors=per_competitor_outputs
)
return {
"baseline": baseline,
"competitors": per_competitor_outputs,
"aggregation": aggregation,
"metadata": {
"generated_at": datetime.utcnow().isoformat(),
"competitors_requested": len(normalized_competitors),
"competitors_analyzed": len(per_competitor_outputs),
"crawl_concurrency": crawl_concurrency
}
}
async def generate_weekly_strategy_brief(
self,
*,
user_id: str,
website_analysis: Dict[str, Any],
competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
"""
Generates a weekly strategic intelligence brief by analyzing
recent competitor changes and market shifts.
"""
sitemap_service = SitemapService()
ai_manager = AIServiceManager()
# Stage 1: Data Collection (User + Competitors)
baseline = self._build_baseline(website_analysis)
normalized_competitors = self._normalize_competitors(competitors, max_competitors=10)
# Fetch competitor sitemaps for recent changes
competitor_changes = []
seven_days_ago = datetime.utcnow() - timedelta(days=7)
ninety_days_ago = datetime.utcnow() - timedelta(days=90)
for comp in normalized_competitors:
try:
# Stage 1: Advertools Deep Intelligence
# Discover exact sitemap URL first (essential for Advertools)
discovered_sitemap = await sitemap_service.discover_sitemap_url(comp['url'])
effective_url = discovered_sitemap if discovered_sitemap else comp['url']
adv_result = await self.advertools.analyze_sitemap(effective_url)
# REUSE: Use existing SitemapService.analyze_sitemap for robust Stage 1 & 2
analysis_result = await sitemap_service.analyze_sitemap(
sitemap_url=effective_url,
analyze_content_trends=True,
analyze_publishing_patterns=True,
include_ai_insights=False,
user_id=user_id
)
if analysis_result and analysis_result.get('urls'):
urls = analysis_result['urls']
structure = analysis_result.get('structure_analysis', {})
# Enhancement 1: Keyword Clustering (NLP from URLs) - REUSE from SitemapService
keyword_clusters = structure.get('keyword_clusters', {})
# Enhancement 2: Strategic Pillar Mapping - REUSE from SitemapService
pillars = structure.get('strategic_pillars', {})
# Enhancement 3: Advertools Site Hierarchy (from folders)
site_hierarchy = adv_result.get('metrics', {}).get('top_pillars', {}) if adv_result.get('success') else {}
# Enhancement 4: Content Cadence Trend (Last 7 days vs 90 days)
recent_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), seven_days_ago)]
historical_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), ninety_days_ago)]
recent_velocity = len(recent_urls) / 7
historical_velocity = len(historical_urls) / 90
cadence_shift = ((recent_velocity - historical_velocity) / max(historical_velocity, 0.01)) * 100
# Advertools Word Frequency (Audit top 5 recent URLs)
top_themes = []
if recent_urls:
audit_urls = [u['loc'] for u in recent_urls[:5]]
# Use thread-safe audit_content from AdvertoolsService
audit_result = await self.advertools.audit_content(audit_urls)
if audit_result.get('success'):
top_themes = audit_result.get('themes', [])
competitor_changes.append({
"domain": comp['domain'],
"name": comp['name'],
"new_content_count": len(recent_urls),
"recent_topics": [self._extract_topic_from_url(u['loc']) for u in recent_urls[:10]],
"total_pages": len(urls),
"keyword_clusters": keyword_clusters,
"strategic_pillars": pillars,
"site_hierarchy": site_hierarchy,
"top_themes": top_themes,
"cadence_shift_percent": round(cadence_shift, 1),
"publishing_velocity": round(recent_velocity, 2),
"stale_content_pct": adv_result.get('metrics', {}).get('stale_content_percentage', 0) if adv_result.get('success') else 0
})
except Exception as e:
logger.warning(f"Failed to fetch sitemap for {comp['domain']}: {e}")
# Stage 2: Differential Analysis (Non-AI Aggregation)
avg_competitor_velocity = sum(c['publishing_velocity'] for c in competitor_changes) / len(competitor_changes) if competitor_changes else 0
market_clusters = self._aggregate_clusters([c['keyword_clusters'] for c in competitor_changes])
# Stage 3: AI Strategic Intelligence
# Extract rich user context from baseline
brand_analysis = baseline.get("brand_analysis", {})
seo_audit = baseline.get("seo_audit", {})
user_niche = brand_analysis.get("industry") or "General Business"
user_topics = brand_analysis.get("topics") or []
if not user_topics and seo_audit.get("keywords"):
user_topics = seo_audit.get("keywords")[:5]
analysis_context = {
"user_profile": {
"website_url": baseline.get("website_url"),
"industry": user_niche,
"niche_description": brand_analysis.get("description") or brand_analysis.get("summary") or "",
"core_topics": user_topics,
"target_audience": baseline.get("target_audience") or {},
"business_objectives": brand_analysis.get("objectives") or "Growth",
"brand_voice": brand_analysis.get("voice") or "Professional",
"augmented_themes": brand_analysis.get("augmented_themes", []) # Added from Advertools
},
"market_intelligence": {
"market_clusters": market_clusters,
"competitors_analyzed_count": len(competitor_changes),
"market_opportunities_detected": ["Content Velocity Gap", "Topic Authority Shift", "Stale Content Replacement"],
"competitor_hierarchies": {c['name']: c['site_hierarchy'] for c in competitor_changes},
"competitor_content_themes": {c['name']: c['top_themes'] for c in competitor_changes}
},
"competitive_landscape_detailed": competitor_changes,
}
# Call AI for strategic intelligence
strategic_intelligence = await ai_manager.generate_strategic_intelligence(analysis_context, user_id=user_id)
content_gaps = await ai_manager.generate_content_gap_analysis(analysis_context, user_id=user_id)
# Stage 4: Result Assembly
report = {
"week_commencing": seven_days_ago.date().isoformat(),
"generated_at": datetime.utcnow().isoformat(),
"metrics": {
"market_velocity": round(avg_competitor_velocity, 2),
"market_clusters": market_clusters[:5],
"aggressive_competitors": [c['name'] for c in competitor_changes if c['cadence_shift_percent'] > 50]
},
"insights": {
"the_big_move": strategic_intelligence.get("data", {}).get("strategic_insights", [{}])[0] if strategic_intelligence.get("success") else {},
"low_hanging_fruit": content_gaps.get("data", {}).get("content_recommendations", []) if content_gaps.get("success") else [],
"threat_alerts": strategic_intelligence.get("data", {}).get("strategic_insights", [{}])[1:] if strategic_intelligence.get("success") else []
},
"raw_data": {
"competitor_changes": competitor_changes
}
}
return report
def _is_newer_than(self, lastmod: Optional[str], threshold: datetime) -> bool:
if not lastmod:
return False
try:
# Handle various ISO formats
dt_str = lastmod.replace('Z', '+00:00')
return datetime.fromisoformat(dt_str).replace(tzinfo=None) > threshold
except:
return False
def _aggregate_clusters(self, clusters_list: List[Dict[str, int]]) -> List[str]:
"""Aggregate clusters across competitors to find market-wide themes."""
master: Dict[str, int] = {}
for cluster in clusters_list:
for k, v in cluster.items():
master[k] = master.get(k, 0) + 1 # Count competitor occurrences
return sorted(master, key=lambda x: master[x], reverse=True)[:10]
def _extract_topic_from_url(self, url: str) -> str:
"""Helper to get a readable topic from a URL slug."""
try:
path = urlparse(url).path
slug = path.strip('/').split('/')[-1]
return slug.replace('-', ' ').replace('_', ' ').capitalize()
except:
return "New Content"
def _build_baseline(self, website_analysis: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(website_analysis, dict):
website_analysis = {}
baseline = {
"website_url": website_analysis.get("website_url"),
"brand_analysis": website_analysis.get("brand_analysis") or {},
"content_strategy_insights": website_analysis.get("content_strategy_insights") or {},
"seo_audit": website_analysis.get("seo_audit") or {},
"style_guidelines": website_analysis.get("style_guidelines") or {},
"style_patterns": website_analysis.get("style_patterns") or {}
}
return baseline
def _normalize_competitors(self, competitors: List[Dict[str, Any]], *, max_competitors: int) -> List[Dict[str, Any]]:
if not isinstance(competitors, list):
return []
seen_domains = set()
normalized: List[Dict[str, Any]] = []
for comp in competitors:
if not isinstance(comp, dict):
continue
raw_url = comp.get("url") or comp.get("website_url") or comp.get("domain") or ""
url = self._normalize_url(raw_url)
if not url:
continue
domain = self._extract_domain(url)
if not domain or domain in seen_domains:
continue
seen_domains.add(domain)
normalized.append({
"url": url,
"domain": domain,
"name": comp.get("name") or comp.get("title") or domain,
"summary": comp.get("summary") or comp.get("description") or ""
})
if len(normalized) >= max_competitors:
break
return normalized
def _normalize_url(self, raw: str) -> Optional[str]:
if not raw or not isinstance(raw, str):
return None
raw = raw.strip()
if not raw:
return None
if not raw.startswith(("http://", "https://")):
raw = "https://" + raw
try:
parsed = urlparse(raw)
if not parsed.scheme or not parsed.netloc:
return None
return f"{parsed.scheme}://{parsed.netloc}"
except Exception:
return None
def _extract_domain(self, url: str) -> Optional[str]:
try:
parsed = urlparse(url)
domain = (parsed.netloc or "").lower()
if domain.startswith("www."):
domain = domain[4:]
return domain or None
except Exception:
return None
async def _crawl_competitors(
self,
competitors: List[Dict[str, Any]],
*,
crawl_concurrency: int
) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
semaphore = asyncio.Semaphore(max(1, int(crawl_concurrency)))
async def crawl_one(comp: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
async with semaphore:
url = comp.get("url")
if not url:
return comp, {"success": False, "error": "missing_url"}
try:
return comp, await self.crawler.crawl_website(url)
except Exception as e:
return comp, {"success": False, "error": str(e)}
tasks = [crawl_one(c) for c in competitors]
return await asyncio.gather(*tasks)
def _build_extraction_artifact(self, competitor_input: Dict[str, Any], crawl_result: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(crawl_result, dict) or not crawl_result.get("success"):
return {
"fetch_status": {
"status": "failed",
"error": crawl_result.get("error") if isinstance(crawl_result, dict) else "unknown_error"
}
}
content = crawl_result.get("content") if isinstance(crawl_result.get("content"), dict) else {}
title = content.get("title") or ""
description = content.get("description") or ""
headings = content.get("headings") if isinstance(content.get("headings"), list) else []
links = content.get("links") if isinstance(content.get("links"), list) else []
meta_tags = content.get("meta_tags") if isinstance(content.get("meta_tags"), dict) else {}
main_content = content.get("main_content") or ""
content_structure = content.get("content_structure") if isinstance(content.get("content_structure"), dict) else {}
nav_labels = self._extract_nav_labels(links)
h1_h2 = [h for h in headings if isinstance(h, str)][:25]
cta_signals = self._extract_cta_signals(main_content, links)
proof_signals = self._extract_proof_signals(main_content, links)
excerpt = main_content.strip()
if len(excerpt) > 2000:
excerpt = excerpt[:2000]
return {
"fetch_status": {
"status": "ok",
"fetched_url": crawl_result.get("url"),
"timestamp": crawl_result.get("timestamp")
},
"page_meta": {
"title": title,
"meta_description": description,
"og_title": meta_tags.get("og:title"),
"og_description": meta_tags.get("og:description")
},
"structure": {
"headings": h1_h2,
"nav_labels": nav_labels,
"content_structure": content_structure
},
"signals": {
"cta_signals": cta_signals,
"proof_signals": proof_signals
},
"content_excerpt": excerpt
}
def _extract_nav_labels(self, links: List[Dict[str, Any]]) -> List[str]:
labels: List[str] = []
for link in links[:200]:
if not isinstance(link, dict):
continue
text = (link.get("text") or "").strip()
if not text or len(text) > 50:
continue
labels.append(text)
deduped: List[str] = []
seen = set()
for label in labels:
key = label.lower()
if key in seen:
continue
seen.add(key)
deduped.append(label)
if len(deduped) >= 25:
break
return deduped
def _extract_cta_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
text = (main_content or "").lower()
keywords = ["get started", "start", "book", "demo", "trial", "pricing", "contact", "signup", "sign up", "subscribe"]
keyword_hits = [k for k in keywords if k in text]
link_texts = []
for link in links[:200]:
if isinstance(link, dict):
t = (link.get("text") or "").strip()
if t:
link_texts.append(t.lower())
cta_link_hits = [k for k in keywords if any(k in lt for lt in link_texts)]
return {
"keyword_hits": keyword_hits[:10],
"link_cta_hits": list(dict.fromkeys(cta_link_hits))[:10]
}
def _extract_proof_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
text = (main_content or "").lower()
proof_keywords = ["case study", "testimonials", "customers", "trusted by", "reviews", "awards", "partners"]
hits = [k for k in proof_keywords if k in text]
link_hits = []
for link in links[:200]:
if not isinstance(link, dict):
continue
href = (link.get("href") or "").lower()
if any(k.replace(" ", "") in href.replace("-", "").replace("_", "") for k in ["case study", "testimonials", "customers"]):
link_hits.append(href)
return {
"keyword_hits": hits[:10],
"supporting_links": link_hits[:10]
}
async def _analyze_competitor_with_ai(
self,
*,
user_id: str,
baseline: Dict[str, Any],
competitor_input: Dict[str, Any],
extraction: Dict[str, Any]
) -> Dict[str, Any]:
if not isinstance(extraction, dict) or extraction.get("fetch_status", {}).get("status") != "ok":
return {
"status": "skipped",
"reason": "crawl_failed"
}
json_struct = {
"positioning": {
"value_prop": "string",
"target_audience": "string",
"market_tier": "string",
"primary_offer": "string"
},
"content_strategy": {
"themes": ["string"],
"messaging_angles": ["string"],
"cta_patterns": ["string"],
"tone_markers": ["string"]
},
"competitive_advantages": ["string"],
"weaknesses_or_risks": ["string"],
"comparison_to_user_baseline": {
"overlaps": ["string"],
"deltas": ["string"],
"opportunities": ["string"]
},
"confidence": {
"overall": "number",
"notes": ["string"]
}
}
prompt = (
"You are a competitive intelligence analyst.\n"
"Analyze the competitor homepage extraction and compare it to the user's Step 2 baseline insights.\n"
"Return strictly the requested JSON.\n\n"
f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n"
f"Competitor input: {json.dumps(competitor_input, ensure_ascii=False)}\n\n"
f"Homepage extraction: {json.dumps(extraction, ensure_ascii=False)}\n"
)
try:
raw = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
parsed = self._safe_json_parse(raw)
if isinstance(parsed, dict):
return parsed
return {"status": "failed", "error": "invalid_ai_json"}
except Exception as e:
logger.warning(f"AI competitor analysis failed for {competitor_input.get('domain')}: {e}")
return {"status": "failed", "error": str(e)}
async def _aggregate_with_ai(
self,
*,
user_id: str,
baseline: Dict[str, Any],
competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
json_struct = {
"market_map": {
"clusters": [
{
"cluster_name": "string",
"description": "string",
"competitors": ["string"]
}
]
},
"common_patterns": {
"common_themes": ["string"],
"common_ctas": ["string"],
"common_proof_signals": ["string"]
},
"content_gaps_and_opportunities": [
{
"gap": "string",
"why_it_matters": "string",
"recommended_content_types": ["string"],
"impact": "string",
"effort": "string"
}
],
"strategic_recommendations": [
{
"action": "string",
"expected_impact": "string",
"effort": "string",
"first_steps": ["string"]
}
],
"warnings": ["string"]
}
compact = []
for item in competitors:
comp = item.get("input") if isinstance(item, dict) else None
ai = item.get("ai_analysis") if isinstance(item, dict) else None
if isinstance(comp, dict) and isinstance(ai, dict):
compact.append({
"domain": comp.get("domain"),
"name": comp.get("name"),
"ai_analysis": ai
})
prompt = (
"You are a senior strategy consultant.\n"
"Using the user's Step 2 baseline insights and per-competitor analyses, produce an aggregated market view.\n"
"Return strictly the requested JSON.\n\n"
f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n"
f"Per-competitor analyses: {json.dumps(compact, ensure_ascii=False)}\n"
)
try:
raw = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
parsed = self._safe_json_parse(raw)
if isinstance(parsed, dict):
return parsed
return {"warnings": ["invalid_ai_json"]}
except Exception as e:
logger.warning(f"AI aggregation failed: {e}")
return {"warnings": [str(e)]}
def _safe_json_parse(self, text: str) -> Any:
if not isinstance(text, str):
return None
cleaned = text.strip()
cleaned = re.sub(r"^```json\\s*", "", cleaned)
cleaned = re.sub(r"^```\\s*", "", cleaned)
cleaned = re.sub(r"```\\s*$", "", cleaned)
cleaned = cleaned.strip()
try:
return json.loads(cleaned)
except Exception:
match = re.search(r"\\{[\\s\\S]*\\}", cleaned)
if match:
try:
return json.loads(match.group(0))
except Exception:
return None
return None