Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts

This commit is contained in:
ajaysi
2026-02-08 13:56:57 +05:30
parent 1db10ccd0f
commit e404a86502
333 changed files with 42223 additions and 10875 deletions

View File

@@ -0,0 +1,221 @@
import advertools as adv
import pandas as pd
import asyncio
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from loguru import logger
import json
import os
import tempfile
class AdvertoolsService:
    """
    Centralized service for leveraging the Advertools library for deep SEO intelligence.

    Provides functions for sitemap analysis, content auditing, and link extraction.
    Every public coroutine returns a dict with a boolean ``success`` key; failures
    are logged and reported through an ``error`` string instead of being raised.
    """

    def __init__(self):
        self.logger = logger.bind(service="AdvertoolsService")

    @staticmethod
    def _extract_pillar(url: Any) -> str:
        """Return the first two path segments of *url* ("home" for the root, "other" if unusable)."""
        # Imported locally: the module-level imports never brought urlparse into
        # scope, so the former inline helper raised NameError for every URL and
        # its bare ``except`` turned every pillar into "other".
        from urllib.parse import urlparse

        if not isinstance(url, str):
            return "other"
        try:
            parts = urlparse(url).path.strip('/').split('/')
        except ValueError:
            return "other"
        if not parts or not parts[0]:
            return "home"
        return "/".join(parts[:2])  # Capture top 2 segments

    async def _run_crawl(self, url_list: List[str], output_file: str, max_pages: int) -> None:
        """Run a non-following advertools crawl of *url_list* into *output_file* in the default executor."""
        # advertools crawl is blocking
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=url_list,
            output_file=output_file,
            follow_links=False,
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': max_pages,  # Guardrail: cap on crawled pages
                'DOWNLOAD_TIMEOUT': 30  # Guardrail: 30s timeout per page
            }
        ))

    def _remove_temp_file(self, temp_file: Optional[str]) -> None:
        """Best-effort removal of a crawl output file; a failure is only logged."""
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except OSError as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")

    async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
        """
        Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.

        Args:
            sitemap_url: URL handed to ``advertools.sitemap_to_df``.

        Returns:
            On success ``{"success": True, "metrics": {...}, "timestamp": ...}`` where
            ``metrics`` holds ``total_urls``, ``publishing_velocity`` (URLs whose
            ``lastmod`` falls in the last 30 days, divided by 4),
            ``stale_content_count`` / ``stale_content_percentage`` (``lastmod`` older
            than 180 days), ``top_pillars`` (the 15 most common two-segment path
            prefixes) and ``audit_sample_urls`` (up to 15 URLs, newest first when
            ``lastmod`` is usable). Otherwise ``{"success": False, "error": ...}``.
        """
        try:
            self.logger.info(f"Analyzing sitemap: {sitemap_url}")
            # advertools sitemap_to_df is blocking, run in executor
            loop = asyncio.get_running_loop()
            df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
            if df is None or df.empty:
                return {"success": False, "error": "Sitemap is empty or could not be parsed."}

            total_urls = len(df)

            # Convert lastmod to datetime; unparsable values become NaT.
            has_lastmod = False
            if 'lastmod' in df.columns:
                df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
                has_lastmod = not df['lastmod'].isna().all()

            if has_lastmod:
                now = datetime.now(df['lastmod'].dt.tz)
                recent_count = int((df['lastmod'] > now - timedelta(days=30)).sum())
                stale_count = int((df['lastmod'] < now - timedelta(days=180)).sum())
                publishing_velocity = recent_count / 4.0  # URLs per week
            else:
                publishing_velocity = 0
                stale_count = 0

            # Content pillars: most common top-level folder patterns (2 segments deep)
            df['pillar'] = df['loc'].apply(self._extract_pillar)
            pillars = df['pillar'].value_counts().head(15).to_dict()

            # Return a sample of URLs for auditing (top 15 most recent if available)
            if has_lastmod:
                audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
            else:
                audit_urls = df['loc'].head(15).tolist()

            return {
                "success": True,
                "metrics": {
                    "total_urls": total_urls,
                    "publishing_velocity": round(publishing_velocity, 2),
                    "stale_content_count": stale_count,
                    "stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
                    "top_pillars": pillars,
                    "audit_sample_urls": audit_urls
                },
                "timestamp": datetime.utcnow().isoformat()
            }
        except Exception as e:
            self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
            return {"success": False, "error": str(e)}

    async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Performs a shallow crawl and theme analysis using word frequency.

        Each call crawls into its own temporary ``.jsonl`` file, which is removed
        afterwards whether or not the audit succeeded.

        Args:
            url_list: Pages to crawl (links are not followed; at most 15 pages).

        Returns:
            On success ``{"success": True, "themes": [...], "page_count": int,
            "avg_word_count": float, "timestamp": ...}`` where ``themes`` are the first
            20 rows of ``advertools.word_frequency``. Otherwise
            ``{"success": False, "error": ...}``.
        """
        temp_file = None
        try:
            self.logger.info(f"Auditing content for {len(url_list)} URLs")
            # Create a unique temporary file
            with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
                temp_file = tf.name

            await self._run_crawl(url_list, temp_file, max_pages=15)
            if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
                return {"success": False, "error": "Crawl failed to generate output or output is empty."}
            crawl_df = pd.read_json(temp_file, lines=True)

            # Extract themes using word frequency
            text_columns = [col for col in ['body_text', 'h1', 'h2', 'title'] if col in crawl_df.columns]
            if not text_columns:
                return {"success": False, "error": "No text content found to analyze."}
            # astype(str) keeps join() from failing on non-string cells.
            all_text = " ".join(crawl_df[text_columns].fillna("").astype(str).values.flatten())
            if not all_text.strip():
                return {"success": False, "error": "Extracted text is empty."}

            # NOTE(review): the previous call passed ``rm_stopwords=True``; the
            # stop-word parameter of word_frequency is ``rm_words`` and it defaults
            # to the English stop-word list, so the default is relied on here.
            # Confirm against the installed advertools version.
            loop = asyncio.get_running_loop()
            word_freq = await loop.run_in_executor(None, lambda: adv.word_frequency([all_text]))
            top_themes = word_freq.head(20).to_dict(orient='records')

            # Additional metric: average body word count
            avg_word_count = 0
            if 'body_text' in crawl_df.columns:
                avg_word_count = crawl_df['body_text'].fillna("").astype(str).str.split().str.len().mean()

            return {
                "success": True,
                "themes": top_themes,
                "page_count": len(crawl_df),
                "avg_word_count": round(avg_word_count, 1),
                "timestamp": datetime.utcnow().isoformat()
            }
        except Exception as e:
            self.logger.error(f"Failed to audit content: {str(e)}")
            return {"success": False, "error": str(e)}
        finally:
            self._remove_temp_file(temp_file)

    async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Analyzes linking patterns and social media presence using unique temporary files.

        Args:
            url_list: Pages to crawl (links are not followed; at most 10 pages).

        Returns:
            On success ``{"success": True, "social_links": [...], "link_stats":
            {"total_links_found": int, "unique_domains": int}}`` (plus ``timestamp``
            when any link was found). Otherwise ``{"success": False, "error": ...}``.
        """
        temp_file = None
        try:
            self.logger.info(f"Extracting communication style for {len(url_list)} URLs")
            with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
                temp_file = tf.name

            await self._run_crawl(url_list, temp_file, max_pages=10)
            if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
                return {"success": False, "error": "Link extraction crawl failed."}
            crawl_df = pd.read_json(temp_file, lines=True)

            # Collect every link; a cell is either an "@@"-joined string or a list.
            all_links = []
            if 'links_url' in crawl_df.columns:
                for links in crawl_df['links_url'].dropna():
                    if isinstance(links, str):
                        all_links.extend(links.split("@@"))
                    elif isinstance(links, list):
                        all_links.extend(links)
            if not all_links:
                return {"success": True, "social_links": [], "link_stats": {"total_links_found": 0, "unique_domains": 0}}

            # Analyze links
            link_df = adv.url_to_df(all_links)
            social_domains = ['twitter.com', 'x.com', 'linkedin.com', 'facebook.com', 'instagram.com', 'youtube.com', 'github.com']
            social_links = []
            unique_domains = 0
            if not link_df.empty and 'netloc' in link_df.columns:
                # Compare on a normalized host so "www.linkedin.com" or
                # "WWW.YouTube.com" count as social links too.
                hosts = link_df['netloc'].fillna("").astype(str).str.lower().str.replace(r'^www\.', '', regex=True)
                social_links = link_df[hosts.isin(social_domains)]['url'].unique().tolist()
                unique_domains = link_df['netloc'].nunique()

            return {
                "success": True,
                "social_links": social_links,
                "link_stats": {
                    "total_links_found": len(all_links),
                    "unique_domains": unique_domains
                },
                "timestamp": datetime.utcnow().isoformat()
            }
        except Exception as e:
            self.logger.error(f"Failed to extract communication style: {str(e)}")
            return {"success": False, "error": str(e)}
        finally:
            self._remove_temp_file(temp_file)

View File

@@ -0,0 +1,94 @@
"""
Advertools Task Restoration Utility
Handles creation and restoration of Advertools intelligence tasks for users.
"""
from datetime import datetime, timedelta
from typing import Any
from loguru import logger
from sqlalchemy import func
from sqlalchemy.orm import Session
from models.onboarding import WebsiteAnalysis, OnboardingSession
from models.advertools_monitoring_models import AdvertoolsTask
from services.database import get_all_user_ids, get_session_for_user
async def restore_advertools_tasks(scheduler: Any) -> int:
    """
    Restore/create Advertools tasks for all users who have completed Step 2.

    For every user id from ``get_all_user_ids()`` that has an ``OnboardingSession``
    and a ``WebsiteAnalysis`` with a ``website_url``, a weekly ``content_audit`` task
    and a weekly ``site_health`` task are created unless a task with that payload
    type already exists. A failure for one user is logged and does not stop the
    remaining users.

    Args:
        scheduler: Not used by this function.

    Returns:
        Number of tasks created/restored (only tasks whose commit succeeded).
    """
    logger.info("Restoring Advertools intelligence tasks...")

    # (payload type, wording used in the log line, days until the first run)
    task_specs = (
        ("content_audit", "content audit", 1),  # Start tomorrow
        ("site_health", "site health", 2),      # Start in 2 days
    )

    total_created = 0
    for user_id in get_all_user_ids():
        try:
            db = get_session_for_user(user_id)
            if not db:
                continue
            try:
                # Check if user has completed Step 2 (has WebsiteAnalysis)
                session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
                if not session:
                    continue
                analysis = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
                if not analysis or not analysis.website_url:
                    continue

                created_for_user = 0
                for task_type, label, start_in_days in task_specs:
                    existing = db.query(AdvertoolsTask).filter(
                        AdvertoolsTask.user_id == user_id,
                        func.json_extract(AdvertoolsTask.payload, '$.type') == task_type
                    ).first()
                    if existing:
                        continue
                    db.add(AdvertoolsTask(
                        user_id=user_id,
                        website_url=analysis.website_url,
                        status='active',
                        next_execution=datetime.utcnow() + timedelta(days=start_in_days),
                        frequency_days=7,
                        payload={
                            "type": task_type,
                            "website_url": analysis.website_url
                        }
                    ))
                    created_for_user += 1
                    logger.info(f"Created weekly {label} task for user {user_id}")

                db.commit()
                # Count only after the commit succeeded so the return value
                # never includes tasks that were not persisted.
                total_created += created_for_user
            except Exception:
                # Discard the pending inserts before the session is closed.
                db.rollback()
                raise
            finally:
                db.close()
        except Exception as e:
            logger.error(f"Error restoring Advertools tasks for user {user_id}: {e}")
    return total_created

View File

@@ -12,8 +12,7 @@ from sqlalchemy.orm import Session
from loguru import logger
from utils.logger_utils import get_service_logger
from services.onboarding.data_service import OnboardingDataService
from services.calendar_generation_datasource_framework.data_processing.comprehensive_user_data import ComprehensiveUserDataProcessor
from api.content_planning.services.content_strategy.onboarding import OnboardingDataIntegrationService
logger = get_service_logger("competitive_analyzer")
@@ -23,8 +22,7 @@ class CompetitiveAnalyzer:
def __init__(self, db: Session):
"""Initialize the competitive analyzer."""
self.db = db
self.user_data_service = OnboardingDataService(db)
self.comprehensive_processor = ComprehensiveUserDataProcessor(db)
self.integration_service = OnboardingDataIntegrationService()
async def get_competitive_insights(self, user_id: str) -> Dict[str, Any]:
"""
@@ -37,8 +35,9 @@ class CompetitiveAnalyzer:
Dictionary containing competitive insights
"""
try:
# Get user's research preferences and competitor data
research_prefs = self.user_data_service.get_user_research_preferences(user_id)
# Get user's research preferences and competitor data via SSOT
onboarding_data = await self.integration_service.process_onboarding_data(user_id, self.db)
research_prefs = onboarding_data.get('research_preferences', {})
competitors = research_prefs.get('competitors', []) if research_prefs else []
if not competitors:
@@ -51,9 +50,8 @@ class CompetitiveAnalyzer:
"last_updated": datetime.now().isoformat()
}
# Get comprehensive user data including competitor analysis
comprehensive_data = self.comprehensive_processor.get_comprehensive_user_data(user_id)
competitor_analysis = comprehensive_data.get('competitor_analysis', {})
# Get competitor analysis directly from SSOT data
competitor_analysis = onboarding_data.get('competitor_analysis', {})
# Extract competitor keywords and content topics
competitor_keywords = self._extract_competitor_keywords(competitor_analysis, competitors)
@@ -300,6 +298,7 @@ class CompetitiveAnalyzer:
else:
keyword_map[keyword] = {
'keyword': kw['keyword'],
'competitor': kw['competitor'], # Primary competitor
'competitors': [kw['competitor']],
'source': kw['source'],
'volume_estimate': kw['volume_estimate'],

View File

@@ -9,6 +9,7 @@ OAuth connections from onboarding step 5.
from typing import Dict, Any, Optional, List
from datetime import datetime, timedelta
from sqlalchemy.orm import Session
from sqlalchemy import func
from loguru import logger
from utils.logger_utils import get_service_logger
@@ -16,9 +17,12 @@ from services.gsc_service import GSCService
from services.integrations.bing_oauth import BingOAuthService
from services.bing_analytics_storage_service import BingAnalyticsStorageService
from services.analytics_cache_service import AnalyticsCacheService
from services.onboarding.data_service import OnboardingDataService
from api.content_planning.services.content_strategy.onboarding.data_integration import OnboardingDataIntegrationService
from .analytics_aggregator import AnalyticsAggregator
from .competitive_analyzer import CompetitiveAnalyzer
from models.onboarding import SEOPageAudit, WebsiteAnalysis, OnboardingSession
from models.website_analysis_monitoring_models import OnboardingFullWebsiteAnalysisTask
from models.advertools_monitoring_models import AdvertoolsTask
logger = get_service_logger("seo_dashboard")
@@ -30,12 +34,19 @@ class SEODashboardService:
self.db = db
self.gsc_service = GSCService()
self.bing_oauth = BingOAuthService()
self.bing_storage = BingAnalyticsStorageService("sqlite:///alwrity.db")
# Bing storage is initialized per-user dynamically
self.analytics_cache = AnalyticsCacheService()
self.user_data_service = OnboardingDataService(db)
self.integration_service = OnboardingDataIntegrationService()
self.analytics_aggregator = AnalyticsAggregator()
self.competitive_analyzer = CompetitiveAnalyzer(db)
def _get_bing_storage(self, user_id: str) -> BingAnalyticsStorageService:
    """Return a Bing analytics storage service pointed at the SQLite database path of *user_id*."""
    # Function-scope import, as in the original implementation.
    from services.database import get_user_db_path

    return BingAnalyticsStorageService(f"sqlite:///{get_user_db_path(user_id)}")
async def get_platform_status(self, user_id: str) -> Dict[str, Any]:
"""Get connection status for GSC and Bing platforms."""
try:
@@ -81,8 +92,10 @@ class SEODashboardService:
try:
# Get user's website URL if not provided
if not site_url:
# Try to get from website analysis first
website_analysis = self.user_data_service.get_user_website_analysis(int(user_id))
# Use SSOT for onboarding data
onboarding_data = await self.integration_service.process_onboarding_data(user_id, self.db)
website_analysis = onboarding_data.get('website_analysis', {})
if website_analysis and website_analysis.get('website_url'):
site_url = website_analysis['website_url']
else:
@@ -115,6 +128,10 @@ class SEODashboardService:
# Generate AI insights
ai_insights = await self._generate_ai_insights(summary, timeseries, competitor_insights)
technical_seo_audit = self._get_technical_seo_audit_overview(user_id, site_url)
advertools_insights = self._get_advertools_insights(user_id, site_url)
return {
"website_url": site_url,
@@ -124,12 +141,71 @@ class SEODashboardService:
"competitor_insights": competitor_insights,
"health_score": health_score,
"ai_insights": ai_insights,
"technical_seo_audit": technical_seo_audit,
"advertools_insights": advertools_insights,
"last_updated": datetime.now().isoformat()
}
except Exception as e:
logger.error(f"Error getting dashboard overview for user {user_id}: {e}")
raise
def _get_technical_seo_audit_overview(self, user_id: str, site_url: str) -> Dict[str, Any]:
    """
    Summarise the stored page audits and the full-site analysis task of a user.

    Args:
        user_id: Compared as ``str(user_id)`` against both tables.
        site_url: Site prefix; a trailing "/" is dropped and the rest is used as
            a ``LIKE '<prefix>%'`` filter on ``website_url``.

    Returns:
        Dict with ``status`` ("ready" when at least one page audit exists,
        "scheduled" when none exists but the task is "active", else "pending"),
        ``task_status``, ``next_execution``, ``pages_audited``, ``avg_score``
        (rounded mean of the integer scores), ``fix_scheduled_pages`` and
        ``worst_pages`` (the 10 lowest-scoring audits). On any exception a dict
        with ``status`` "error", the message and zeroed counters is returned.
    """
    site_key = (site_url or "").rstrip("/")
    try:
        audit_query = self.db.query(SEOPageAudit).filter(SEOPageAudit.user_id == str(user_id))
        if site_key:
            audit_query = audit_query.filter(SEOPageAudit.website_url.like(f"{site_key}%"))
        # Rows without a score are ordered as if they scored 1000.
        lowest_first = func.coalesce(SEOPageAudit.overall_score, 1000).asc()
        audits = audit_query.order_by(lowest_first).all()

        int_scores = []
        fix_scheduled_pages = 0
        for audit in audits:
            if isinstance(audit.overall_score, int):
                int_scores.append(audit.overall_score)
            if audit.status == 'fix_scheduled':
                fix_scheduled_pages += 1
        pages_audited = len(audits)
        avg_score = round(sum(int_scores) / len(int_scores)) if int_scores else 0

        worst_pages = []
        for audit in audits[:10]:
            issues = audit.issues
            worst_pages.append({
                "page_url": audit.page_url,
                "overall_score": audit.overall_score,
                "status": audit.status,
                "issues_count": len(issues) if isinstance(issues, list) else 0
            })

        task = self.db.query(OnboardingFullWebsiteAnalysisTask).filter(
            OnboardingFullWebsiteAnalysisTask.user_id == str(user_id),
            OnboardingFullWebsiteAnalysisTask.website_url.like(f"{site_key}%")
        ).order_by(OnboardingFullWebsiteAnalysisTask.updated_at.desc()).first()

        task_status = None
        next_execution = None
        if task:
            task_status = task.status
            if task.next_execution:
                next_execution = task.next_execution.isoformat()

        if pages_audited > 0:
            overall_status = "ready"
        elif task_status == "active":
            overall_status = "scheduled"
        else:
            overall_status = "pending"

        return {
            "status": overall_status,
            "task_status": task_status,
            "next_execution": next_execution,
            "pages_audited": pages_audited,
            "avg_score": avg_score,
            "fix_scheduled_pages": fix_scheduled_pages,
            "worst_pages": worst_pages
        }
    except Exception as e:
        logger.warning(f"Failed to build technical SEO audit overview for user {user_id}: {e}")
        return {
            "status": "error",
            "error": str(e),
            "pages_audited": 0,
            "avg_score": 0,
            "fix_scheduled_pages": 0,
            "worst_pages": []
        }
async def get_gsc_data(self, user_id: str, site_url: Optional[str] = None) -> Dict[str, Any]:
"""Get GSC data for the specified site."""
@@ -181,13 +257,15 @@ class SEODashboardService:
# Get data from Bing storage service
if site_url:
bing_data = self.bing_storage.get_analytics_summary(user_id, site_url, days=30)
bing_storage = self._get_bing_storage(user_id)
bing_data = bing_storage.get_analytics_summary(user_id, site_url, days=30)
else:
# Get all sites for user
sites = self._get_bing_sites(user_id)
if sites:
logger.info(f"Using first Bing site for analysis: {sites[0]}")
bing_data = self.bing_storage.get_analytics_summary(user_id, sites[0], days=30)
bing_storage = self._get_bing_storage(user_id)
bing_data = bing_storage.get_analytics_summary(user_id, sites[0], days=30)
else:
logger.warning(f"No Bing sites found for user {user_id}")
return {"error": "No Bing sites found", "data": [], "status": "disconnected"}
@@ -249,6 +327,46 @@ class SEODashboardService:
"last_updated": datetime.now().isoformat()
}
def _get_advertools_insights(self, user_id: str, site_url: str) -> Dict[str, Any]:
    """
    Fetch Advertools-based insights from WebsiteAnalysis and AdvertoolsTasks.

    ``site_url`` is accepted but not used. Returns ``{}`` when the user has no
    onboarding session or when any exception occurs; otherwise a dict with
    ``augmented_themes``, ``last_audit``, ``site_health``, ``last_health_check``
    and ``tasks`` (status of the ``content_audit`` and ``site_health`` tasks,
    "pending" when no such task exists).
    """
    try:
        # 1. Augmented persona themes live on the user's WebsiteAnalysis row
        onboarding = self.db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
        if not onboarding:
            return {}
        analysis = self.db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == onboarding.id).first()

        # 2. Status of the Advertools tasks; a later row of the same type wins
        audit_status = "pending"
        health_status = "pending"
        for task in self.db.query(AdvertoolsTask).filter(AdvertoolsTask.user_id == user_id).all():
            kind = task.payload.get('type') if task.payload else None
            if kind == 'content_audit':
                audit_status = task.status
            elif kind == 'site_health':
                health_status = task.status

        if analysis:
            brand_analysis = analysis.brand_analysis or {}
            seo_audit = analysis.seo_audit or {}
        else:
            brand_analysis = {}
            seo_audit = {}

        return {
            "augmented_themes": brand_analysis.get('augmented_themes', []),
            "last_audit": brand_analysis.get('last_advertools_audit'),
            "site_health": seo_audit.get('site_health', {}),
            "last_health_check": seo_audit.get('last_advertools_health_check'),
            "tasks": {
                "content_audit": audit_status,
                "site_health": health_status
            }
        }
    except Exception as e:
        logger.warning(f"Failed to fetch Advertools insights for user {user_id}: {e}")
        return {}
def _get_gsc_sites(self, user_id: str) -> List[str]:
"""Get GSC sites for user."""
try:
@@ -394,4 +512,4 @@ class SEODashboardService:
except Exception as e:
logger.error(f"Error generating AI insights: {e}")
return []
return []

View File

@@ -0,0 +1,603 @@
from __future__ import annotations
import asyncio
import json
import re
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse
from services.component_logic.web_crawler_logic import WebCrawlerLogic
from services.llm_providers.main_text_generation import llm_text_gen
from services.ai_service_manager import AIServiceManager, AIServiceType
from services.seo_tools.sitemap_service import SitemapService
from services.seo.advertools_service import AdvertoolsService
from utils.logger_utils import get_service_logger
logger = get_service_logger("deep_competitor_analysis")
class DeepCompetitorAnalysisService:
def __init__(self):
    """Create the collaborators used by this service: a web crawler and an Advertools service."""
    # Used by _crawl_competitors (crawl_website).
    self.crawler = WebCrawlerLogic()
    # Used by generate_weekly_strategy_brief (analyze_sitemap / audit_content).
    self.advertools = AdvertoolsService()
async def run(
    self,
    *,
    user_id: str,
    website_analysis: Dict[str, Any],
    competitors: List[Dict[str, Any]],
    max_competitors: int = 25,
    crawl_concurrency: int = 4
) -> Dict[str, Any]:
    """
    Crawl the given competitors, analyze each one with AI and aggregate the results.

    Args:
        user_id: Forwarded to the per-competitor and aggregation AI calls.
        website_analysis: The user's own analysis; reduced via ``_build_baseline``.
        competitors: Raw competitor entries, normalized and capped at
            ``max_competitors``.
        max_competitors: Upper bound passed to ``_normalize_competitors``.
        crawl_concurrency: Passed to ``_crawl_competitors`` and echoed in the metadata.

    Returns:
        Dict with ``baseline``, ``competitors`` (one ``input`` / ``extraction`` /
        ``ai_analysis`` entry per crawled competitor), ``aggregation`` and ``metadata``.
    """
    baseline = self._build_baseline(website_analysis)
    targets = self._normalize_competitors(competitors, max_competitors=max_competitors)
    crawled = await self._crawl_competitors(targets, crawl_concurrency=crawl_concurrency)

    # AI analysis runs one competitor at a time, in crawl-result order.
    analyzed: List[Dict[str, Any]] = []
    for competitor_input, crawl_result in crawled:
        extraction = self._build_extraction_artifact(competitor_input, crawl_result)
        ai_analysis = await self._analyze_competitor_with_ai(
            user_id=user_id,
            baseline=baseline,
            competitor_input=competitor_input,
            extraction=extraction
        )
        entry = {
            "input": competitor_input,
            "extraction": extraction,
            "ai_analysis": ai_analysis
        }
        analyzed.append(entry)

    aggregation = await self._aggregate_with_ai(
        user_id=user_id,
        baseline=baseline,
        competitors=analyzed
    )

    metadata = {
        "generated_at": datetime.utcnow().isoformat(),
        "competitors_requested": len(targets),
        "competitors_analyzed": len(analyzed),
        "crawl_concurrency": crawl_concurrency
    }
    return {
        "baseline": baseline,
        "competitors": analyzed,
        "aggregation": aggregation,
        "metadata": metadata
    }
async def generate_weekly_strategy_brief(
    self,
    *,
    user_id: str,
    website_analysis: Dict[str, Any],
    competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Generates a weekly strategic intelligence brief by analyzing
    recent competitor changes and market shifts.

    Args:
        user_id: Forwarded to the sitemap analysis and to both AI calls.
        website_analysis: The user's own analysis; reduced via ``_build_baseline``.
        competitors: Raw competitor entries; at most 10 are kept after
            ``_normalize_competitors``.

    Returns:
        Dict with ``week_commencing``, ``generated_at``, ``metrics``
        (``market_velocity``, ``market_clusters``, ``aggressive_competitors``),
        ``insights`` (``the_big_move``, ``low_hanging_fruit``, ``threat_alerts``)
        and ``raw_data`` (``competitor_changes``).

    A competitor whose sitemap processing raises is logged and left out of the
    report. Exceptions raised by the two AI calls are not caught here.
    """

    def successful_data(result: Any) -> Dict[str, Any]:
        # Payload of an AI result, or {} when the call failed or returned an
        # unexpected shape (non-dict result, missing or None "data").
        if isinstance(result, dict) and result.get("success"):
            data = result.get("data")
            if isinstance(data, dict):
                return data
        return {}

    sitemap_service = SitemapService()
    ai_manager = AIServiceManager()

    # Stage 1: Data Collection (User + Competitors)
    baseline = self._build_baseline(website_analysis)
    normalized_competitors = self._normalize_competitors(competitors, max_competitors=10)

    # Fetch competitor sitemaps for recent changes
    competitor_changes = []
    now = datetime.utcnow()
    seven_days_ago = now - timedelta(days=7)
    ninety_days_ago = now - timedelta(days=90)

    for comp in normalized_competitors:
        try:
            # Stage 1: Advertools Deep Intelligence
            # Discover exact sitemap URL first (essential for Advertools)
            discovered_sitemap = await sitemap_service.discover_sitemap_url(comp['url'])
            effective_url = discovered_sitemap if discovered_sitemap else comp['url']
            adv_result = await self.advertools.analyze_sitemap(effective_url)

            # REUSE: Use existing SitemapService.analyze_sitemap for robust Stage 1 & 2
            analysis_result = await sitemap_service.analyze_sitemap(
                sitemap_url=effective_url,
                analyze_content_trends=True,
                analyze_publishing_patterns=True,
                include_ai_insights=False,
                user_id=user_id
            )
            if not analysis_result or not analysis_result.get('urls'):
                continue

            urls = analysis_result['urls']
            # ``or {}`` also covers keys that are present but None, which would
            # otherwise break the cluster aggregation in Stage 2.
            structure = analysis_result.get('structure_analysis') or {}
            # Enhancement 1: Keyword Clustering (NLP from URLs) - REUSE from SitemapService
            keyword_clusters = structure.get('keyword_clusters') or {}
            # Enhancement 2: Strategic Pillar Mapping - REUSE from SitemapService
            pillars = structure.get('strategic_pillars', {})
            # Enhancement 3: Advertools Site Hierarchy (from folders)
            adv_metrics = adv_result.get('metrics', {}) if adv_result.get('success') else {}
            site_hierarchy = adv_metrics.get('top_pillars', {})

            # Enhancement 4: Content Cadence Trend (Last 7 days vs 90 days)
            recent_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), seven_days_ago)]
            historical_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), ninety_days_ago)]
            recent_velocity = len(recent_urls) / 7
            historical_velocity = len(historical_urls) / 90
            # The 0.01 floor avoids a division by zero when nothing changed in 90 days.
            cadence_shift = ((recent_velocity - historical_velocity) / max(historical_velocity, 0.01)) * 100

            # Advertools Word Frequency (Audit top 5 recent URLs)
            top_themes = []
            if recent_urls:
                audit_urls = [u['loc'] for u in recent_urls[:5]]
                audit_result = await self.advertools.audit_content(audit_urls)
                if audit_result.get('success'):
                    top_themes = audit_result.get('themes', [])

            competitor_changes.append({
                "domain": comp['domain'],
                "name": comp['name'],
                "new_content_count": len(recent_urls),
                "recent_topics": [self._extract_topic_from_url(u['loc']) for u in recent_urls[:10]],
                "total_pages": len(urls),
                "keyword_clusters": keyword_clusters,
                "strategic_pillars": pillars,
                "site_hierarchy": site_hierarchy,
                "top_themes": top_themes,
                "cadence_shift_percent": round(cadence_shift, 1),
                "publishing_velocity": round(recent_velocity, 2),
                "stale_content_pct": adv_metrics.get('stale_content_percentage', 0)
            })
        except Exception as e:
            logger.warning(f"Failed to fetch sitemap for {comp['domain']}: {e}")

    # Stage 2: Differential Analysis (Non-AI Aggregation)
    if competitor_changes:
        avg_competitor_velocity = sum(c['publishing_velocity'] for c in competitor_changes) / len(competitor_changes)
    else:
        avg_competitor_velocity = 0
    market_clusters = self._aggregate_clusters([c['keyword_clusters'] for c in competitor_changes])

    # Stage 3: AI Strategic Intelligence
    # Extract rich user context from baseline
    brand_analysis = baseline.get("brand_analysis", {})
    seo_audit = baseline.get("seo_audit", {})
    user_niche = brand_analysis.get("industry") or "General Business"
    user_topics = brand_analysis.get("topics") or []
    keywords = seo_audit.get("keywords")
    if not user_topics and keywords:
        user_topics = keywords[:5]

    analysis_context = {
        "user_profile": {
            "website_url": baseline.get("website_url"),
            "industry": user_niche,
            "niche_description": brand_analysis.get("description") or brand_analysis.get("summary") or "",
            "core_topics": user_topics,
            "target_audience": baseline.get("target_audience") or {},
            "business_objectives": brand_analysis.get("objectives") or "Growth",
            "brand_voice": brand_analysis.get("voice") or "Professional",
            "augmented_themes": brand_analysis.get("augmented_themes", [])  # Added from Advertools
        },
        "market_intelligence": {
            "market_clusters": market_clusters,
            "competitors_analyzed_count": len(competitor_changes),
            "market_opportunities_detected": ["Content Velocity Gap", "Topic Authority Shift", "Stale Content Replacement"],
            "competitor_hierarchies": {c['name']: c['site_hierarchy'] for c in competitor_changes},
            "competitor_content_themes": {c['name']: c['top_themes'] for c in competitor_changes}
        },
        "competitive_landscape_detailed": competitor_changes,
    }

    # Call AI for strategic intelligence
    strategic_intelligence = await ai_manager.generate_strategic_intelligence(analysis_context, user_id=user_id)
    content_gaps = await ai_manager.generate_content_gap_analysis(analysis_context, user_id=user_id)

    # An empty or missing insights list yields {} / [] instead of raising
    # IndexError on the first-element lookup.
    strategic_insights = successful_data(strategic_intelligence).get("strategic_insights") or []
    recommendations = successful_data(content_gaps).get("content_recommendations") or []

    # Stage 4: Result Assembly
    report = {
        "week_commencing": seven_days_ago.date().isoformat(),
        "generated_at": datetime.utcnow().isoformat(),
        "metrics": {
            "market_velocity": round(avg_competitor_velocity, 2),
            "market_clusters": market_clusters[:5],
            "aggressive_competitors": [c['name'] for c in competitor_changes if c['cadence_shift_percent'] > 50]
        },
        "insights": {
            "the_big_move": strategic_insights[0] if strategic_insights else {},
            "low_hanging_fruit": recommendations,
            "threat_alerts": strategic_insights[1:]
        },
        "raw_data": {
            "competitor_changes": competitor_changes
        }
    }
    return report
def _is_newer_than(self, lastmod: Optional[str], threshold: datetime) -> bool:
if not lastmod:
return False
try:
# Handle various ISO formats
dt_str = lastmod.replace('Z', '+00:00')
return datetime.fromisoformat(dt_str).replace(tzinfo=None) > threshold
except:
return False
def _aggregate_clusters(self, clusters_list: List[Dict[str, int]]) -> List[str]:
"""Aggregate clusters across competitors to find market-wide themes."""
master: Dict[str, int] = {}
for cluster in clusters_list:
for k, v in cluster.items():
master[k] = master.get(k, 0) + 1 # Count competitor occurrences
return sorted(master, key=lambda x: master[x], reverse=True)[:10]
def _extract_topic_from_url(self, url: str) -> str:
"""Helper to get a readable topic from a URL slug."""
try:
path = urlparse(url).path
slug = path.strip('/').split('/')[-1]
return slug.replace('-', ' ').replace('_', ' ').capitalize()
except:
return "New Content"
def _build_baseline(self, website_analysis: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(website_analysis, dict):
website_analysis = {}
baseline = {
"website_url": website_analysis.get("website_url"),
"brand_analysis": website_analysis.get("brand_analysis") or {},
"content_strategy_insights": website_analysis.get("content_strategy_insights") or {},
"seo_audit": website_analysis.get("seo_audit") or {},
"style_guidelines": website_analysis.get("style_guidelines") or {},
"style_patterns": website_analysis.get("style_patterns") or {}
}
return baseline
def _normalize_competitors(self, competitors: List[Dict[str, Any]], *, max_competitors: int) -> List[Dict[str, Any]]:
if not isinstance(competitors, list):
return []
seen_domains = set()
normalized: List[Dict[str, Any]] = []
for comp in competitors:
if not isinstance(comp, dict):
continue
raw_url = comp.get("url") or comp.get("website_url") or comp.get("domain") or ""
url = self._normalize_url(raw_url)
if not url:
continue
domain = self._extract_domain(url)
if not domain or domain in seen_domains:
continue
seen_domains.add(domain)
normalized.append({
"url": url,
"domain": domain,
"name": comp.get("name") or comp.get("title") or domain,
"summary": comp.get("summary") or comp.get("description") or ""
})
if len(normalized) >= max_competitors:
break
return normalized
def _normalize_url(self, raw: str) -> Optional[str]:
if not raw or not isinstance(raw, str):
return None
raw = raw.strip()
if not raw:
return None
if not raw.startswith(("http://", "https://")):
raw = "https://" + raw
try:
parsed = urlparse(raw)
if not parsed.scheme or not parsed.netloc:
return None
return f"{parsed.scheme}://{parsed.netloc}"
except Exception:
return None
def _extract_domain(self, url: str) -> Optional[str]:
try:
parsed = urlparse(url)
domain = (parsed.netloc or "").lower()
if domain.startswith("www."):
domain = domain[4:]
return domain or None
except Exception:
return None
async def _crawl_competitors(
self,
competitors: List[Dict[str, Any]],
*,
crawl_concurrency: int
) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
semaphore = asyncio.Semaphore(max(1, int(crawl_concurrency)))
async def crawl_one(comp: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
async with semaphore:
url = comp.get("url")
if not url:
return comp, {"success": False, "error": "missing_url"}
try:
return comp, await self.crawler.crawl_website(url)
except Exception as e:
return comp, {"success": False, "error": str(e)}
tasks = [crawl_one(c) for c in competitors]
return await asyncio.gather(*tasks)
def _build_extraction_artifact(self, competitor_input: Dict[str, Any], crawl_result: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(crawl_result, dict) or not crawl_result.get("success"):
return {
"fetch_status": {
"status": "failed",
"error": crawl_result.get("error") if isinstance(crawl_result, dict) else "unknown_error"
}
}
content = crawl_result.get("content") if isinstance(crawl_result.get("content"), dict) else {}
title = content.get("title") or ""
description = content.get("description") or ""
headings = content.get("headings") if isinstance(content.get("headings"), list) else []
links = content.get("links") if isinstance(content.get("links"), list) else []
meta_tags = content.get("meta_tags") if isinstance(content.get("meta_tags"), dict) else {}
main_content = content.get("main_content") or ""
content_structure = content.get("content_structure") if isinstance(content.get("content_structure"), dict) else {}
nav_labels = self._extract_nav_labels(links)
h1_h2 = [h for h in headings if isinstance(h, str)][:25]
cta_signals = self._extract_cta_signals(main_content, links)
proof_signals = self._extract_proof_signals(main_content, links)
excerpt = main_content.strip()
if len(excerpt) > 2000:
excerpt = excerpt[:2000]
return {
"fetch_status": {
"status": "ok",
"fetched_url": crawl_result.get("url"),
"timestamp": crawl_result.get("timestamp")
},
"page_meta": {
"title": title,
"meta_description": description,
"og_title": meta_tags.get("og:title"),
"og_description": meta_tags.get("og:description")
},
"structure": {
"headings": h1_h2,
"nav_labels": nav_labels,
"content_structure": content_structure
},
"signals": {
"cta_signals": cta_signals,
"proof_signals": proof_signals
},
"content_excerpt": excerpt
}
def _extract_nav_labels(self, links: List[Dict[str, Any]]) -> List[str]:
labels: List[str] = []
for link in links[:200]:
if not isinstance(link, dict):
continue
text = (link.get("text") or "").strip()
if not text or len(text) > 50:
continue
labels.append(text)
deduped: List[str] = []
seen = set()
for label in labels:
key = label.lower()
if key in seen:
continue
seen.add(key)
deduped.append(label)
if len(deduped) >= 25:
break
return deduped
def _extract_cta_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
text = (main_content or "").lower()
keywords = ["get started", "start", "book", "demo", "trial", "pricing", "contact", "signup", "sign up", "subscribe"]
keyword_hits = [k for k in keywords if k in text]
link_texts = []
for link in links[:200]:
if isinstance(link, dict):
t = (link.get("text") or "").strip()
if t:
link_texts.append(t.lower())
cta_link_hits = [k for k in keywords if any(k in lt for lt in link_texts)]
return {
"keyword_hits": keyword_hits[:10],
"link_cta_hits": list(dict.fromkeys(cta_link_hits))[:10]
}
def _extract_proof_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
text = (main_content or "").lower()
proof_keywords = ["case study", "testimonials", "customers", "trusted by", "reviews", "awards", "partners"]
hits = [k for k in proof_keywords if k in text]
link_hits = []
for link in links[:200]:
if not isinstance(link, dict):
continue
href = (link.get("href") or "").lower()
if any(k.replace(" ", "") in href.replace("-", "").replace("_", "") for k in ["case study", "testimonials", "customers"]):
link_hits.append(href)
return {
"keyword_hits": hits[:10],
"supporting_links": link_hits[:10]
}
async def _analyze_competitor_with_ai(
self,
*,
user_id: str,
baseline: Dict[str, Any],
competitor_input: Dict[str, Any],
extraction: Dict[str, Any]
) -> Dict[str, Any]:
if not isinstance(extraction, dict) or extraction.get("fetch_status", {}).get("status") != "ok":
return {
"status": "skipped",
"reason": "crawl_failed"
}
json_struct = {
"positioning": {
"value_prop": "string",
"target_audience": "string",
"market_tier": "string",
"primary_offer": "string"
},
"content_strategy": {
"themes": ["string"],
"messaging_angles": ["string"],
"cta_patterns": ["string"],
"tone_markers": ["string"]
},
"competitive_advantages": ["string"],
"weaknesses_or_risks": ["string"],
"comparison_to_user_baseline": {
"overlaps": ["string"],
"deltas": ["string"],
"opportunities": ["string"]
},
"confidence": {
"overall": "number",
"notes": ["string"]
}
}
prompt = (
"You are a competitive intelligence analyst.\n"
"Analyze the competitor homepage extraction and compare it to the user's Step 2 baseline insights.\n"
"Return strictly the requested JSON.\n\n"
f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n"
f"Competitor input: {json.dumps(competitor_input, ensure_ascii=False)}\n\n"
f"Homepage extraction: {json.dumps(extraction, ensure_ascii=False)}\n"
)
try:
raw = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
parsed = self._safe_json_parse(raw)
if isinstance(parsed, dict):
return parsed
return {"status": "failed", "error": "invalid_ai_json"}
except Exception as e:
logger.warning(f"AI competitor analysis failed for {competitor_input.get('domain')}: {e}")
return {"status": "failed", "error": str(e)}
async def _aggregate_with_ai(
    self,
    *,
    user_id: str,
    baseline: Dict[str, Any],
    competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Ask the LLM for a market-level view across all analysed competitors.

    Each competitor item contributes its ``input.domain``, ``input.name`` and
    ``ai_analysis`` to the prompt; items lacking a dict ``input`` or a dict
    ``ai_analysis`` are left out. The response is requested in the shape of
    ``json_struct`` below. This method never raises for unusable input or an
    LLM failure; problems are reported through a ``warnings`` list.

    Args:
        user_id: Forwarded to ``llm_text_gen``.
        baseline: The user's own baseline insights, embedded in the prompt.
        competitors: Per-competitor result dicts (``None`` is treated as empty).

    Returns:
        The parsed LLM JSON object on success, otherwise
        ``{"warnings": [<reason>]}``.
    """
    json_struct = {
        "market_map": {
            "clusters": [
                {
                    "cluster_name": "string",
                    "description": "string",
                    "competitors": ["string"]
                }
            ]
        },
        "common_patterns": {
            "common_themes": ["string"],
            "common_ctas": ["string"],
            "common_proof_signals": ["string"]
        },
        "content_gaps_and_opportunities": [
            {
                "gap": "string",
                "why_it_matters": "string",
                "recommended_content_types": ["string"],
                "impact": "string",
                "effort": "string"
            }
        ],
        "strategic_recommendations": [
            {
                "action": "string",
                "expected_impact": "string",
                "effort": "string",
                "first_steps": ["string"]
            }
        ],
        "warnings": ["string"]
    }
    try:
        # Compaction and serialisation live inside the try so that a non-iterable
        # `competitors` or a value json.dumps cannot serialise is reported as a
        # warning instead of being raised to the caller.
        compact = []
        for item in competitors or []:
            if not isinstance(item, dict):
                continue
            comp = item.get("input")
            ai = item.get("ai_analysis")
            if isinstance(comp, dict) and isinstance(ai, dict):
                compact.append({
                    "domain": comp.get("domain"),
                    "name": comp.get("name"),
                    "ai_analysis": ai
                })
        prompt = (
            "You are a senior strategy consultant.\n"
            "Using the user's Step 2 baseline insights and per-competitor analyses, produce an aggregated market view.\n"
            "Return strictly the requested JSON.\n\n"
            f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n"
            f"Per-competitor analyses: {json.dumps(compact, ensure_ascii=False)}\n"
        )
        # NOTE(review): llm_text_gen is called synchronously inside a coroutine;
        # if it performs blocking network I/O it will stall the event loop for
        # the duration of the call. Confirm and consider offloading to a thread.
        raw = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
        parsed = self._safe_json_parse(raw)
        if isinstance(parsed, dict):
            return parsed
        return {"warnings": ["invalid_ai_json"]}
    except Exception as e:
        logger.warning(f"AI aggregation failed: {e}")
        return {"warnings": [str(e)]}
def _safe_json_parse(self, text: str) -> Any:
if not isinstance(text, str):
return None
cleaned = text.strip()
cleaned = re.sub(r"^```json\\s*", "", cleaned)
cleaned = re.sub(r"^```\\s*", "", cleaned)
cleaned = re.sub(r"```\\s*$", "", cleaned)
cleaned = cleaned.strip()
try:
return json.loads(cleaned)
except Exception:
match = re.search(r"\\{[\\s\\S]*\\}", cleaned)
if match:
try:
return json.loads(match.group(0))
except Exception:
return None
return None