Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts

2026-02-08 13:56:57 +05:30
parent 1db10ccd0f
commit e404a86502
333 changed files with 42223 additions and 10875 deletions
--- a/backend/services/seo/advertools_service.py
+++ b/backend/services/seo/advertools_service.py
@@ -0,0 +1,221 @@
+import advertools as adv
+import pandas as pd
+import asyncio
+from typing import Dict, Any, List, Optional
+from datetime import datetime, timedelta
+from loguru import logger
+import json
+import os
+import tempfile
+
+class AdvertoolsService:
+    """
+    Centralized service for leveraging the Advertools library for deep SEO intelligence.
+    Provides functions for sitemap analysis, content auditing, and link extraction.
+
+    Every public coroutine returns a dict with a boolean ``success`` key. Failures are
+    logged and reported as ``{"success": False, "error": <message>}`` rather than raised.
+    """
+
+    # Hosts counted as social profiles. Subdomains (www., m., ...) match as well.
+    SOCIAL_DOMAINS = (
+        'twitter.com', 'x.com', 'linkedin.com', 'facebook.com',
+        'instagram.com', 'youtube.com', 'github.com',
+    )
+
+    def __init__(self):
+        self.logger = logger.bind(service="AdvertoolsService")
+
+    @staticmethod
+    def _extract_pillar(url: Any) -> str:
+        """Return the top two path segments of *url* ("home" for the root, "other" if unparsable)."""
+        # Function-scope import: the module's import header does not provide urlparse.
+        from urllib.parse import urlparse
+
+        try:
+            parts = urlparse(url).path.strip('/').split('/')
+        except (TypeError, ValueError, AttributeError):
+            # Non-string cells (None, NaN) or malformed URLs
+            return "other"
+        if not parts or not parts[0]:
+            return "home"
+        return "/".join(parts[:2])  # Capture top 2 segments
+
+    @classmethod
+    def _is_social_host(cls, host: Any) -> bool:
+        """Return True when *host* is one of SOCIAL_DOMAINS or a subdomain of one."""
+        if not isinstance(host, str):
+            return False
+        host = host.lower()
+        return any(host == domain or host.endswith("." + domain) for domain in cls.SOCIAL_DOMAINS)
+
+    async def _crawl_to_df(self, url_list: List[str], page_limit: int) -> Optional[Any]:
+        """
+        Crawl *url_list* without following links and load the crawl output.
+
+        The crawl writes to a unique temporary ``.jsonl`` file that is always removed
+        before returning.
+
+        Returns:
+            The crawl output as a DataFrame, or None when the crawl produced no output.
+        """
+        # Only the unique name is needed; the crawl itself writes the file.
+        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
+            temp_file = tf.name
+
+        try:
+            # advertools crawl is blocking
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, lambda: adv.crawl(
+                url_list=url_list,
+                output_file=temp_file,
+                follow_links=False,
+                custom_settings={
+                    'LOG_LEVEL': 'WARNING',
+                    'CLOSESPIDER_PAGECOUNT': page_limit,  # Guardrail: max pages per crawl
+                    'DOWNLOAD_TIMEOUT': 30                # Guardrail: 30s timeout per page
+                }
+            ))
+
+            if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
+                return None
+
+            return pd.read_json(temp_file, lines=True)
+        finally:
+            if os.path.exists(temp_file):
+                try:
+                    os.remove(temp_file)
+                except OSError as e:
+                    self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
+
+    async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
+        """
+        Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.
+
+        Args:
+            sitemap_url: URL handed to ``adv.sitemap_to_df``.
+
+        Returns:
+            On success ``{"success": True, "metrics": {...}, "timestamp": ...}`` where
+            ``metrics`` holds ``total_urls``, ``publishing_velocity``,
+            ``stale_content_count``, ``stale_content_percentage``, ``top_pillars`` and
+            ``audit_sample_urls``. On failure ``{"success": False, "error": <message>}``.
+        """
+        try:
+            self.logger.info(f"Analyzing sitemap: {sitemap_url}")
+
+            # advertools sitemap_to_df is blocking, run in executor
+            loop = asyncio.get_running_loop()
+            df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
+
+            if df is None or df.empty:
+                return {"success": False, "error": "Sitemap is empty or could not be parsed."}
+
+            total_urls = len(df)
+
+            # Convert lastmod to UTC datetimes; unparsable values become NaT
+            has_dates = False
+            if 'lastmod' in df.columns:
+                df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
+                has_dates = not df['lastmod'].isna().all()
+
+            if has_dates:
+                now = datetime.now(df['lastmod'].dt.tz)
+                # NaT rows compare False on both sides, so they are neither recent nor stale
+                recent_count = int((df['lastmod'] > now - timedelta(days=30)).sum())
+                stale_count = int((df['lastmod'] < now - timedelta(days=180)).sum())
+                publishing_velocity = recent_count / 4.0  # URLs per week (30 days taken as 4 weeks)
+            else:
+                publishing_velocity = 0
+                stale_count = 0
+
+            # Content pillars: most common top-two-segment path prefixes
+            pillars = df['loc'].apply(self._extract_pillar).value_counts().head(15).to_dict()
+
+            # Return a sample of URLs for auditing (top 15 most recent if available)
+            if has_dates:
+                audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
+            else:
+                audit_urls = df['loc'].head(15).tolist()
+
+            return {
+                "success": True,
+                "metrics": {
+                    "total_urls": total_urls,
+                    "publishing_velocity": round(publishing_velocity, 2),
+                    "stale_content_count": stale_count,
+                    "stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
+                    "top_pillars": pillars,
+                    "audit_sample_urls": audit_urls
+                },
+                "timestamp": datetime.utcnow().isoformat()
+            }
+        except Exception as e:
+            self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
+            return {"success": False, "error": str(e)}
+
+    async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
+        """
+        Performs a shallow crawl and theme analysis using word frequency.
+        Uses unique temporary files for thread safety.
+
+        Args:
+            url_list: Pages to crawl; at most 15 pages are fetched.
+
+        Returns:
+            On success ``{"success": True, "themes": [...], "page_count": int,
+            "avg_word_count": float, "timestamp": ...}``; ``themes`` are the first 20
+            rows of the word-frequency table. On failure
+            ``{"success": False, "error": <message>}``.
+        """
+        try:
+            self.logger.info(f"Auditing content for {len(url_list)} URLs")
+
+            crawl_df = await self._crawl_to_df(url_list, page_limit=15)
+            if crawl_df is None:
+                return {"success": False, "error": "Crawl failed to generate output or output is empty."}
+
+            # Extract themes using word frequency
+            text_columns = [col for col in ['body_text', 'h1', 'h2', 'title'] if col in crawl_df.columns]
+            if not text_columns:
+                return {"success": False, "error": "No text content found to analyze."}
+
+            # astype(str) keeps a non-string cell (e.g. a numeric title) from breaking the join
+            all_text = " ".join(crawl_df[text_columns].fillna("").astype(str).to_numpy().ravel())
+
+            if not all_text.strip():
+                return {"success": False, "error": "Extracted text is empty."}
+
+            # Relies on word_frequency's default `rm_words` stop-word list; the function
+            # takes no `rm_stopwords` keyword.
+            loop = asyncio.get_running_loop()
+            word_freq = await loop.run_in_executor(None, lambda: adv.word_frequency([all_text]))
+            top_themes = word_freq.head(20).to_dict(orient='records')
+
+            # Additional metric: average whitespace-separated word count of body text
+            avg_word_count = 0
+            if 'body_text' in crawl_df.columns:
+                word_counts = crawl_df['body_text'].fillna("").astype(str).str.split().str.len()
+                avg_word_count = float(word_counts.mean())
+
+            return {
+                "success": True,
+                "themes": top_themes,
+                "page_count": len(crawl_df),
+                "avg_word_count": round(avg_word_count, 1),
+                "timestamp": datetime.utcnow().isoformat()
+            }
+        except Exception as e:
+            self.logger.error(f"Failed to audit content: {str(e)}")
+            return {"success": False, "error": str(e)}
+
+    async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
+        """
+        Analyzes linking patterns and social media presence using unique temporary files.
+
+        Args:
+            url_list: Pages to crawl; at most 10 pages are fetched.
+
+        Returns:
+            On success ``{"success": True, "social_links": [...], "link_stats":
+            {"total_links_found": int, "unique_domains": int}, "timestamp": ...}``.
+            On failure ``{"success": False, "error": <message>}``.
+        """
+        try:
+            self.logger.info(f"Extracting communication style for {len(url_list)} URLs")
+
+            crawl_df = await self._crawl_to_df(url_list, page_limit=10)
+            if crawl_df is None:
+                return {"success": False, "error": "Link extraction crawl failed."}
+
+            # Collect every link found on the crawled pages
+            all_links = []
+            if 'links_url' in crawl_df.columns:
+                for links in crawl_df['links_url'].dropna():
+                    if isinstance(links, str):
+                        # String cells hold several links separated by "@@"
+                        all_links.extend(links.split("@@"))
+                    elif isinstance(links, list):
+                        all_links.extend(links)
+
+            if not all_links:
+                return {
+                    "success": True,
+                    "social_links": [],
+                    "link_stats": {"total_links_found": 0, "unique_domains": 0},
+                    "timestamp": datetime.utcnow().isoformat()
+                }
+
+            # Analyze links
+            link_df = adv.url_to_df(all_links)
+            has_hosts = not link_df.empty and 'netloc' in link_df.columns
+
+            social_links = []
+            unique_domains = 0
+            if has_hosts:
+                social_mask = link_df['netloc'].apply(self._is_social_host)
+                social_links = link_df[social_mask]['url'].unique().tolist()
+                unique_domains = int(link_df['netloc'].nunique())
+
+            return {
+                "success": True,
+                "social_links": social_links,
+                "link_stats": {
+                    "total_links_found": len(all_links),
+                    "unique_domains": unique_domains
+                },
+                "timestamp": datetime.utcnow().isoformat()
+            }
+        except Exception as e:
+            self.logger.error(f"Failed to extract communication style: {str(e)}")
+            return {"success": False, "error": str(e)}
--- a/backend/services/seo/advertools_task_manager.py
+++ b/backend/services/seo/advertools_task_manager.py
@@ -0,0 +1,94 @@
+"""
+Advertools Task Restoration Utility
+Handles creation and restoration of Advertools intelligence tasks for users.
+"""
+
+from datetime import datetime, timedelta
+from typing import Any
+from loguru import logger
+from sqlalchemy import func
+from sqlalchemy.orm import Session
+
+from models.onboarding import WebsiteAnalysis, OnboardingSession
+from models.advertools_monitoring_models import AdvertoolsTask
+from services.database import get_all_user_ids, get_session_for_user
+
+async def restore_advertools_tasks(scheduler: Any) -> int:
+    """
+    Restore/create Advertools tasks for all users who have completed Step 2.
+
+    For each user whose onboarding session has a WebsiteAnalysis with a website URL,
+    makes sure a weekly ``content_audit`` task and a weekly ``site_health`` task exist,
+    creating whichever is missing. An error for one user is logged and does not stop
+    processing of the remaining users.
+
+    Args:
+        scheduler: Accepted for the caller's interface; not used by this function.
+
+    Returns:
+        Number of tasks created and successfully committed.
+    """
+    logger.info("Restoring Advertools intelligence tasks...")
+    total_created = 0
+
+    # (payload type, days until first execution, label used in the log message)
+    task_specs = (
+        ("content_audit", 1, "content audit"),
+        ("site_health", 2, "site health"),
+    )
+
+    for user_id in get_all_user_ids():
+        try:
+            db = get_session_for_user(user_id)
+            if not db:
+                continue
+
+            try:
+                # Check if user has completed Step 2 (has WebsiteAnalysis)
+                session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
+                if not session:
+                    continue
+
+                analysis = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
+                if not analysis or not analysis.website_url:
+                    continue
+
+                created_labels = []
+                for task_type, start_in_days, label in task_specs:
+                    existing = db.query(AdvertoolsTask).filter(
+                        AdvertoolsTask.user_id == user_id,
+                        func.json_extract(AdvertoolsTask.payload, '$.type') == task_type
+                    ).first()
+                    if existing:
+                        continue
+
+                    db.add(AdvertoolsTask(
+                        user_id=user_id,
+                        website_url=analysis.website_url,
+                        status='active',
+                        next_execution=datetime.utcnow() + timedelta(days=start_in_days),
+                        frequency_days=7,
+                        payload={
+                            "type": task_type,
+                            "website_url": analysis.website_url
+                        }
+                    ))
+                    created_labels.append(label)
+
+                db.commit()
+
+                # Count and announce only after the commit succeeded, so a failed
+                # commit does not inflate the returned total.
+                total_created += len(created_labels)
+                for label in created_labels:
+                    logger.info(f"Created weekly {label} task for user {user_id}")
+            finally:
+                db.close()
+        except Exception as e:
+            logger.error(f"Error restoring Advertools tasks for user {user_id}: {e}")
+
+    return total_created
--- a/backend/services/seo/competitive_analyzer.py
+++ b/backend/services/seo/competitive_analyzer.py
@@ -12,8 +12,7 @@ from sqlalchemy.orm import Session
 from loguru import logger

 from utils.logger_utils import get_service_logger
-from services.onboarding.data_service import OnboardingDataService
-from services.calendar_generation_datasource_framework.data_processing.comprehensive_user_data import ComprehensiveUserDataProcessor
+from api.content_planning.services.content_strategy.onboarding import OnboardingDataIntegrationService

 logger = get_service_logger("competitive_analyzer")

@@ -23,8 +22,7 @@ class CompetitiveAnalyzer:
    def __init__(self, db: Session):
        """Initialize the competitive analyzer."""
        self.db = db
-        self.user_data_service = OnboardingDataService(db)
-        self.comprehensive_processor = ComprehensiveUserDataProcessor(db)
+        self.integration_service = OnboardingDataIntegrationService()
    
    async def get_competitive_insights(self, user_id: str) -> Dict[str, Any]:
        """
@@ -37,8 +35,9 @@ class CompetitiveAnalyzer:
            Dictionary containing competitive insights
        """
        try:
-            # Get user's research preferences and competitor data
-            research_prefs = self.user_data_service.get_user_research_preferences(user_id)
+            # Get user's research preferences and competitor data via SSOT
+            onboarding_data = await self.integration_service.process_onboarding_data(user_id, self.db)
+            research_prefs = onboarding_data.get('research_preferences', {})
            competitors = research_prefs.get('competitors', []) if research_prefs else []
            
            if not competitors:
@@ -51,9 +50,8 @@ class CompetitiveAnalyzer:
                    "last_updated": datetime.now().isoformat()
                }
            
-            # Get comprehensive user data including competitor analysis
-            comprehensive_data = self.comprehensive_processor.get_comprehensive_user_data(user_id)
-            competitor_analysis = comprehensive_data.get('competitor_analysis', {})
+            # Get competitor analysis directly from SSOT data
+            competitor_analysis = onboarding_data.get('competitor_analysis', {})
            
            # Extract competitor keywords and content topics
            competitor_keywords = self._extract_competitor_keywords(competitor_analysis, competitors)
@@ -300,6 +298,7 @@ class CompetitiveAnalyzer:
                else:
                    keyword_map[keyword] = {
                        'keyword': kw['keyword'],
+                        'competitor': kw['competitor'],  # Primary competitor
                        'competitors': [kw['competitor']],
                        'source': kw['source'],
                        'volume_estimate': kw['volume_estimate'],
--- a/backend/services/seo/dashboard_service.py
+++ b/backend/services/seo/dashboard_service.py
@@ -9,6 +9,7 @@ OAuth connections from onboarding step 5.
 from typing import Dict, Any, Optional, List
 from datetime import datetime, timedelta
 from sqlalchemy.orm import Session
+from sqlalchemy import func
 from loguru import logger

 from utils.logger_utils import get_service_logger
@@ -16,9 +17,12 @@ from services.gsc_service import GSCService
 from services.integrations.bing_oauth import BingOAuthService
 from services.bing_analytics_storage_service import BingAnalyticsStorageService
 from services.analytics_cache_service import AnalyticsCacheService
-from services.onboarding.data_service import OnboardingDataService
+from api.content_planning.services.content_strategy.onboarding.data_integration import OnboardingDataIntegrationService
 from .analytics_aggregator import AnalyticsAggregator
 from .competitive_analyzer import CompetitiveAnalyzer
+from models.onboarding import SEOPageAudit, WebsiteAnalysis, OnboardingSession
+from models.website_analysis_monitoring_models import OnboardingFullWebsiteAnalysisTask
+from models.advertools_monitoring_models import AdvertoolsTask

 logger = get_service_logger("seo_dashboard")

@@ -30,12 +34,19 @@ class SEODashboardService:
        self.db = db
        self.gsc_service = GSCService()
        self.bing_oauth = BingOAuthService()
-        self.bing_storage = BingAnalyticsStorageService("sqlite:///alwrity.db")
+        # Bing storage is initialized per-user dynamically
        self.analytics_cache = AnalyticsCacheService()
-        self.user_data_service = OnboardingDataService(db)
+        self.integration_service = OnboardingDataIntegrationService()
        self.analytics_aggregator = AnalyticsAggregator()
        self.competitive_analyzer = CompetitiveAnalyzer(db)
        
+    def _get_bing_storage(self, user_id: str) -> BingAnalyticsStorageService:
+        """Build a Bing analytics storage service pointed at the given user's SQLite database."""
+        # Function-scope import of the per-user database path helper
+        from services.database import get_user_db_path
+
+        db_path = get_user_db_path(user_id)
+        return BingAnalyticsStorageService(f"sqlite:///{db_path}")
+        
    async def get_platform_status(self, user_id: str) -> Dict[str, Any]:
        """Get connection status for GSC and Bing platforms."""
        try:
@@ -81,8 +92,10 @@ class SEODashboardService:
        try:
            # Get user's website URL if not provided
            if not site_url:
-                # Try to get from website analysis first
-                website_analysis = self.user_data_service.get_user_website_analysis(int(user_id))
+                # Use SSOT for onboarding data
+                onboarding_data = await self.integration_service.process_onboarding_data(user_id, self.db)
+                website_analysis = onboarding_data.get('website_analysis', {})
+                
                if website_analysis and website_analysis.get('website_url'):
                    site_url = website_analysis['website_url']
                else:
@@ -115,6 +128,10 @@ class SEODashboardService:
            
            # Generate AI insights
            ai_insights = await self._generate_ai_insights(summary, timeseries, competitor_insights)
+
+            technical_seo_audit = self._get_technical_seo_audit_overview(user_id, site_url)
+            
+            advertools_insights = self._get_advertools_insights(user_id, site_url)
            
            return {
                "website_url": site_url,
@@ -124,12 +141,71 @@ class SEODashboardService:
                "competitor_insights": competitor_insights,
                "health_score": health_score,
                "ai_insights": ai_insights,
+                "technical_seo_audit": technical_seo_audit,
+                "advertools_insights": advertools_insights,
                "last_updated": datetime.now().isoformat()
            }
            
        except Exception as e:
            logger.error(f"Error getting dashboard overview for user {user_id}: {e}")
            raise
+
+    def _get_technical_seo_audit_overview(self, user_id: str, site_url: str) -> Dict[str, Any]:
+        """
+        Summarize stored page-level SEO audits and the full-site analysis task for a user.
+
+        Args:
+            user_id: Owner of the audits; compared as a string.
+            site_url: URL prefix used to scope rows; a trailing "/" is ignored.
+                When empty, rows are not filtered by site.
+
+        Returns:
+            Dict with ``status`` ("ready", "scheduled" or "pending"), ``task_status``,
+            ``next_execution``, ``pages_audited``, ``avg_score``, ``fix_scheduled_pages``
+            and ``worst_pages`` (up to 10 lowest-scoring pages). If anything fails, a
+            dict with ``status`` "error", the ``error`` message and zeroed counters.
+        """
+        site_key = (site_url or "").rstrip("/")
+
+        try:
+            audit_query = self.db.query(SEOPageAudit).filter(SEOPageAudit.user_id == str(user_id))
+            task_query = self.db.query(OnboardingFullWebsiteAnalysisTask).filter(
+                OnboardingFullWebsiteAnalysisTask.user_id == str(user_id)
+            )
+
+            if site_key:
+                # autoescape keeps "%" and "_" inside the URL from acting as LIKE wildcards
+                audit_query = audit_query.filter(
+                    SEOPageAudit.website_url.startswith(site_key, autoescape=True)
+                )
+                task_query = task_query.filter(
+                    OnboardingFullWebsiteAnalysisTask.website_url.startswith(site_key, autoescape=True)
+                )
+
+            # Lowest scores first; pages without a score sort as 1000
+            audits = audit_query.order_by(func.coalesce(SEOPageAudit.overall_score, 1000).asc()).all()
+
+            pages_audited = len(audits)
+            # Accept int and float scores; bool is excluded because it subclasses int
+            scores = [
+                a.overall_score for a in audits
+                if isinstance(a.overall_score, (int, float)) and not isinstance(a.overall_score, bool)
+            ]
+            avg_score = round(sum(scores) / len(scores)) if scores else 0
+            fix_scheduled_pages = sum(1 for a in audits if a.status == 'fix_scheduled')
+
+            worst_pages = [
+                {
+                    "page_url": a.page_url,
+                    "overall_score": a.overall_score,
+                    "status": a.status,
+                    "issues_count": len(a.issues) if isinstance(a.issues, list) else 0
+                }
+                for a in audits[:10]
+            ]
+
+            # Most recently updated analysis task for this user/site
+            task = task_query.order_by(OnboardingFullWebsiteAnalysisTask.updated_at.desc()).first()
+
+            task_status = None
+            next_execution = None
+            if task:
+                task_status = task.status
+                next_execution = task.next_execution.isoformat() if task.next_execution else None
+
+            if pages_audited > 0:
+                status = "ready"
+            elif task_status == "active":
+                status = "scheduled"
+            else:
+                status = "pending"
+
+            return {
+                "status": status,
+                "task_status": task_status,
+                "next_execution": next_execution,
+                "pages_audited": pages_audited,
+                "avg_score": avg_score,
+                "fix_scheduled_pages": fix_scheduled_pages,
+                "worst_pages": worst_pages
+            }
+        except Exception as e:
+            logger.warning(f"Failed to build technical SEO audit overview for user {user_id}: {e}")
+            return {
+                "status": "error",
+                "error": str(e),
+                "pages_audited": 0,
+                "avg_score": 0,
+                "fix_scheduled_pages": 0,
+                "worst_pages": []
+            }
    
    async def get_gsc_data(self, user_id: str, site_url: Optional[str] = None) -> Dict[str, Any]:
        """Get GSC data for the specified site."""
@@ -181,13 +257,15 @@ class SEODashboardService:
            
            # Get data from Bing storage service
            if site_url:
-                bing_data = self.bing_storage.get_analytics_summary(user_id, site_url, days=30)
+                bing_storage = self._get_bing_storage(user_id)
+                bing_data = bing_storage.get_analytics_summary(user_id, site_url, days=30)
            else:
                # Get all sites for user
                sites = self._get_bing_sites(user_id)
                if sites:
                    logger.info(f"Using first Bing site for analysis: {sites[0]}")
-                    bing_data = self.bing_storage.get_analytics_summary(user_id, sites[0], days=30)
+                    bing_storage = self._get_bing_storage(user_id)
+                    bing_data = bing_storage.get_analytics_summary(user_id, sites[0], days=30)
                else:
                    logger.warning(f"No Bing sites found for user {user_id}")
                    return {"error": "No Bing sites found", "data": [], "status": "disconnected"}
@@ -249,6 +327,46 @@ class SEODashboardService:
                "last_updated": datetime.now().isoformat()
            }
    
+    def _get_advertools_insights(self, user_id: str, site_url: str) -> Dict[str, Any]:
+        """
+        Collect Advertools-derived insights stored on WebsiteAnalysis plus AdvertoolsTask statuses.
+
+        Returns an empty dict when the user has no onboarding session or when any lookup
+        fails. ``site_url`` is accepted but not read by this method.
+        """
+        try:
+            # Persona/SEO data hangs off the user's onboarding session
+            onboarding = self.db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
+            if not onboarding:
+                return {}
+
+            analysis = self.db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == onboarding.id).first()
+
+            # Status of each task type; stays "pending" when no such task exists
+            user_tasks = self.db.query(AdvertoolsTask).filter(AdvertoolsTask.user_id == user_id).all()
+
+            audit_status = "pending"
+            health_status = "pending"
+
+            for task in user_tasks:
+                if task.payload:
+                    task_type = task.payload.get('type')
+                else:
+                    task_type = None
+
+                if task_type == 'content_audit':
+                    audit_status = task.status
+                elif task_type == 'site_health':
+                    health_status = task.status
+
+            # Missing analysis row or empty JSON columns both fall back to {}
+            if analysis:
+                brand_analysis = analysis.brand_analysis or {}
+                seo_audit = analysis.seo_audit or {}
+            else:
+                brand_analysis = {}
+                seo_audit = {}
+
+            task_statuses = {
+                "content_audit": audit_status,
+                "site_health": health_status
+            }
+
+            return {
+                "augmented_themes": brand_analysis.get('augmented_themes', []),
+                "last_audit": brand_analysis.get('last_advertools_audit'),
+                "site_health": seo_audit.get('site_health', {}),
+                "last_health_check": seo_audit.get('last_advertools_health_check'),
+                "tasks": task_statuses
+            }
+        except Exception as e:
+            logger.warning(f"Failed to fetch Advertools insights for user {user_id}: {e}")
+            return {}
+
    def _get_gsc_sites(self, user_id: str) -> List[str]:
        """Get GSC sites for user."""
        try:
@@ -394,4 +512,4 @@ class SEODashboardService:
            
        except Exception as e:
            logger.error(f"Error generating AI insights: {e}")
-            return []
+            return []
--- a/backend/services/seo/deep_competitor_analysis_service.py
+++ b/backend/services/seo/deep_competitor_analysis_service.py
@@ -0,0 +1,603 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import re
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Tuple
+from urllib.parse import urlparse
+
+from services.component_logic.web_crawler_logic import WebCrawlerLogic
+from services.llm_providers.main_text_generation import llm_text_gen
+from services.ai_service_manager import AIServiceManager, AIServiceType
+from services.seo_tools.sitemap_service import SitemapService
+from services.seo.advertools_service import AdvertoolsService
+from utils.logger_utils import get_service_logger
+
+logger = get_service_logger("deep_competitor_analysis")
+
+
+class DeepCompetitorAnalysisService:
+    def __init__(self) -> None:
+        """Create the collaborators this service delegates to."""
+        # Crawler used by _crawl_competitors to fetch each competitor page.
+        self.crawler = WebCrawlerLogic()
+        # Advertools wrapper used by the weekly brief for sitemap analysis and content audits.
+        self.advertools = AdvertoolsService()
+
+    async def run(
+        self,
+        *,
+        user_id: str,
+        website_analysis: Dict[str, Any],
+        competitors: List[Dict[str, Any]],
+        max_competitors: int = 25,
+        crawl_concurrency: int = 4
+    ) -> Dict[str, Any]:
+        """
+        Run the full competitor analysis pipeline.
+
+        Steps: build the user baseline, normalize the competitor list, crawl
+        every competitor, analyze each crawl with AI (one after another), then
+        ask AI for an aggregated market view.
+
+        Args:
+            user_id: Forwarded to the AI calls.
+            website_analysis: The user's own analysis, reduced to a baseline.
+            competitors: Raw competitor dicts.
+            max_competitors: Upper bound applied while normalizing competitors.
+            crawl_concurrency: Maximum number of simultaneous crawls.
+
+        Returns:
+            Dict with ``baseline``, ``competitors`` (input/extraction/ai_analysis
+            per competitor), ``aggregation`` and ``metadata``.
+        """
+        baseline = self._build_baseline(website_analysis)
+        targets = self._normalize_competitors(competitors, max_competitors=max_competitors)
+        crawled = await self._crawl_competitors(targets, crawl_concurrency=crawl_concurrency)
+
+        analyzed: List[Dict[str, Any]] = []
+        for target, page in crawled:
+            artifact = self._build_extraction_artifact(target, page)
+            verdict = await self._analyze_competitor_with_ai(
+                user_id=user_id,
+                baseline=baseline,
+                competitor_input=target,
+                extraction=artifact
+            )
+            analyzed.append({"input": target, "extraction": artifact, "ai_analysis": verdict})
+
+        market_view = await self._aggregate_with_ai(
+            user_id=user_id,
+            baseline=baseline,
+            competitors=analyzed
+        )
+
+        # Timestamp is taken last so it reflects when the report was completed.
+        metadata = {
+            "generated_at": datetime.utcnow().isoformat(),
+            "competitors_requested": len(targets),
+            "competitors_analyzed": len(analyzed),
+            "crawl_concurrency": crawl_concurrency
+        }
+        return {
+            "baseline": baseline,
+            "competitors": analyzed,
+            "aggregation": market_view,
+            "metadata": metadata
+        }
+
+    async def generate_weekly_strategy_brief(
+        self,
+        *,
+        user_id: str,
+        website_analysis: Dict[str, Any],
+        competitors: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Generates a weekly strategic intelligence brief by analyzing
+        recent competitor changes and market shifts.
+
+        Args:
+            user_id: Forwarded to the sitemap and AI services.
+            website_analysis: The user's own analysis; reduced to a baseline via
+                ``_build_baseline``.
+            competitors: Raw competitor dicts; normalized, de-duplicated and
+                capped at 10.
+
+        Returns:
+            Report dict with ``week_commencing``, ``generated_at``, ``metrics``,
+            ``insights`` and ``raw_data``. A competitor whose sitemap processing
+            raises is logged and left out of the report rather than failing it.
+        """
+        def payload_list(result: Any, key: str) -> List[Any]:
+            """Return ``result['data'][key]`` when the call succeeded and it is a list, else ``[]``."""
+            if not isinstance(result, dict) or not result.get("success"):
+                return []
+            data = result.get("data")
+            items = data.get(key) if isinstance(data, dict) else None
+            return items if isinstance(items, list) else []
+
+        sitemap_service = SitemapService()
+        ai_manager = AIServiceManager()
+
+        # Stage 1: Data Collection (User + Competitors)
+        baseline = self._build_baseline(website_analysis)
+        normalized_competitors = self._normalize_competitors(competitors, max_competitors=10)
+
+        # Fetch competitor sitemaps for recent changes
+        competitor_changes = []
+        seven_days_ago = datetime.utcnow() - timedelta(days=7)
+        ninety_days_ago = datetime.utcnow() - timedelta(days=90)
+
+        for comp in normalized_competitors:
+            try:
+                # Stage 1: Advertools Deep Intelligence
+                # Discover exact sitemap URL first (essential for Advertools)
+                discovered_sitemap = await sitemap_service.discover_sitemap_url(comp['url'])
+                effective_url = discovered_sitemap if discovered_sitemap else comp['url']
+
+                adv_result = await self.advertools.analyze_sitemap(effective_url)
+                # Only trust Advertools metrics from a successful, well-formed result.
+                adv_metrics: Dict[str, Any] = {}
+                if isinstance(adv_result, dict) and adv_result.get('success'):
+                    metrics = adv_result.get('metrics')
+                    if isinstance(metrics, dict):
+                        adv_metrics = metrics
+
+                # REUSE: Use existing SitemapService.analyze_sitemap for robust Stage 1 & 2
+                analysis_result = await sitemap_service.analyze_sitemap(
+                    sitemap_url=effective_url,
+                    analyze_content_trends=True,
+                    analyze_publishing_patterns=True,
+                    include_ai_insights=False,
+                    user_id=user_id
+                )
+
+                if analysis_result and analysis_result.get('urls'):
+                    urls = analysis_result['urls']
+                    structure = analysis_result.get('structure_analysis') or {}
+
+                    # Enhancement 1: Keyword Clustering (NLP from URLs) - REUSE from SitemapService
+                    keyword_clusters = structure.get('keyword_clusters')
+                    if not isinstance(keyword_clusters, dict):
+                        # Keeps the later cross-competitor aggregation from failing on None/odd shapes.
+                        keyword_clusters = {}
+
+                    # Enhancement 2: Strategic Pillar Mapping - REUSE from SitemapService
+                    pillars = structure.get('strategic_pillars') or {}
+
+                    # Enhancement 3: Advertools Site Hierarchy (from folders)
+                    site_hierarchy = adv_metrics.get('top_pillars', {})
+
+                    # Enhancement 4: Content Cadence Trend (Last 7 days vs 90 days)
+                    recent_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), seven_days_ago)]
+                    historical_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), ninety_days_ago)]
+
+                    recent_velocity = len(recent_urls) / 7
+                    historical_velocity = len(historical_urls) / 90
+                    # The 0.01 floor avoids dividing by zero when nothing was published in 90 days.
+                    cadence_shift = ((recent_velocity - historical_velocity) / max(historical_velocity, 0.01)) * 100
+
+                    # Advertools Word Frequency (Audit top 5 recent URLs)
+                    top_themes = []
+                    if recent_urls:
+                        audit_urls = [u['loc'] for u in recent_urls[:5]]
+                        # Use thread-safe audit_content from AdvertoolsService
+                        audit_result = await self.advertools.audit_content(audit_urls)
+                        if isinstance(audit_result, dict) and audit_result.get('success'):
+                            top_themes = audit_result.get('themes', [])
+
+                    competitor_changes.append({
+                        "domain": comp['domain'],
+                        "name": comp['name'],
+                        "new_content_count": len(recent_urls),
+                        "recent_topics": [self._extract_topic_from_url(u['loc']) for u in recent_urls[:10]],
+                        "total_pages": len(urls),
+                        "keyword_clusters": keyword_clusters,
+                        "strategic_pillars": pillars,
+                        "site_hierarchy": site_hierarchy,
+                        "top_themes": top_themes,
+                        "cadence_shift_percent": round(cadence_shift, 1),
+                        "publishing_velocity": round(recent_velocity, 2),
+                        "stale_content_pct": adv_metrics.get('stale_content_percentage', 0)
+                    })
+            except Exception as e:
+                # Best effort: one unreachable competitor must not abort the whole brief.
+                logger.warning(f"Failed to fetch sitemap for {comp['domain']}: {e}")
+
+        # Stage 2: Differential Analysis (Non-AI Aggregation)
+        avg_competitor_velocity = sum(c['publishing_velocity'] for c in competitor_changes) / len(competitor_changes) if competitor_changes else 0
+        market_clusters = self._aggregate_clusters([c['keyword_clusters'] for c in competitor_changes])
+
+        # Stage 3: AI Strategic Intelligence
+        # Extract rich user context from baseline
+        brand_analysis = baseline.get("brand_analysis", {})
+        seo_audit = baseline.get("seo_audit", {})
+
+        user_niche = brand_analysis.get("industry") or "General Business"
+        user_topics = brand_analysis.get("topics") or []
+        seo_keywords = seo_audit.get("keywords")
+        if not user_topics and isinstance(seo_keywords, (list, tuple)):
+            # Only slice real sequences; a str or dict here would yield garbage or raise.
+            user_topics = list(seo_keywords[:5])
+
+        # _build_baseline does not copy target_audience, so fall back to the raw analysis.
+        target_audience = baseline.get("target_audience")
+        if not target_audience and isinstance(website_analysis, dict):
+            target_audience = website_analysis.get("target_audience")
+
+        analysis_context = {
+            "user_profile": {
+                "website_url": baseline.get("website_url"),
+                "industry": user_niche,
+                "niche_description": brand_analysis.get("description") or brand_analysis.get("summary") or "",
+                "core_topics": user_topics,
+                "target_audience": target_audience or {},
+                "business_objectives": brand_analysis.get("objectives") or "Growth",
+                "brand_voice": brand_analysis.get("voice") or "Professional",
+                "augmented_themes": brand_analysis.get("augmented_themes", []) # Added from Advertools
+            },
+            "market_intelligence": {
+                "market_clusters": market_clusters,
+                "competitors_analyzed_count": len(competitor_changes),
+                "market_opportunities_detected": ["Content Velocity Gap", "Topic Authority Shift", "Stale Content Replacement"],
+                "competitor_hierarchies": {c['name']: c['site_hierarchy'] for c in competitor_changes},
+                "competitor_content_themes": {c['name']: c['top_themes'] for c in competitor_changes}
+            },
+            "competitive_landscape_detailed": competitor_changes,
+        }
+
+        # Call AI for strategic intelligence
+        strategic_intelligence = await ai_manager.generate_strategic_intelligence(analysis_context, user_id=user_id)
+        content_gaps = await ai_manager.generate_content_gap_analysis(analysis_context, user_id=user_id)
+
+        # An empty or missing insight list must not raise IndexError while assembling the report.
+        strategic_insights = payload_list(strategic_intelligence, "strategic_insights")
+        content_recommendations = payload_list(content_gaps, "content_recommendations")
+
+        # Stage 4: Result Assembly
+        report = {
+            "week_commencing": seven_days_ago.date().isoformat(),
+            "generated_at": datetime.utcnow().isoformat(),
+            "metrics": {
+                "market_velocity": round(avg_competitor_velocity, 2),
+                "market_clusters": market_clusters[:5],
+                "aggressive_competitors": [c['name'] for c in competitor_changes if c['cadence_shift_percent'] > 50]
+            },
+            "insights": {
+                "the_big_move": strategic_insights[0] if strategic_insights else {},
+                "low_hanging_fruit": content_recommendations,
+                "threat_alerts": strategic_insights[1:]
+            },
+            "raw_data": {
+                "competitor_changes": competitor_changes
+            }
+        }
+
+        return report
+
+    def _is_newer_than(self, lastmod: Optional[str], threshold: datetime) -> bool:
+        """
+        Report whether a sitemap ``lastmod`` value is strictly later than ``threshold``.
+
+        Timezone-aware values are converted to UTC before comparing, so an offset
+        such as ``+05:30`` is honoured instead of being discarded. Naive values
+        are compared as they are.
+
+        Args:
+            lastmod: ISO-8601 date/datetime string (a trailing ``Z`` is accepted)
+                or an already-parsed ``datetime``. Falsy values yield ``False``.
+            threshold: Cut-off moment; an aware threshold is normalized to UTC too.
+
+        Returns:
+            ``True`` only when ``lastmod`` parses and is later than ``threshold``;
+            ``False`` for missing or unparseable values.
+        """
+        def to_naive_utc(value: datetime) -> datetime:
+            """Shift an aware datetime to UTC and drop tzinfo; leave naive values alone."""
+            offset = value.utcoffset()
+            if offset is None:
+                return value.replace(tzinfo=None)
+            return value.replace(tzinfo=None) - offset
+
+        if not lastmod:
+            return False
+        try:
+            if isinstance(lastmod, datetime):
+                moment = lastmod
+            else:
+                # fromisoformat() only understands a trailing "Z" from Python 3.11 on.
+                moment = datetime.fromisoformat(lastmod.strip().replace('Z', '+00:00'))
+            return to_naive_utc(moment) > to_naive_utc(threshold)
+        except (ValueError, TypeError, AttributeError, OverflowError):
+            # Unparseable or unexpected lastmod values simply count as "not newer".
+            return False
+
+    def _aggregate_clusters(self, clusters_list: List[Dict[str, int]]) -> List[str]:
+        """
+        Aggregate clusters across competitors to find market-wide themes.
+
+        Each competitor contributes one vote per cluster key it has; the values
+        inside each cluster dict are ignored. Entries that are not dicts (for
+        example ``None``) are skipped instead of raising.
+
+        Args:
+            clusters_list: One cluster dict per competitor.
+
+        Returns:
+            Up to 10 cluster keys, most widely shared first; ties keep
+            first-seen order.
+        """
+        master: Dict[str, int] = {}
+        for cluster in clusters_list or []:
+            if not isinstance(cluster, dict):
+                continue
+            for key in cluster:
+                master[key] = master.get(key, 0) + 1 # Count competitor occurrences
+        # sorted() is stable even with reverse=True, so ties keep insertion order.
+        return sorted(master, key=master.__getitem__, reverse=True)[:10]
+
+    def _extract_topic_from_url(self, url: str) -> str:
+        """
+        Helper to get a readable topic from a URL slug.
+
+        The last path segment has ``-`` and ``_`` replaced by spaces and is
+        capitalized.
+
+        Args:
+            url: Page URL.
+
+        Returns:
+            The humanized slug, or ``"New Content"`` when the URL cannot be
+            parsed or has no path segment (for example a site root).
+        """
+        try:
+            path = urlparse(url).path
+            slug = path.strip('/').split('/')[-1]
+        except (ValueError, TypeError, AttributeError):
+            # urlparse raises ValueError on malformed hosts; non-str input fails on str methods.
+            return "New Content"
+        topic = slug.replace('-', ' ').replace('_', ' ').strip()
+        # An empty slug would otherwise surface as a blank topic in the report.
+        return topic.capitalize() if topic else "New Content"
+
+    def _build_baseline(self, website_analysis: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Reduce a website analysis to the fixed set of baseline fields.
+
+        Args:
+            website_analysis: Source analysis; anything that is not a dict is
+                treated as empty.
+
+        Returns:
+            Dict with ``website_url`` (possibly ``None``) followed by five
+            section keys, each defaulting to ``{}`` when missing or falsy.
+        """
+        source = website_analysis if isinstance(website_analysis, dict) else {}
+
+        sections = (
+            "brand_analysis",
+            "content_strategy_insights",
+            "seo_audit",
+            "style_guidelines",
+            "style_patterns",
+        )
+        baseline: Dict[str, Any] = {"website_url": source.get("website_url")}
+        for section in sections:
+            baseline[section] = source.get(section) or {}
+        return baseline
+
+    def _normalize_competitors(self, competitors: List[Dict[str, Any]], *, max_competitors: int) -> List[Dict[str, Any]]:
+        """
+        Clean, de-duplicate and cap the raw competitor list.
+
+        The URL is taken from ``url``, ``website_url`` or ``domain`` (first
+        non-empty), normalized with ``_normalize_url`` and de-duplicated by the
+        domain from ``_extract_domain``. Non-dict entries and entries without a
+        usable URL are dropped.
+
+        Args:
+            competitors: Raw competitor dicts; a non-list yields ``[]``.
+            max_competitors: Maximum number of entries returned; zero or a
+                negative value yields ``[]``.
+
+        Returns:
+            Dicts with ``url``, ``domain``, ``name`` and ``summary``, in input order.
+        """
+        # Check the cap up front: testing it only after an append let one entry through for a cap of 0.
+        if not isinstance(competitors, list) or max_competitors <= 0:
+            return []
+
+        seen_domains = set()
+        normalized: List[Dict[str, Any]] = []
+
+        for comp in competitors:
+            if not isinstance(comp, dict):
+                continue
+
+            raw_url = comp.get("url") or comp.get("website_url") or comp.get("domain") or ""
+            url = self._normalize_url(raw_url)
+            if not url:
+                continue
+
+            domain = self._extract_domain(url)
+            if not domain or domain in seen_domains:
+                continue
+
+            seen_domains.add(domain)
+            normalized.append({
+                "url": url,
+                "domain": domain,
+                "name": comp.get("name") or comp.get("title") or domain,
+                "summary": comp.get("summary") or comp.get("description") or ""
+            })
+
+            if len(normalized) >= max_competitors:
+                break
+
+        return normalized
+
+    def _normalize_url(self, raw: str) -> Optional[str]:
+        """
+        Reduce a user-supplied URL or bare domain to ``scheme://netloc``.
+
+        Input without an ``http``/``https`` scheme gets ``https://`` prepended.
+        The scheme check is case-insensitive and protocol-relative input
+        (``//host``) is accepted; path, query and fragment are discarded.
+
+        Args:
+            raw: URL or domain text.
+
+        Returns:
+            The normalized origin, or ``None`` for empty, non-string or
+            unparseable input.
+        """
+        if not raw or not isinstance(raw, str):
+            return None
+
+        raw = raw.strip()
+        if not raw:
+            return None
+
+        if raw.startswith("//"):
+            # Protocol-relative URL: only the scheme is missing.
+            raw = "https:" + raw
+        elif not raw.lower().startswith(("http://", "https://")):
+            # Compare case-insensitively so "HTTPS://x" is not prefixed a second time.
+            raw = "https://" + raw
+
+        try:
+            parsed = urlparse(raw)
+        except ValueError:
+            return None
+        if not parsed.scheme or not parsed.netloc:
+            return None
+        return f"{parsed.scheme}://{parsed.netloc}"
+
+    def _extract_domain(self, url: str) -> Optional[str]:
+        """
+        Return the lower-cased network location of ``url`` without a leading ``www.``.
+
+        Args:
+            url: Absolute URL.
+
+        Returns:
+            The host part (a port, if present, is kept), or ``None`` when the
+            URL has no network location or cannot be parsed.
+        """
+        prefix = "www."
+        try:
+            host = urlparse(url).netloc
+            host = host.lower() if host else ""
+            if host.startswith(prefix):
+                host = host[len(prefix):]
+            if not host:
+                return None
+            return host
+        except Exception:
+            return None
+
+    async def _crawl_competitors(
+        self,
+        competitors: List[Dict[str, Any]],
+        *,
+        crawl_concurrency: int
+    ) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
+        """
+        Crawl every competitor concurrently, bounded by a semaphore.
+
+        Args:
+            competitors: Competitor dicts carrying a ``url`` key.
+            crawl_concurrency: Maximum simultaneous crawls (values below 1 are
+                treated as 1).
+
+        Returns:
+            ``(competitor, crawl_result)`` pairs in input order. A missing URL or
+            a crawler exception becomes ``{"success": False, "error": ...}``
+            instead of propagating.
+        """
+        gate = asyncio.Semaphore(max(1, int(crawl_concurrency)))
+
+        async def fetch(entry: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+            async with gate:
+                target = entry.get("url")
+                if not target:
+                    return entry, {"success": False, "error": "missing_url"}
+                try:
+                    outcome = await self.crawler.crawl_website(target)
+                except Exception as exc:
+                    outcome = {"success": False, "error": str(exc)}
+                return entry, outcome
+
+        pending = [fetch(entry) for entry in competitors]
+        return await asyncio.gather(*pending)
+
+    def _build_extraction_artifact(self, competitor_input: Dict[str, Any], crawl_result: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Turn a raw crawl result into the compact artifact handed to the AI step.
+
+        Args:
+            competitor_input: Normalized competitor dict (not read here).
+            crawl_result: Crawler output; expected to carry ``success`` and a
+                ``content`` dict.
+
+        Returns:
+            For a failed or malformed crawl, only a ``fetch_status`` with status
+            ``"failed"``. Otherwise ``fetch_status``, ``page_meta``,
+            ``structure``, ``signals`` and a ``content_excerpt`` of at most
+            2000 characters.
+        """
+        if not isinstance(crawl_result, dict):
+            return {"fetch_status": {"status": "failed", "error": "unknown_error"}}
+        if not crawl_result.get("success"):
+            return {"fetch_status": {"status": "failed", "error": crawl_result.get("error")}}
+
+        raw_content = crawl_result.get("content")
+        content = raw_content if isinstance(raw_content, dict) else {}
+
+        def typed(key: str, expected: type) -> Any:
+            """Return ``content[key]`` when it has the expected type, else an empty instance."""
+            value = content.get(key)
+            return value if isinstance(value, expected) else expected()
+
+        headings = typed("headings", list)
+        links = typed("links", list)
+        meta_tags = typed("meta_tags", dict)
+        main_content = content.get("main_content") or ""
+
+        fetch_status = {
+            "status": "ok",
+            "fetched_url": crawl_result.get("url"),
+            "timestamp": crawl_result.get("timestamp")
+        }
+        page_meta = {
+            "title": content.get("title") or "",
+            "meta_description": content.get("description") or "",
+            "og_title": meta_tags.get("og:title"),
+            "og_description": meta_tags.get("og:description")
+        }
+        structure = {
+            # Only string headings are kept, capped at 25.
+            "headings": [h for h in headings if isinstance(h, str)][:25],
+            "nav_labels": self._extract_nav_labels(links),
+            "content_structure": typed("content_structure", dict)
+        }
+        signals = {
+            "cta_signals": self._extract_cta_signals(main_content, links),
+            "proof_signals": self._extract_proof_signals(main_content, links)
+        }
+
+        return {
+            "fetch_status": fetch_status,
+            "page_meta": page_meta,
+            "structure": structure,
+            "signals": signals,
+            "content_excerpt": main_content.strip()[:2000]
+        }
+
+    def _extract_nav_labels(self, links: List[Dict[str, Any]]) -> List[str]:
+        """
+        Collect short, distinct link texts to use as navigation labels.
+
+        Only the first 200 links are inspected. Texts are stripped; empty texts
+        and texts longer than 50 characters are ignored. Duplicates are detected
+        case-insensitively and the first spelling wins.
+
+        Args:
+            links: Link dicts with a ``text`` key; non-dict entries are skipped.
+
+        Returns:
+            At most 25 labels in first-seen order.
+        """
+        stripped = [
+            (entry.get("text") or "").strip()
+            for entry in links[:200]
+            if isinstance(entry, dict)
+        ]
+
+        # Dicts keep insertion order, so setdefault preserves the first spelling.
+        first_seen: Dict[str, str] = {}
+        for label in stripped:
+            if label and len(label) <= 50:
+                first_seen.setdefault(label.lower(), label)
+        return list(first_seen.values())[:25]
+
+    def _extract_cta_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Detect call-to-action phrases in the page text and in link texts.
+
+        Matching is a case-insensitive substring test against a fixed phrase
+        list; only the first 200 links are inspected.
+
+        Args:
+            main_content: Page text; ``None`` or empty is treated as no text.
+            links: Link dicts with a ``text`` key; non-dict entries are skipped.
+
+        Returns:
+            ``keyword_hits`` (phrases found in the text) and ``link_cta_hits``
+            (phrases found in any link text), both in phrase-list order.
+        """
+        keywords = ["get started", "start", "book", "demo", "trial", "pricing", "contact", "signup", "sign up", "subscribe"]
+        haystack = (main_content or "").lower()
+
+        stripped_texts = (
+            (entry.get("text") or "").strip()
+            for entry in links[:200]
+            if isinstance(entry, dict)
+        )
+        anchor_texts = [text.lower() for text in stripped_texts if text]
+
+        in_text = [phrase for phrase in keywords if phrase in haystack]
+        in_links = []
+        for phrase in keywords:
+            if any(phrase in anchor for anchor in anchor_texts):
+                in_links.append(phrase)
+
+        return {
+            "keyword_hits": in_text[:10],
+            "link_cta_hits": list(dict.fromkeys(in_links))[:10]
+        }
+
+    def _extract_proof_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Detect social-proof phrases in the page text and proof-related link targets.
+
+        Text matching is a case-insensitive substring test. Link hrefs are
+        compared with ``-`` and ``_`` removed, so ``/case-study`` and
+        ``/case_study`` both match; only the first 200 links are inspected.
+
+        Args:
+            main_content: Page text; ``None`` or empty is treated as no text.
+            links: Link dicts with an ``href`` key; non-dict entries are skipped.
+
+        Returns:
+            ``keyword_hits`` (phrases found in the text) and ``supporting_links``
+            (up to 10 lower-cased hrefs that look like proof pages).
+        """
+        proof_keywords = ["case study", "testimonials", "customers", "trusted by", "reviews", "awards", "partners"]
+        # Space-free forms of the three phrases that are also checked against hrefs.
+        href_needles = [phrase.replace(" ", "") for phrase in ["case study", "testimonials", "customers"]]
+
+        haystack = (main_content or "").lower()
+        text_hits = [phrase for phrase in proof_keywords if phrase in haystack]
+
+        proof_links = []
+        for entry in links[:200]:
+            if isinstance(entry, dict):
+                target = (entry.get("href") or "").lower()
+                squashed = target.replace("-", "").replace("_", "")
+                if any(needle in squashed for needle in href_needles):
+                    proof_links.append(target)
+
+        return {
+            "keyword_hits": text_hits[:10],
+            "supporting_links": proof_links[:10]
+        }
+
+    async def _analyze_competitor_with_ai(
+        self,
+        *,
+        user_id: str,
+        baseline: Dict[str, Any],
+        competitor_input: Dict[str, Any],
+        extraction: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        Ask the LLM to analyze one competitor against the user's baseline.
+
+        Args:
+            user_id: Forwarded to ``llm_text_gen``.
+            baseline: User baseline, embedded in the prompt as JSON.
+            competitor_input: Normalized competitor dict, embedded in the prompt.
+            extraction: Artifact from ``_build_extraction_artifact``.
+
+        Returns:
+            The parsed AI response when it is a JSON object. Otherwise a status
+            dict: ``skipped`` when the crawl did not succeed, ``failed`` when
+            the call raised or the response was not a JSON object.
+        """
+        crawl_ok = isinstance(extraction, dict) and extraction.get("fetch_status", {}).get("status") == "ok"
+        if not crawl_ok:
+            return {"status": "skipped", "reason": "crawl_failed"}
+
+        # Response shape requested from the model.
+        json_struct = {
+            "positioning": {
+                "value_prop": "string",
+                "target_audience": "string",
+                "market_tier": "string",
+                "primary_offer": "string"
+            },
+            "content_strategy": {
+                "themes": ["string"],
+                "messaging_angles": ["string"],
+                "cta_patterns": ["string"],
+                "tone_markers": ["string"]
+            },
+            "competitive_advantages": ["string"],
+            "weaknesses_or_risks": ["string"],
+            "comparison_to_user_baseline": {
+                "overlaps": ["string"],
+                "deltas": ["string"],
+                "opportunities": ["string"]
+            },
+            "confidence": {
+                "overall": "number",
+                "notes": ["string"]
+            }
+        }
+
+        prompt_parts = [
+            "You are a competitive intelligence analyst.\n",
+            "Analyze the competitor homepage extraction and compare it to the user's Step 2 baseline insights.\n",
+            "Return strictly the requested JSON.\n\n",
+            f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n",
+            f"Competitor input: {json.dumps(competitor_input, ensure_ascii=False)}\n\n",
+            f"Homepage extraction: {json.dumps(extraction, ensure_ascii=False)}\n",
+        ]
+        prompt = "".join(prompt_parts)
+
+        try:
+            response = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
+            parsed = self._safe_json_parse(response)
+        except Exception as e:
+            logger.warning(f"AI competitor analysis failed for {competitor_input.get('domain')}: {e}")
+            return {"status": "failed", "error": str(e)}
+
+        if not isinstance(parsed, dict):
+            return {"status": "failed", "error": "invalid_ai_json"}
+        return parsed
+
+    async def _aggregate_with_ai(
+        self,
+        *,
+        user_id: str,
+        baseline: Dict[str, Any],
+        competitors: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Ask the LLM for an aggregated market view across all analyzed competitors.
+
+        Args:
+            user_id: Forwarded to ``llm_text_gen``.
+            baseline: User baseline, embedded in the prompt as JSON.
+            competitors: Per-competitor outputs carrying ``input`` and
+                ``ai_analysis`` dicts; entries lacking either are left out.
+
+        Returns:
+            The parsed AI response when it is a JSON object; otherwise a dict
+            whose ``warnings`` list explains the failure.
+        """
+        # Response shape requested from the model.
+        json_struct = {
+            "market_map": {
+                "clusters": [
+                    {
+                        "cluster_name": "string",
+                        "description": "string",
+                        "competitors": ["string"]
+                    }
+                ]
+            },
+            "common_patterns": {
+                "common_themes": ["string"],
+                "common_ctas": ["string"],
+                "common_proof_signals": ["string"]
+            },
+            "content_gaps_and_opportunities": [
+                {
+                    "gap": "string",
+                    "why_it_matters": "string",
+                    "recommended_content_types": ["string"],
+                    "impact": "string",
+                    "effort": "string"
+                }
+            ],
+            "strategic_recommendations": [
+                {
+                    "action": "string",
+                    "expected_impact": "string",
+                    "effort": "string",
+                    "first_steps": ["string"]
+                }
+            ],
+            "warnings": ["string"]
+        }
+
+        # Keep the prompt small: send only identity plus the AI analysis per competitor.
+        compact = [
+            {
+                "domain": item.get("input").get("domain"),
+                "name": item.get("input").get("name"),
+                "ai_analysis": item.get("ai_analysis")
+            }
+            for item in competitors
+            if isinstance(item, dict)
+            and isinstance(item.get("input"), dict)
+            and isinstance(item.get("ai_analysis"), dict)
+        ]
+
+        prompt_parts = [
+            "You are a senior strategy consultant.\n",
+            "Using the user's Step 2 baseline insights and per-competitor analyses, produce an aggregated market view.\n",
+            "Return strictly the requested JSON.\n\n",
+            f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n",
+            f"Per-competitor analyses: {json.dumps(compact, ensure_ascii=False)}\n",
+        ]
+        prompt = "".join(prompt_parts)
+
+        try:
+            response = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
+            parsed = self._safe_json_parse(response)
+        except Exception as e:
+            logger.warning(f"AI aggregation failed: {e}")
+            return {"warnings": [str(e)]}
+
+        if not isinstance(parsed, dict):
+            return {"warnings": ["invalid_ai_json"]}
+        return parsed
+
+    def _safe_json_parse(self, text: str) -> Any:
+        """
+        Parse JSON out of an LLM response without raising.
+
+        A surrounding Markdown code fence (three backticks, optionally followed
+        by ``json``) is stripped first. If the remaining text is not valid JSON,
+        the span from the first ``{`` to the last ``}`` is tried as a fallback.
+
+        Args:
+            text: Raw response text. A dict or list is returned unchanged, in
+                case the LLM helper already produced structured data.
+
+        Returns:
+            The decoded JSON value, or ``None`` when nothing parseable is found
+            or the input is of another type.
+        """
+        if isinstance(text, (dict, list)):
+            return text
+        if not isinstance(text, str):
+            return None
+
+        # Single backslashes: in a raw string "\\s" would demand a literal backslash.
+        cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip(), flags=re.IGNORECASE)
+        cleaned = re.sub(r"\s*```$", "", cleaned).strip()
+
+        try:
+            return json.loads(cleaned)
+        except (ValueError, RecursionError):
+            pass
+
+        # Fallback: the model wrapped the object in prose.
+        match = re.search(r"\{[\s\S]*\}", cleaned)
+        if match is None:
+            return None
+        try:
+            return json.loads(match.group(0))
+        except (ValueError, RecursionError):
+            return None
+