Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts

2026-02-08 13:56:57 +05:30
parent 1db10ccd0f
commit e404a86502
333 changed files with 42223 additions and 10875 deletions
--- a/backend/services/research/deep_competitor_analysis.py
+++ b/backend/services/research/deep_competitor_analysis.py
@@ -0,0 +1,603 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import re
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Tuple
+from urllib.parse import urlparse
+
+from services.component_logic.web_crawler_logic import WebCrawlerLogic
+from services.llm_providers.main_text_generation import llm_text_gen
+from services.ai_service_manager import AIServiceManager, AIServiceType
+from services.seo_tools.sitemap_service import SitemapService
+from services.seo.advertools_service import AdvertoolsService
+from utils.logger_utils import get_service_logger
+
+logger = get_service_logger("deep_competitor_analysis")
+
+
+class DeepCompetitorAnalysisService:
+    def __init__(self):
+        """Create the collaborators used by the analysis pipelines."""
+        # Crawler used by `_crawl_competitors` to fetch competitor pages.
+        self.crawler = WebCrawlerLogic()
+        # Advertools wrapper used by `generate_weekly_strategy_brief` for
+        # sitemap analysis and content audits.
+        self.advertools = AdvertoolsService()
+
+    async def run(
+        self,
+        *,
+        user_id: str,
+        website_analysis: Dict[str, Any],
+        competitors: List[Dict[str, Any]],
+        max_competitors: int = 25,
+        crawl_concurrency: int = 4
+    ) -> Dict[str, Any]:
+        """
+        Run the full deep competitor analysis pipeline.
+
+        Steps: build the user baseline, normalize/deduplicate the competitor
+        list, crawl the competitors concurrently, analyze each crawl result
+        with the LLM one at a time, then ask the LLM for an aggregated view.
+
+        Args:
+            user_id: Forwarded to the AI helper calls.
+            website_analysis: The user's own website analysis.
+            competitors: Raw competitor entries (see `_normalize_competitors`).
+            max_competitors: Cap applied while normalizing competitors.
+            crawl_concurrency: Maximum number of simultaneous crawls.
+
+        Returns:
+            Dict with "baseline", "competitors", "aggregation" and "metadata".
+        """
+        user_baseline = self._build_baseline(website_analysis)
+        targets = self._normalize_competitors(competitors, max_competitors=max_competitors)
+        crawled = await self._crawl_competitors(targets, crawl_concurrency=crawl_concurrency)
+
+        # Per-competitor AI analysis happens sequentially, in crawl order.
+        analyzed: List[Dict[str, Any]] = []
+        for target, page in crawled:
+            artifact = self._build_extraction_artifact(target, page)
+            verdict = await self._analyze_competitor_with_ai(
+                user_id=user_id,
+                baseline=user_baseline,
+                competitor_input=target,
+                extraction=artifact
+            )
+            analyzed.append({"input": target, "extraction": artifact, "ai_analysis": verdict})
+
+        market_view = await self._aggregate_with_ai(
+            user_id=user_id,
+            baseline=user_baseline,
+            competitors=analyzed
+        )
+
+        run_metadata = {
+            "generated_at": datetime.utcnow().isoformat(),
+            "competitors_requested": len(targets),
+            "competitors_analyzed": len(analyzed),
+            "crawl_concurrency": crawl_concurrency
+        }
+        return {
+            "baseline": user_baseline,
+            "competitors": analyzed,
+            "aggregation": market_view,
+            "metadata": run_metadata
+        }
+
+    async def generate_weekly_strategy_brief(
+        self,
+        *,
+        user_id: str,
+        website_analysis: Dict[str, Any],
+        competitors: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Generate a weekly strategic intelligence brief by analyzing
+        recent competitor changes and market shifts.
+
+        For up to 10 normalized competitors the sitemap is discovered and
+        analyzed (SitemapService + Advertools); a competitor whose processing
+        raises is logged and left out of the brief. The collected signals are
+        aggregated and handed to the AI service manager.
+
+        Args:
+            user_id: Forwarded to the sitemap and AI services.
+            website_analysis: The user's own website analysis.
+            competitors: Raw competitor entries (see `_normalize_competitors`).
+
+        Returns:
+            Dict with "week_commencing", "generated_at", "metrics",
+            "insights" and "raw_data".
+        """
+        sitemap_service = SitemapService()
+        ai_manager = AIServiceManager()
+
+        # Stage 1: Data Collection (User + Competitors)
+        baseline = self._build_baseline(website_analysis)
+        normalized_competitors = self._normalize_competitors(competitors, max_competitors=10)
+
+        # Fetch competitor sitemaps for recent changes
+        competitor_changes = []
+        seven_days_ago = datetime.utcnow() - timedelta(days=7)
+        ninety_days_ago = datetime.utcnow() - timedelta(days=90)
+
+        for comp in normalized_competitors:
+            try:
+                # Stage 1: Advertools Deep Intelligence
+                # Discover exact sitemap URL first (essential for Advertools)
+                discovered_sitemap = await sitemap_service.discover_sitemap_url(comp['url'])
+                effective_url = discovered_sitemap if discovered_sitemap else comp['url']
+
+                adv_result = await self.advertools.analyze_sitemap(effective_url)
+
+                # REUSE: Use existing SitemapService.analyze_sitemap for robust Stage 1 & 2
+                analysis_result = await sitemap_service.analyze_sitemap(
+                    sitemap_url=effective_url,
+                    analyze_content_trends=True,
+                    analyze_publishing_patterns=True,
+                    include_ai_insights=False,
+                    user_id=user_id
+                )
+
+                if analysis_result and analysis_result.get('urls'):
+                    urls = analysis_result['urls']
+                    structure = analysis_result.get('structure_analysis', {})
+
+                    # Enhancement 1: Keyword Clustering (NLP from URLs) - REUSE from SitemapService
+                    keyword_clusters = structure.get('keyword_clusters', {})
+
+                    # Enhancement 2: Strategic Pillar Mapping - REUSE from SitemapService
+                    pillars = structure.get('strategic_pillars', {})
+
+                    # Enhancement 3: Advertools Site Hierarchy (from folders)
+                    site_hierarchy = adv_result.get('metrics', {}).get('top_pillars', {}) if adv_result.get('success') else {}
+
+                    # Enhancement 4: Content Cadence Trend (Last 7 days vs 90 days)
+                    recent_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), seven_days_ago)]
+                    historical_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), ninety_days_ago)]
+
+                    # Velocities are pages per day; the 0.01 floor avoids a
+                    # division by zero when nothing changed in 90 days.
+                    recent_velocity = len(recent_urls) / 7
+                    historical_velocity = len(historical_urls) / 90
+                    cadence_shift = ((recent_velocity - historical_velocity) / max(historical_velocity, 0.01)) * 100
+
+                    # Advertools Word Frequency (Audit top 5 recent URLs)
+                    top_themes = []
+                    if recent_urls:
+                        audit_urls = [u['loc'] for u in recent_urls[:5]]
+                        # Use thread-safe audit_content from AdvertoolsService
+                        audit_result = await self.advertools.audit_content(audit_urls)
+                        if audit_result.get('success'):
+                            top_themes = audit_result.get('themes', [])
+
+                    competitor_changes.append({
+                        "domain": comp['domain'],
+                        "name": comp['name'],
+                        "new_content_count": len(recent_urls),
+                        "recent_topics": [self._extract_topic_from_url(u['loc']) for u in recent_urls[:10]],
+                        "total_pages": len(urls),
+                        "keyword_clusters": keyword_clusters,
+                        "strategic_pillars": pillars,
+                        "site_hierarchy": site_hierarchy,
+                        "top_themes": top_themes,
+                        "cadence_shift_percent": round(cadence_shift, 1),
+                        "publishing_velocity": round(recent_velocity, 2),
+                        "stale_content_pct": adv_result.get('metrics', {}).get('stale_content_percentage', 0) if adv_result.get('success') else 0
+                    })
+            except Exception as e:
+                # Best effort: one failing competitor must not abort the brief.
+                logger.warning(f"Failed to fetch sitemap for {comp['domain']}: {e}")
+
+        # Stage 2: Differential Analysis (Non-AI Aggregation)
+        avg_competitor_velocity = sum(c['publishing_velocity'] for c in competitor_changes) / len(competitor_changes) if competitor_changes else 0
+        market_clusters = self._aggregate_clusters([c['keyword_clusters'] for c in competitor_changes])
+
+        # Stage 3: AI Strategic Intelligence
+        # Extract rich user context from baseline
+        brand_analysis = baseline.get("brand_analysis", {})
+        seo_audit = baseline.get("seo_audit", {})
+
+        user_niche = brand_analysis.get("industry") or "General Business"
+        user_topics = brand_analysis.get("topics") or []
+        if not user_topics and seo_audit.get("keywords"):
+             user_topics = seo_audit.get("keywords")[:5]
+
+        analysis_context = {
+            "user_profile": {
+                "website_url": baseline.get("website_url"),
+                "industry": user_niche,
+                "niche_description": brand_analysis.get("description") or brand_analysis.get("summary") or "",
+                "core_topics": user_topics,
+                # NOTE(review): `_build_baseline` does not populate
+                # "target_audience", so this currently always falls back to {}.
+                "target_audience": baseline.get("target_audience") or {},
+                "business_objectives": brand_analysis.get("objectives") or "Growth",
+                "brand_voice": brand_analysis.get("voice") or "Professional",
+                "augmented_themes": brand_analysis.get("augmented_themes", []) # Added from Advertools
+            },
+            "market_intelligence": {
+                "market_clusters": market_clusters,
+                "competitors_analyzed_count": len(competitor_changes),
+                "market_opportunities_detected": ["Content Velocity Gap", "Topic Authority Shift", "Stale Content Replacement"],
+                "competitor_hierarchies": {c['name']: c['site_hierarchy'] for c in competitor_changes},
+                "competitor_content_themes": {c['name']: c['top_themes'] for c in competitor_changes}
+            },
+            "competitive_landscape_detailed": competitor_changes,
+        }
+
+        # Call AI for strategic intelligence
+        strategic_intelligence = await ai_manager.generate_strategic_intelligence(analysis_context, user_id=user_id)
+        content_gaps = await ai_manager.generate_content_gap_analysis(analysis_context, user_id=user_id)
+
+        # Pull the insight list out defensively: a successful response may
+        # carry a missing/None "data" payload or an empty insight list, and
+        # indexing [0] on an empty list used to raise IndexError here.
+        strategic_insights: List[Any] = []
+        if strategic_intelligence.get("success"):
+            candidate_insights = (strategic_intelligence.get("data") or {}).get("strategic_insights")
+            if isinstance(candidate_insights, list):
+                strategic_insights = candidate_insights
+
+        content_recommendations: Any = []
+        if content_gaps.get("success"):
+            content_recommendations = (content_gaps.get("data") or {}).get("content_recommendations", [])
+
+        # Stage 4: Result Assembly
+        report = {
+            "week_commencing": seven_days_ago.date().isoformat(),
+            "generated_at": datetime.utcnow().isoformat(),
+            "metrics": {
+                "market_velocity": round(avg_competitor_velocity, 2),
+                "market_clusters": market_clusters[:5],
+                "aggressive_competitors": [c['name'] for c in competitor_changes if c['cadence_shift_percent'] > 50]
+            },
+            "insights": {
+                # First insight is the headline; the remainder are alerts.
+                "the_big_move": strategic_insights[0] if strategic_insights else {},
+                "low_hanging_fruit": content_recommendations,
+                "threat_alerts": strategic_insights[1:]
+            },
+            "raw_data": {
+                "competitor_changes": competitor_changes
+            }
+        }
+
+        return report
+
+    def _is_newer_than(self, lastmod: Optional[str], threshold: datetime) -> bool:
+        """
+        Return True when the ISO-8601 `lastmod` timestamp is later than `threshold`.
+
+        Offset-aware timestamps are converted to UTC before the comparison
+        (callers pass a naive UTC threshold), instead of simply discarding the
+        offset. Naive timestamps are compared as-is. Missing, non-string or
+        unparseable values return False.
+        """
+        # Local import: the module header only imports datetime/timedelta.
+        from datetime import timezone
+
+        if not lastmod or not isinstance(lastmod, str):
+            return False
+        try:
+            # A trailing "Z" is not accepted by older fromisoformat versions.
+            parsed = datetime.fromisoformat(lastmod.strip().replace('Z', '+00:00'))
+        except ValueError:
+            return False
+
+        if parsed.tzinfo is not None:
+            parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
+        if threshold.tzinfo is not None:
+            threshold = threshold.astimezone(timezone.utc).replace(tzinfo=None)
+        return parsed > threshold
+
+    def _aggregate_clusters(self, clusters_list: List[Dict[str, int]]) -> List[str]:
+        """
+        Aggregate clusters across competitors to find market-wide themes.
+
+        Each cluster key is counted once per competitor that has it (the
+        per-competitor values are ignored). Entries that are not dicts
+        (e.g. None from a sitemap analysis without clusters) are skipped.
+
+        Returns:
+            Up to 10 keys, most widely shared first; ties keep first-seen order.
+        """
+        occurrences: Dict[str, int] = {}
+        for cluster in clusters_list or []:
+            if not isinstance(cluster, dict):
+                continue
+            for keyword in cluster:
+                occurrences[keyword] = occurrences.get(keyword, 0) + 1  # Count competitor occurrences
+        ranked = sorted(occurrences, key=occurrences.get, reverse=True)
+        return ranked[:10]
+
+    def _extract_topic_from_url(self, url: str) -> str:
+        """
+        Helper to get a readable topic from a URL slug.
+
+        Takes the last path segment, turns "-" and "_" into spaces and
+        capitalizes it. Returns "New Content" when the URL is not a string,
+        cannot be parsed, or has no usable slug (e.g. a site root).
+        """
+        fallback = "New Content"
+        if not isinstance(url, str):
+            return fallback
+        try:
+            path = urlparse(url).path
+        except ValueError:
+            # urlparse rejects malformed authorities such as "http://[bad".
+            return fallback
+        slug = path.strip('/').split('/')[-1]
+        topic = slug.replace('-', ' ').replace('_', ' ').strip().capitalize()
+        return topic or fallback
+
+    def _build_baseline(self, website_analysis: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Reduce the user's website analysis to the sections used as baseline.
+
+        A non-dict input is treated as empty. "website_url" is copied as-is
+        (None when absent); every other section falls back to an empty dict
+        when missing or falsy.
+        """
+        source = website_analysis if isinstance(website_analysis, dict) else {}
+        dict_sections = (
+            "brand_analysis",
+            "content_strategy_insights",
+            "seo_audit",
+            "style_guidelines",
+            "style_patterns",
+        )
+
+        snapshot: Dict[str, Any] = {"website_url": source.get("website_url")}
+        for section in dict_sections:
+            snapshot[section] = source.get(section) or {}
+        return snapshot
+
+    def _normalize_competitors(self, competitors: List[Dict[str, Any]], *, max_competitors: int) -> List[Dict[str, Any]]:
+        """
+        Clean up the raw competitor list.
+
+        Non-dict entries, entries without a usable URL and repeated domains
+        are dropped. Each kept entry is reduced to "url", "domain", "name"
+        and "summary". Collection stops once `max_competitors` entries have
+        been gathered (the cap is checked after each addition).
+        """
+        if not isinstance(competitors, list):
+            return []
+
+        result: List[Dict[str, Any]] = []
+        known_domains = set()
+
+        for entry in competitors:
+            if not isinstance(entry, dict):
+                continue
+
+            # First non-empty of url / website_url / domain wins.
+            candidate = entry.get("url") or entry.get("website_url") or entry.get("domain") or ""
+            site_url = self._normalize_url(candidate)
+            host = self._extract_domain(site_url) if site_url else None
+            if not host or host in known_domains:
+                continue
+
+            known_domains.add(host)
+            display_name = entry.get("name") or entry.get("title") or host
+            blurb = entry.get("summary") or entry.get("description") or ""
+            result.append({
+                "url": site_url,
+                "domain": host,
+                "name": display_name,
+                "summary": blurb
+            })
+
+            if len(result) >= max_competitors:
+                break
+
+        return result
+
+    def _normalize_url(self, raw: str) -> Optional[str]:
+        """
+        Reduce a competitor URL/domain to its "scheme://netloc" origin.
+
+        Bare domains get an "https://" prefix. The scheme check is
+        case-insensitive (so "HTTPS://x.com" is not double-prefixed),
+        protocol-relative "//host" inputs are treated as https, and inputs
+        that carry a non-HTTP scheme (e.g. "ftp://") are rejected.
+
+        Returns:
+            The origin string, or None for empty, non-string or unparseable input.
+        """
+        if not raw or not isinstance(raw, str):
+            return None
+
+        candidate = raw.strip()
+        if not candidate:
+            return None
+
+        lowered = candidate.lower()
+        if candidate.startswith("//"):
+            candidate = "https:" + candidate
+        elif not lowered.startswith(("http://", "https://")):
+            # Anything that already starts with "<scheme>://" is a non-HTTP
+            # URL; prefixing it would yield a bogus host such as "ftp:".
+            if re.match(r"[a-z][a-z0-9+.\-]*://", lowered):
+                return None
+            candidate = "https://" + candidate
+
+        try:
+            parsed = urlparse(candidate)
+        except ValueError:
+            return None
+        if not parsed.scheme or not parsed.netloc:
+            return None
+        return f"{parsed.scheme}://{parsed.netloc}"
+
+    def _extract_domain(self, url: str) -> Optional[str]:
+        """
+        Return the lower-cased network location of `url` without a leading "www.".
+
+        Returns None when the URL has no network location or parsing fails.
+        """
+        try:
+            host = (urlparse(url).netloc or "").lower()
+            bare = host[4:] if host.startswith("www.") else host
+            return bare if bare else None
+        except Exception:
+            return None
+
+    async def _crawl_competitors(
+        self,
+        competitors: List[Dict[str, Any]],
+        *,
+        crawl_concurrency: int
+    ) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
+        """
+        Crawl every competitor with at most `crawl_concurrency` crawls in flight.
+
+        Returns one `(competitor, crawl_result)` pair per input, in input
+        order. A missing URL or a crawler exception is reported as
+        `{"success": False, "error": ...}` instead of being raised.
+        """
+        # At least one slot, even when the caller passes 0 or a negative value.
+        limiter = asyncio.Semaphore(max(1, int(crawl_concurrency)))
+
+        async def _fetch(target: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+            async with limiter:
+                target_url = target.get("url")
+                if target_url:
+                    try:
+                        outcome = await self.crawler.crawl_website(target_url)
+                    except Exception as e:
+                        outcome = {"success": False, "error": str(e)}
+                else:
+                    outcome = {"success": False, "error": "missing_url"}
+                return target, outcome
+
+        return await asyncio.gather(*(_fetch(target) for target in competitors))
+
+    def _build_extraction_artifact(self, competitor_input: Dict[str, Any], crawl_result: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Condense a crawl result into the compact artifact sent to the LLM.
+
+        A failed or malformed crawl yields only a "fetch_status" entry with
+        status "failed". Otherwise the artifact carries page metadata, up to
+        25 string headings, navigation labels, CTA/proof signals and the
+        first 2000 characters of the stripped main content.
+        """
+        crawl_ok = isinstance(crawl_result, dict) and crawl_result.get("success")
+        if not crawl_ok:
+            failure_reason = crawl_result.get("error") if isinstance(crawl_result, dict) else "unknown_error"
+            return {"fetch_status": {"status": "failed", "error": failure_reason}}
+
+        def typed(container: Dict[str, Any], key: str, expected: type) -> Any:
+            # Value under `key` when it has the expected type, else an empty one.
+            value = container.get(key)
+            return value if isinstance(value, expected) else expected()
+
+        page = typed(crawl_result, "content", dict)
+        link_items = typed(page, "links", list)
+        open_graph = typed(page, "meta_tags", dict)
+        body_text = page.get("main_content") or ""
+
+        nav_labels = self._extract_nav_labels(link_items)
+        heading_texts = [h for h in typed(page, "headings", list) if isinstance(h, str)][:25]
+        cta_signals = self._extract_cta_signals(body_text, link_items)
+        proof_signals = self._extract_proof_signals(body_text, link_items)
+
+        fetch_status = {
+            "status": "ok",
+            "fetched_url": crawl_result.get("url"),
+            "timestamp": crawl_result.get("timestamp")
+        }
+        page_meta = {
+            "title": page.get("title") or "",
+            "meta_description": page.get("description") or "",
+            "og_title": open_graph.get("og:title"),
+            "og_description": open_graph.get("og:description")
+        }
+        structure = {
+            "headings": heading_texts,
+            "nav_labels": nav_labels,
+            "content_structure": typed(page, "content_structure", dict)
+        }
+
+        return {
+            "fetch_status": fetch_status,
+            "page_meta": page_meta,
+            "structure": structure,
+            "signals": {
+                "cta_signals": cta_signals,
+                "proof_signals": proof_signals
+            },
+            # Slicing is a no-op for content shorter than the 2000-char cap.
+            "content_excerpt": body_text.strip()[:2000]
+        }
+
+    def _extract_nav_labels(self, links: List[Dict[str, Any]]) -> List[str]:
+        """
+        Collect up to 25 distinct link texts from the first 200 links.
+
+        Texts are stripped; empty texts and texts longer than 50 characters
+        are ignored. Duplicates are detected case-insensitively and the
+        first spelling encountered is the one kept.
+        """
+        candidates: List[str] = []
+        for item in links[:200]:
+            if isinstance(item, dict):
+                label = (item.get("text") or "").strip()
+                if label and len(label) <= 50:
+                    candidates.append(label)
+
+        # dicts keep insertion order, so setdefault keeps the first spelling.
+        first_spelling: Dict[str, str] = {}
+        for label in candidates:
+            first_spelling.setdefault(label.lower(), label)
+        return list(first_spelling.values())[:25]
+
+    def _extract_cta_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Detect call-to-action phrases in the page text and in link texts.
+
+        Matching is a case-insensitive substring test against a fixed phrase
+        list; only the first 200 links are inspected. Both result lists keep
+        the phrase-list order and hold at most 10 entries.
+        """
+        haystack = (main_content or "").lower()
+        cta_terms = ("get started", "start", "book", "demo", "trial", "pricing", "contact", "signup", "sign up", "subscribe")
+        in_body = [term for term in cta_terms if term in haystack]
+
+        anchor_texts: List[str] = []
+        for item in links[:200]:
+            if not isinstance(item, dict):
+                continue
+            label = (item.get("text") or "").strip()
+            if label:
+                anchor_texts.append(label.lower())
+
+        # The phrase list has no duplicates, so neither does this result.
+        in_links = [
+            term for term in cta_terms
+            if any(term in anchor for anchor in anchor_texts)
+        ]
+        return {
+            "keyword_hits": in_body[:10],
+            "link_cta_hits": in_links[:10]
+        }
+
+    def _extract_proof_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Detect social-proof phrases in the page text and proof-like link targets.
+
+        Text matching is a case-insensitive substring test against a fixed
+        phrase list. A link (first 200 only) counts as supporting when its
+        lower-cased href, with "-" and "_" removed, mentions a case study,
+        testimonials or customers. Both result lists hold at most 10 entries.
+        """
+        haystack = (main_content or "").lower()
+        proof_phrases = ("case study", "testimonials", "customers", "trusted by", "reviews", "awards", "partners")
+        phrase_hits = [phrase for phrase in proof_phrases if phrase in haystack]
+
+        # "case study" is compared without its space so that hrefs such as
+        # /case-study/ and /case_study/ match once separators are removed.
+        href_needles = ("casestudy", "testimonials", "customers")
+        matched_hrefs: List[str] = []
+        for item in links[:200]:
+            if not isinstance(item, dict):
+                continue
+            href = (item.get("href") or "").lower()
+            squashed = href.replace("-", "").replace("_", "")
+            if any(needle in squashed for needle in href_needles):
+                matched_hrefs.append(href)
+
+        return {
+            "keyword_hits": phrase_hits[:10],
+            "supporting_links": matched_hrefs[:10]
+        }
+
+    async def _analyze_competitor_with_ai(
+        self,
+        *,
+        user_id: str,
+        baseline: Dict[str, Any],
+        competitor_input: Dict[str, Any],
+        extraction: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        Ask the LLM to analyze one competitor against the user's baseline.
+
+        Returns the parsed JSON object on success. When the extraction does
+        not report a successful fetch the call is skipped
+        (`{"status": "skipped", ...}`); an LLM error or a response that is
+        not a JSON object yields `{"status": "failed", ...}`.
+        """
+        fetch_ok = (
+            isinstance(extraction, dict)
+            and extraction.get("fetch_status", {}).get("status") == "ok"
+        )
+        if not fetch_ok:
+            return {
+                "status": "skipped",
+                "reason": "crawl_failed"
+            }
+
+        # Shape of the JSON document the model is asked to return.
+        json_struct = {
+            "positioning": {
+                "value_prop": "string",
+                "target_audience": "string",
+                "market_tier": "string",
+                "primary_offer": "string"
+            },
+            "content_strategy": {
+                "themes": ["string"],
+                "messaging_angles": ["string"],
+                "cta_patterns": ["string"],
+                "tone_markers": ["string"]
+            },
+            "competitive_advantages": ["string"],
+            "weaknesses_or_risks": ["string"],
+            "comparison_to_user_baseline": {
+                "overlaps": ["string"],
+                "deltas": ["string"],
+                "opportunities": ["string"]
+            },
+            "confidence": {
+                "overall": "number",
+                "notes": ["string"]
+            }
+        }
+
+        # Serialization happens outside the try block, so a non-serializable
+        # input propagates to the caller instead of becoming a "failed" result.
+        prompt_parts = [
+            "You are a competitive intelligence analyst.\n",
+            "Analyze the competitor homepage extraction and compare it to the user's Step 2 baseline insights.\n",
+            "Return strictly the requested JSON.\n\n",
+            f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n",
+            f"Competitor input: {json.dumps(competitor_input, ensure_ascii=False)}\n\n",
+            f"Homepage extraction: {json.dumps(extraction, ensure_ascii=False)}\n",
+        ]
+        prompt = "".join(prompt_parts)
+
+        try:
+            response_text = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
+            decoded = self._safe_json_parse(response_text)
+        except Exception as e:
+            logger.warning(f"AI competitor analysis failed for {competitor_input.get('domain')}: {e}")
+            return {"status": "failed", "error": str(e)}
+
+        if isinstance(decoded, dict):
+            return decoded
+        return {"status": "failed", "error": "invalid_ai_json"}
+
+    async def _aggregate_with_ai(
+        self,
+        *,
+        user_id: str,
+        baseline: Dict[str, Any],
+        competitors: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Ask the LLM for a market-level view across all competitor analyses.
+
+        Only each competitor's domain, name and AI analysis are sent.
+        Returns the parsed JSON object, or `{"warnings": [...]}` when the
+        LLM call fails or its response is not a JSON object.
+        """
+        # Shape of the JSON document the model is asked to return.
+        json_struct = {
+            "market_map": {
+                "clusters": [
+                    {
+                        "cluster_name": "string",
+                        "description": "string",
+                        "competitors": ["string"]
+                    }
+                ]
+            },
+            "common_patterns": {
+                "common_themes": ["string"],
+                "common_ctas": ["string"],
+                "common_proof_signals": ["string"]
+            },
+            "content_gaps_and_opportunities": [
+                {
+                    "gap": "string",
+                    "why_it_matters": "string",
+                    "recommended_content_types": ["string"],
+                    "impact": "string",
+                    "effort": "string"
+                }
+            ],
+            "strategic_recommendations": [
+                {
+                    "action": "string",
+                    "expected_impact": "string",
+                    "effort": "string",
+                    "first_steps": ["string"]
+                }
+            ],
+            "warnings": ["string"]
+        }
+
+        # Keep the prompt small: drop the extraction artifacts and skip
+        # entries that lack a dict "input" or a dict "ai_analysis".
+        digest = []
+        for entry in competitors:
+            if not isinstance(entry, dict):
+                continue
+            source = entry.get("input")
+            analysis = entry.get("ai_analysis")
+            if not (isinstance(source, dict) and isinstance(analysis, dict)):
+                continue
+            digest.append({
+                "domain": source.get("domain"),
+                "name": source.get("name"),
+                "ai_analysis": analysis
+            })
+
+        prompt_parts = [
+            "You are a senior strategy consultant.\n",
+            "Using the user's Step 2 baseline insights and per-competitor analyses, produce an aggregated market view.\n",
+            "Return strictly the requested JSON.\n\n",
+            f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n",
+            f"Per-competitor analyses: {json.dumps(digest, ensure_ascii=False)}\n",
+        ]
+        prompt = "".join(prompt_parts)
+
+        try:
+            response_text = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
+            decoded = self._safe_json_parse(response_text)
+        except Exception as e:
+            logger.warning(f"AI aggregation failed: {e}")
+            return {"warnings": [str(e)]}
+
+        if isinstance(decoded, dict):
+            return decoded
+        return {"warnings": ["invalid_ai_json"]}
+
+    def _safe_json_parse(self, text: str) -> Any:
+        """
+        Parse JSON out of an LLM response, tolerating common wrappers.
+
+        Handles a surrounding Markdown code fence (``` or ```json) and, as a
+        fallback, JSON objects embedded in surrounding prose.
+
+        Returns:
+            The decoded value, or None when `text` is not a string or no
+            valid JSON can be recovered.
+        """
+        if not isinstance(text, str):
+            return None
+        cleaned = text.strip()
+        # Single backslashes in the raw strings: the previous patterns were
+        # double-escaped and matched literal backslashes, so fences were
+        # never stripped and the fallback below never matched.
+        cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned, flags=re.IGNORECASE)
+        cleaned = re.sub(r"\s*```$", "", cleaned)
+        cleaned = cleaned.strip()
+        try:
+            return json.loads(cleaned)
+        except ValueError:
+            pass
+
+        # Fall back to the outermost {...} span when the model wrapped the
+        # JSON object in explanatory text.
+        match = re.search(r"\{[\s\S]*\}", cleaned)
+        if not match:
+            return None
+        try:
+            return json.loads(match.group(0))
+        except ValueError:
+            return None
+
--- a/backend/services/research/deep_crawl_service.py
+++ b/backend/services/research/deep_crawl_service.py
@@ -0,0 +1,270 @@
+"""
+Deep Crawl Service for Onboarding Step 3
+Handles deep crawling of user's website, combining Sitemap and Tavily data.
+"""
+
+import os
+import asyncio
+import httpx
+from typing import Dict, List, Any, Optional
+from datetime import datetime
+from loguru import logger
+from urllib.parse import urlparse
+
+from services.seo_tools.sitemap_service import SitemapService
+from services.research.tavily_service import TavilyService
+from services.database import get_session_for_user
+from models.crawled_content import EndUserWebsiteContent
+from models.website_analysis_monitoring_models import DeepWebsiteCrawlTask, DeepWebsiteCrawlExecutionLog
+
+class DeepCrawlService:
+    def __init__(self):
+        """Create the sitemap and Tavily service clients used by the crawl."""
+        self.sitemap_service, self.tavily_service = SitemapService(), TavilyService()
+
+    async def execute_deep_crawl(self, user_id: str, website_url: str, task_id: Optional[int] = None) -> Dict[str, Any]:
+        """
+        Execute deep crawl for a user's website.
+
+        1. Fetch URLs from Sitemap.
+        2. Crawl using Tavily.
+        3. Deduplicate URLs.
+        4. Check liveness (status code).
+        5. Save content to DB and File.
+
+        When ``task_id`` is given, an execution log row is written and the task's
+        bookkeeping fields are updated on both success and failure.
+
+        Returns a dict with ``success``, the URL counts and ``processed_urls``.
+        Raises whatever aborted the crawl (after recording it when ``task_id`` is given).
+        """
+        logger.info(f"Starting deep crawl for {website_url} (User: {user_id})")
+
+        execution_start = datetime.utcnow()
+        db = get_session_for_user(user_id)
+        if not db:
+            raise Exception("Database connection failed")
+
+        try:
+            # 1. Sitemap Discovery (best effort: a failure here only means fewer URLs)
+            sitemap_urls = set()
+            try:
+                sitemap_url = await self.sitemap_service.discover_sitemap_url(website_url)
+                if not sitemap_url:
+                    sitemap_url = f"{website_url.rstrip('/')}/sitemap.xml"
+                # We use analyze_sitemap directly to get raw URLs
+                sitemap_data = await self.sitemap_service.analyze_sitemap(sitemap_url)
+                for url_entry in sitemap_data.get("urls", []):
+                    if isinstance(url_entry, dict) and "loc" in url_entry:
+                        sitemap_urls.add(url_entry["loc"])
+
+                logger.info(f"Found {len(sitemap_urls)} URLs from sitemap")
+            except Exception as e:
+                logger.warning(f"Sitemap analysis failed: {e}")
+
+            # 2. Tavily Crawl (best effort as well)
+            tavily_urls = set()
+            tavily_results = []
+            try:
+                # Use intelligent instructions
+                instructions = "Find all blog posts, articles, and main content pages. Ignore login, signup, and admin pages."
+                crawl_result = await self.tavily_service.crawl(
+                    url=website_url,
+                    limit=50, # Limit to avoid excessive costs/time
+                    max_depth=2,
+                    extract_depth="basic",
+                    instructions=instructions
+                )
+
+                if crawl_result.get("success"):
+                    for res in crawl_result.get("results", []):
+                        url = res.get("url")
+                        if url:
+                            tavily_urls.add(url)
+                            tavily_results.append(res)
+
+                logger.info(f"Found {len(tavily_urls)} URLs from Tavily")
+            except Exception as e:
+                logger.warning(f"Tavily crawl failed: {e}")
+
+            # 3. Merge and Deduplicate
+            unique_urls = list(sitemap_urls | tavily_urls)
+            logger.info(f"Total unique URLs to process: {len(unique_urls)}")
+
+            # 4. Process URLs (Liveness & Save)
+            success_count = 0
+
+            # Text snapshots go under the per-user workspace root (workspace/workspace_{user_id}/).
+            # NOTE: the path is relative, so it resolves against the process working directory.
+            workspace_dir = f"workspace/workspace_{user_id}/crawled_content"
+            os.makedirs(workspace_dir, exist_ok=True)
+
+            # Limit concurrent checks
+            sem = asyncio.Semaphore(10)
+
+            async def process_url(url):
+                async with sem:
+                    return await self._process_single_url(url, user_id, website_url, workspace_dir, tavily_results)
+
+            tasks = [process_url(url) for url in unique_urls]
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            processed_data = []
+
+            # Save results to DB
+            for res in results:
+                if not isinstance(res, dict):
+                    # gather(return_exceptions=True) returns the exception instead of raising it
+                    logger.warning(f"URL processing failed: {res}")
+                    continue
+                processed_data.append(res)
+                if res.get("status_code") and 200 <= res.get("status_code") < 300:
+                    success_count += 1
+                    # Save to DB (update the existing row for this URL, else insert)
+                    try:
+                        existing = db.query(EndUserWebsiteContent).filter(
+                            EndUserWebsiteContent.user_id == user_id,
+                            EndUserWebsiteContent.url == res["url"]
+                        ).first()
+                        if existing:
+                            existing.content = res.get("content")
+                            existing.title = res.get("title")
+                            existing.status_code = res.get("status_code")
+                            existing.crawled_at = datetime.utcnow()
+                        else:
+                            new_content = EndUserWebsiteContent(
+                                user_id=user_id,
+                                website_url=website_url,
+                                url=res["url"],
+                                title=res.get("title"),
+                                content=res.get("content"),
+                                status_code=res.get("status_code"),
+                                crawled_at=datetime.utcnow()
+                            )
+                            db.add(new_content)
+                    except Exception as e:
+                        logger.error(f"Failed to save content to DB for {res['url']}: {e}")
+
+            db.commit()
+
+            # 5. Update Task Log if task_id provided
+            if task_id:
+                log = DeepWebsiteCrawlExecutionLog(
+                    task_id=task_id,
+                    status="success",
+                    result_data={
+                        "total_urls": len(unique_urls),
+                        "sitemap_urls": len(sitemap_urls),
+                        "tavily_urls": len(tavily_urls),
+                        "success_count": success_count,
+                        "processed_urls": processed_data[:100] # Store only a subset to avoid huge JSON
+                    },
+                    execution_time_ms=int((datetime.utcnow() - execution_start).total_seconds() * 1000)
+                )
+                db.add(log)
+
+                # Update task
+                task = db.query(DeepWebsiteCrawlTask).filter(DeepWebsiteCrawlTask.id == task_id).first()
+                if task:
+                    task.last_executed = datetime.utcnow()
+                    task.last_success = datetime.utcnow()
+                    task.status = "active"
+                    task.consecutive_failures = 0
+
+                db.commit()
+
+            return {
+                "success": True,
+                "total_urls": len(unique_urls),
+                "sitemap_urls": len(sitemap_urls),
+                "tavily_urls": len(tavily_urls),
+                "processed_urls": processed_data
+            }
+
+        except Exception as e:
+            logger.error(f"Deep crawl failed: {e}")
+            if task_id:
+                try:
+                    # A failed flush/commit leaves the session unusable until it is rolled back
+                    db.rollback()
+                    log = DeepWebsiteCrawlExecutionLog(
+                        task_id=task_id,
+                        status="failed",
+                        error_message=str(e),
+                        execution_time_ms=int((datetime.utcnow() - execution_start).total_seconds() * 1000)
+                    )
+                    db.add(log)
+                    task = db.query(DeepWebsiteCrawlTask).filter(DeepWebsiteCrawlTask.id == task_id).first()
+                    if task:
+                        task.last_executed = datetime.utcnow()
+                        task.last_failure = datetime.utcnow()
+                        task.failure_reason = str(e)
+                        task.consecutive_failures = (task.consecutive_failures or 0) + 1
+                    db.commit()
+                except Exception as log_error:
+                    logger.error(f"Failed to record crawl failure for task {task_id}: {log_error}")
+            raise
+        finally:
+            db.close()
+
+    async def _process_single_url(self, url: str, user_id: str, website_url: str, workspace_dir: str, tavily_results: List[Dict]):
+        """
+        Check that ``url`` is live, collect its title/content and write a text snapshot.
+
+        Content comes from the matching Tavily result when there is one; otherwise the
+        body of the liveness response is reused, so each URL is fetched only once.
+        ``user_id`` and ``website_url`` are accepted but not used here.
+
+        Returns a dict with ``url``, ``status_code`` (0 when the request failed),
+        ``error``, ``title`` and ``content``.
+        """
+        import hashlib  # local import: only needed to name snapshot files
+
+        resp = None
+        error = None
+        content = None
+        title = None
+
+        # 1. Liveness Check
+        try:
+            async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
+                resp = await client.get(url)
+            status_code = resp.status_code
+        except Exception as e:
+            error = str(e)
+            status_code = 0  # Failed
+
+        # 2. Get content: prefer the Tavily result, else reuse the liveness response body
+        tavily_match = next((r for r in tavily_results if r.get("url") == url), None)
+        if tavily_match is not None:
+            content = tavily_match.get("raw_content") or tavily_match.get("content")
+            title = tavily_match.get("title")
+        elif resp is not None and 200 <= status_code < 300:
+            content = resp.text
+            # Naive title extraction: text between the first <title> and the next </title>
+            _, opened, rest = content.partition("<title>")
+            extracted, closed, _ = rest.partition("</title>")
+            if opened and closed and extracted.strip():
+                title = extracted.strip()
+
+        # 3. Save to Document. The URL digest keeps pages that share a title and are
+        # crawled within the same second from overwriting each other's snapshot.
+        if content and title:
+            safe_title = "".join(c for c in title if c.isalnum() or c in " -_").strip()[:50] or "untitled"
+            digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:8]
+            filename = f"{safe_title}_{digest}_{int(datetime.utcnow().timestamp())}.txt"
+            filepath = os.path.join(workspace_dir, filename)
+            try:
+                with open(filepath, "w", encoding="utf-8") as f:
+                    f.write(f"URL: {url}\nTitle: {title}\nDate: {datetime.utcnow()}\n\n")
+                    f.write(content)
+            except Exception as e:
+                logger.warning(f"Failed to write file for {url}: {e}")
+
+        return {
+            "url": url,
+            "status_code": status_code,
+            "error": error,
+            "title": title,
+            "content": content
+        }
--- a/backend/services/research/exa_service.py
+++ b/backend/services/research/exa_service.py
@@ -214,25 +214,71 @@ class ExaService:
            List of processed competitor data
        """
        competitors = []
-        user_domain = urlparse(user_url).netloc
+        try:
+            user_domain = urlparse(user_url).netloc
+        except Exception:
+            user_domain = ""
        
        # Extract results from the SDK response
-        results = getattr(search_result, 'results', [])
+        # Handle case where search_result might be a dict or an object
+        if isinstance(search_result, dict):
+            results = search_result.get('results', [])
+        else:
+            results = getattr(search_result, 'results', [])
        
        for result in results:
            try:
-                # Extract basic information from the result object
-                competitor_url = getattr(result, 'url', '')
-                competitor_domain = urlparse(competitor_url).netloc
+                # Helper to safely get attribute or dict key
+                def get_val(obj, key, default=None):
+                    if isinstance(obj, dict):
+                        return obj.get(key, default)
+                    return getattr(obj, key, default)
+
+                # Extract basic information
+                raw_url = get_val(result, 'url', '')
+                # Clean URL (remove backticks and whitespace that might be in the response)
+                competitor_url = raw_url.strip().strip('`').strip() if raw_url else ''
                
-                # Skip if it's the same domain as the user
-                if competitor_domain == user_domain:
+                # Fallback to ID if URL is missing/empty but ID looks like a URL
+                if not competitor_url:
+                    raw_id = get_val(result, 'id', '')
+                    cleaned_id = raw_id.strip().strip('`').strip() if raw_id else ''
+                    if cleaned_id and (cleaned_id.startswith('http://') or cleaned_id.startswith('https://')):
+                        competitor_url = cleaned_id
+                
+                if not competitor_url:
+                    continue
+
+                try:
+                    competitor_domain = urlparse(competitor_url).netloc
+                except Exception:
+                    competitor_domain = ""
+                
+                # Skip if it's the same domain as the user (fuzzy match)
+                if user_domain and competitor_domain and (user_domain in competitor_domain or competitor_domain in user_domain):
                    continue
                
                # Extract content insights
-                summary = getattr(result, 'summary', '')
-                highlights = getattr(result, 'highlights', [])
-                highlight_scores = getattr(result, 'highlight_scores', [])
+                summary = get_val(result, 'summary', '')
+                highlights = get_val(result, 'highlights', [])
+                highlight_scores = get_val(result, 'highlight_scores', [])
+                subpages = get_val(result, 'subpages', [])
+                
+                # Ensure subpages are dicts
+                processed_subpages = []
+                if subpages:
+                    for sp in subpages:
+                        if isinstance(sp, dict):
+                            processed_subpages.append(sp)
+                        elif hasattr(sp, '__dict__'):
+                            processed_subpages.append(sp.__dict__)
+                        else:
+                            processed_subpages.append({
+                                "id": getattr(sp, 'id', ''),
+                                "url": getattr(sp, 'url', ''),
+                                "title": getattr(sp, 'title', '')
+                            })
+                subpages = processed_subpages
                
                # Calculate competitive relevance score
                relevance_score = self._calculate_relevance_score(result, user_url)
@@ -240,14 +286,15 @@ class ExaService:
                competitor_data = {
                    "url": competitor_url,
                    "domain": competitor_domain,
-                    "title": getattr(result, 'title', ''),
-                    "published_date": getattr(result, 'published_date', None),
-                    "author": getattr(result, 'author', None),
-                    "favicon": getattr(result, 'favicon', None),
-                    "image": getattr(result, 'image', None),
+                    "title": get_val(result, 'title', ''),
+                    "published_date": get_val(result, 'published_date', None),
+                    "author": get_val(result, 'author', None),
+                    "favicon": get_val(result, 'favicon', None),
+                    "image": get_val(result, 'image', None),
                    "summary": summary,
                    "highlights": highlights,
                    "highlight_scores": highlight_scores,
+                    "subpages": subpages,
                    "relevance_score": relevance_score,
                    "competitive_insights": self._extract_competitive_insights(summary, highlights),
                    "content_analysis": self._analyze_content_quality(result)
@@ -439,6 +486,11 @@ class ExaService:
            
            # Log the raw Exa API response for debugging
            logger.info(f"Raw Exa social media response for {user_url}:")
+            if hasattr(result, 'to_json'):
+                 logger.info(result.to_json())
+            else:
+                 logger.info(str(result))
+
            logger.info(f"  - Request ID: {getattr(result, 'request_id', 'N/A')}")
            logger.info(f"  └─ Cost: ${getattr(getattr(result, 'cost_dollars', None), 'total', 0)}")
            # Note: Full raw response contains verbose content - logging only summary
@@ -477,9 +529,22 @@ class ExaService:
                import json
                import re
                
-                if answer_text.strip().startswith('{'):
+                logger.warning(f"Parsing Exa answer text: {answer_text[:200]}...")
+
+                # Clean markdown code blocks if present
+                clean_text = answer_text.strip()
+                if clean_text.startswith('```json'):
+                    clean_text = clean_text[7:]
+                if clean_text.startswith('```'):
+                    clean_text = clean_text[3:]
+                if clean_text.endswith('```'):
+                    clean_text = clean_text[:-3]
+                
+                clean_text = clean_text.strip()
+
+                if clean_text.startswith('{'):
                    # Direct JSON format
-                    answer_data = json.loads(answer_text.strip())
+                    answer_data = json.loads(clean_text)
                else:
                    # Parse markdown format with URLs
                    answer_data = {
--- a/backend/services/research/research_persona_scheduler.py
+++ b/backend/services/research/research_persona_scheduler.py
@@ -26,7 +26,7 @@ async def generate_research_persona_task(user_id: str):
        logger.info(f"Scheduled research persona generation started for user {user_id}")
        
        # Get database session
-        db = get_db_session()
+        db = get_db_session(user_id)
        if not db:
            logger.error(f"Failed to get database session for research persona generation (user: {user_id})")
            return
--- a/backend/services/research/research_persona_service.py
+++ b/backend/services/research/research_persona_service.py
@@ -9,13 +9,14 @@ from datetime import datetime, timedelta
 from loguru import logger
 from fastapi import HTTPException

+from sqlalchemy import text
 from services.database import get_db_session
 from models.onboarding import PersonaData, OnboardingSession
 from models.research_persona_models import ResearchPersona
 from .research_persona_prompt_builder import ResearchPersonaPromptBuilder
 from services.llm_providers.main_text_generation import llm_text_gen
-from services.onboarding.database_service import OnboardingDatabaseService
 from services.persona_data_service import PersonaDataService
+from api.content_planning.services.content_strategy.onboarding import OnboardingDataIntegrationService


 class ResearchPersonaService:
@@ -24,10 +25,62 @@ class ResearchPersonaService:
    CACHE_TTL_DAYS = 7  # 7-day cache TTL
    
    def __init__(self, db_session=None):
-        self.db = db_session or get_db_session()
+        self.db = db_session
        self.prompt_builder = ResearchPersonaPromptBuilder()
-        self.onboarding_service = OnboardingDatabaseService(db=self.db)
-        self.persona_data_service = PersonaDataService(db_session=self.db)
+        # PersonaDataService is intentionally not instantiated here: this service never used it.
+        self.integration_service = OnboardingDataIntegrationService()
+        self._research_persona_cols_checked = False
+
+    def _get_session(self, user_id: str):
+        """Return ``(session, owned)``; ``owned`` is True when a new session was opened here."""
+        if not self.db:
+            return get_db_session(user_id), True
+        return self.db, False
+
+    def _ensure_research_persona_columns(self, session_db) -> None:
+        """Ensure research_persona columns exist in persona_data table (runtime migration).
+
+        Runs once per service instance and is retried on the next call if it failed.
+        NOTE(review): the flag is per instance, not per database - confirm an instance never serves several DBs.
+        """
+        if self._research_persona_cols_checked:
+            return
+
+        try:
+            # Check if columns exist using PRAGMA (SQLite) or a probe query (PostgreSQL)
+            db_url = str(session_db.bind.url) if session_db.bind else ""
+
+            if 'sqlite' in db_url.lower():
+                # SQLite: Use PRAGMA to check columns
+                result = session_db.execute(text("PRAGMA table_info(persona_data)"))
+                cols = {row[1] for row in result}  # Column name is at index 1
+                if 'research_persona' not in cols:
+                    logger.info("Adding missing column research_persona to persona_data table")
+                    session_db.execute(text("ALTER TABLE persona_data ADD COLUMN research_persona JSON"))
+                    session_db.commit()
+
+                if 'research_persona_generated_at' not in cols:
+                    logger.info("Adding missing column research_persona_generated_at to persona_data table")
+                    session_db.execute(text("ALTER TABLE persona_data ADD COLUMN research_persona_generated_at TIMESTAMP"))
+                    session_db.commit()
+            else:
+                # PostgreSQL: Try to query the columns (will fail if they don't exist)
+                try:
+                    session_db.execute(text("SELECT research_persona, research_persona_generated_at FROM persona_data LIMIT 0"))
+                except Exception:
+                    # The failed probe aborts the current transaction; PostgreSQL rejects
+                    # further statements until it is rolled back.
+                    session_db.rollback()
+                    logger.info("Adding missing columns research_persona and research_persona_generated_at to persona_data table")
+                    # IF NOT EXISTS: only one of the two columns may be missing
+                    session_db.execute(text("ALTER TABLE persona_data ADD COLUMN IF NOT EXISTS research_persona JSONB"))
+                    session_db.execute(text("ALTER TABLE persona_data ADD COLUMN IF NOT EXISTS research_persona_generated_at TIMESTAMP"))
+                    session_db.commit()
+        except Exception as e:
+            logger.error(f"Error ensuring research_persona columns: {e}")
+            session_db.rollback()
+            raise
+        self._research_persona_cols_checked = True
    
    def get_cached_only(
        self, 
@@ -46,9 +99,16 @@ class ResearchPersonaService:
        Returns:
            ResearchPersona if exists in database, None otherwise
        """
+        db = None
+        should_close = False
        try:
+            db, should_close = self._get_session(user_id)
+            if not db:
+                logger.error(f"Could not get database session for user {user_id}")
+                return None
+                
            # Get persona data record
-            persona_data = self._get_persona_data_record(user_id)
+            persona_data = self._get_persona_data_record(user_id, db)
            
            if not persona_data:
                logger.debug(f"[get_cached_only] No persona data record found for user {user_id}")
@@ -110,6 +170,9 @@ class ResearchPersonaService:
        except Exception as e:
            logger.error(f"[get_cached_only] ❌ Error getting research persona for user {user_id}: {e}", exc_info=True)
            return None
+        finally:
+            if should_close and db:
+                db.close()

    def get_or_generate(
        self, 
@@ -126,9 +189,16 @@ class ResearchPersonaService:
        Returns:
            ResearchPersona if successful, None otherwise
        """
+        db = None
+        should_close = False
        try:
+            db, should_close = self._get_session(user_id)
+            if not db:
+                logger.error(f"Could not get database session for get_or_generate (user {user_id})")
+                return None
+                
            # Get persona data record
-            persona_data = self._get_persona_data_record(user_id)
+            persona_data = self._get_persona_data_record(user_id, db)
            
            if not persona_data:
                logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
@@ -168,18 +238,14 @@ class ResearchPersonaService:
            # 3. Parsing of existing persona failed
            try:
                logger.info(f"Generating research persona for user {user_id}")
-                research_persona = self.generate_research_persona(user_id)
+                research_persona = self.generate_research_persona(user_id, db)
            except HTTPException:
                # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
                raise
            
            if research_persona:
-                # Save to database
-                if self.save_research_persona(user_id, research_persona):
-                    logger.info(f"✅ Research persona generated and saved for user {user_id}")
-                else:
-                    logger.warning(f"Failed to save research persona for user {user_id}")
-                
+                # generate_research_persona saves it automatically now
+                logger.info(f"✅ Research persona generated and saved for user {user_id}")
                return research_persona
            else:
                # Log detailed error for debugging expensive failures
@@ -196,22 +262,36 @@ class ResearchPersonaService:
        except Exception as e:
            logger.error(f"Error getting/generating research persona for user {user_id}: {e}")
            return None
+        finally:
+            if should_close and db:
+                db.close()
    
-    def generate_research_persona(self, user_id: str) -> Optional[ResearchPersona]:
+    def generate_research_persona(self, user_id: str, db=None) -> Optional[ResearchPersona]:
        """
        Generate a new research persona for the user.
        
        Args:
            user_id: User ID (Clerk string)
+            db: Optional database session
            
        Returns:
            ResearchPersona if successful, None otherwise
        """
+        session_db = None
+        should_close = False
        try:
+            session_db = db
+            if not session_db:
+                session_db, should_close = self._get_session(user_id)
+            
+            if not session_db:
+                logger.error(f"Could not get database session for generate_research_persona (user {user_id})")
+                return None
+
            logger.info(f"Generating research persona for user {user_id}")
            
            # Collect onboarding data
-            onboarding_data = self._collect_onboarding_data(user_id)
+            onboarding_data = self._collect_onboarding_data(user_id, session_db)
            
            if not onboarding_data:
                logger.warning(f"Insufficient onboarding data for user {user_id}")
@@ -275,6 +355,12 @@ class ResearchPersonaService:
                try:
                    research_persona = ResearchPersona(**persona_dict)
                    logger.info(f"✅ Research persona generated successfully for user {user_id}")
+                    
+                    # Save the generated persona
+                    save_success = self.save_research_persona(user_id, research_persona, session_db)
+                    if not save_success:
+                        logger.warning(f"Failed to save generated persona for user {user_id}")
+                    
                    return research_persona
                except Exception as validation_error:
                    logger.error(f"Failed to validate ResearchPersona from dict: {validation_error}")
@@ -297,6 +383,9 @@ class ResearchPersonaService:
        except Exception as e:
            logger.error(f"Error generating research persona for user {user_id}: {e}")
            return None
+        finally:
+            if should_close and session_db:
+                session_db.close()
    
    def is_cache_valid(self, persona_data: PersonaData) -> bool:
        """
@@ -323,7 +412,8 @@ class ResearchPersonaService:
    def save_research_persona(
        self, 
        user_id: str, 
-        research_persona: ResearchPersona
+        research_persona: ResearchPersona,
+        db=None
    ) -> bool:
        """
        Save research persona to database.
@@ -331,12 +421,23 @@ class ResearchPersonaService:
        Args:
            user_id: User ID (Clerk string)
            research_persona: ResearchPersona to save
+            db: Optional database session
            
        Returns:
            True if successful, False otherwise
        """
+        session_db = None
+        should_close = False
        try:
-            persona_data = self._get_persona_data_record(user_id)
+            session_db = db
+            if not session_db:
+                session_db, should_close = self._get_session(user_id)
+            
+            if not session_db:
+                logger.error(f"Could not get database session for save_research_persona (user {user_id})")
+                return False
+
+            persona_data = self._get_persona_data_record(user_id, session_db)
            
            if not persona_data:
                logger.error(f"No persona data record found for user {user_id}")
@@ -349,24 +450,33 @@ class ResearchPersonaService:
            persona_data.research_persona = persona_dict
            persona_data.research_persona_generated_at = datetime.utcnow()
            
-            self.db.commit()
+            session_db.commit()
            
            logger.info(f"✅ Research persona saved for user {user_id}")
            return True
            
        except Exception as e:
            logger.error(f"Error saving research persona for user {user_id}: {e}")
-            self.db.rollback()
+            if session_db:
+                session_db.rollback()
            return False
+        finally:
+            if should_close and session_db:
+                session_db.close()
    
-    def _get_persona_data_record(self, user_id: str) -> Optional[PersonaData]:
+    def _get_persona_data_record(self, user_id: str, db=None) -> Optional[PersonaData]:
        """Get PersonaData database record for user."""
        try:
+            session_db = db or self.db
+            if not session_db:
+                logger.error(f"No database session provided for _get_persona_data_record (user {user_id})")
+                return None
+
            # Ensure research_persona columns exist before querying
-            self.onboarding_service._ensure_research_persona_columns(self.db)
+            self._ensure_research_persona_columns(session_db)
            
            # Get onboarding session
-            session = self.db.query(OnboardingSession).filter(
+            session = session_db.query(OnboardingSession).filter(
                OnboardingSession.user_id == user_id
            ).first()
            
@@ -374,7 +484,7 @@ class ResearchPersonaService:
                return None
            
            # Get persona data
-            persona_data = self.db.query(PersonaData).filter(
+            persona_data = session_db.query(PersonaData).filter(
                PersonaData.session_id == session.id
            ).first()
            
@@ -384,7 +494,7 @@ class ResearchPersonaService:
            logger.error(f"Error getting persona data record for user {user_id}: {e}")
            return None
    
-    def _collect_onboarding_data(self, user_id: str) -> Optional[Dict[str, Any]]:
+    def _collect_onboarding_data(self, user_id: str, db=None) -> Optional[Dict[str, Any]]:
        """
        Collect all onboarding data needed for research persona generation.
        
@@ -392,40 +502,44 @@ class ResearchPersonaService:
            Dictionary with website_analysis, persona_data, research_preferences, business_info
        """
        try:
-            # Get website analysis
-            website_analysis = self.onboarding_service.get_website_analysis(user_id, self.db) or {}
+            session_db = db or self.db
+            if not session_db:
+                logger.error(f"No database session provided for _collect_onboarding_data (user {user_id})")
+                return None
+                
+            # Get integrated data via SSOT
+            integrated_data = self.integration_service.get_integrated_data_sync(user_id, session_db)
            
-            # Get persona data
-            persona_data_dict = self.onboarding_service.get_persona_data(user_id, self.db) or {}
+            if not integrated_data:
+                logger.warning(f"No integrated data found for user {user_id}")
+                return None
+                
+            website_analysis = integrated_data.get('website_analysis', {})
+            persona_data_dict = integrated_data.get('persona_data', {})
+            research_prefs = integrated_data.get('research_preferences', {})
+            canonical_profile = integrated_data.get('canonical_profile', {})
            
-            # Get research preferences
-            research_prefs = self.onboarding_service.get_research_preferences(user_id, self.db) or {}
-            
-            # Get business info - construct from persona data and website analysis
            business_info = {}
+            canonical_business = canonical_profile.get('business_info')
+            if isinstance(canonical_business, dict):
+                business_info.update(canonical_business)
+
+            # Use canonical profile data (SSOT) instead of manual logic if possible
+            # The canonical profile already handles logic for industry/target_audience from various sources
+            if not business_info.get('industry') and canonical_profile.get('industry'):
+                 business_info['industry'] = canonical_profile.get('industry')
            
-            # Try to extract from persona data
-            if persona_data_dict:
-                core_persona = persona_data_dict.get('corePersona') or persona_data_dict.get('core_persona')
-                if core_persona:
-                    if core_persona.get('industry'):
-                        business_info['industry'] = core_persona['industry']
-                    if core_persona.get('target_audience'):
-                        business_info['target_audience'] = core_persona['target_audience']
+            if not business_info.get('target_audience') and canonical_profile.get('target_audience'):
+                 business_info['target_audience'] = canonical_profile.get('target_audience')
            
-            # Fallback to website analysis if not in persona
+            # Fallback logic if canonical profile is missing these (though it should have them)
            if not business_info.get('industry') and website_analysis:
                target_audience_data = website_analysis.get('target_audience', {})
                if isinstance(target_audience_data, dict):
                    industry_focus = target_audience_data.get('industry_focus')
                    if industry_focus:
                        business_info['industry'] = industry_focus
-                    demographics = target_audience_data.get('demographics')
-                    if demographics:
-                        business_info['target_audience'] = demographics if isinstance(demographics, str) else str(demographics)
            
-            # Check if we have enough data - be more lenient since we can infer from minimal data
-            # We need at least some basic information to generate a meaningful persona
            has_basic_data = bool(
                website_analysis or
                persona_data_dict or
@@ -457,20 +571,17 @@ class ResearchPersonaService:
                business_info['inferred'] = True
            
            # Get competitor analysis data (if available)
-            competitor_analysis = None
-            try:
-                competitor_analysis = self.onboarding_service.get_competitor_analysis(user_id, self.db)
-                if competitor_analysis:
-                    logger.info(f"Found {len(competitor_analysis)} competitors for research persona generation")
-            except Exception as e:
-                logger.debug(f"Could not retrieve competitor analysis for persona generation: {e}")
+            # Use SSOT (Integrated data contains competitor info)
+            competitor_analysis = integrated_data.get('competitor_analysis')
+            if not competitor_analysis:
+                competitor_analysis = []
            
            return {
                "website_analysis": website_analysis,
                "persona_data": persona_data_dict,
                "research_preferences": research_prefs,
                "business_info": business_info,
-                "competitor_analysis": competitor_analysis  # Add competitor data for better preset generation
+                "competitor_analysis": competitor_analysis
            }
            
        except Exception as e:
--- a/backend/services/research/tavily_service.py
+++ b/backend/services/research/tavily_service.py
@@ -258,6 +258,112 @@ class TavilyService:
        results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
        
        return results
+
+    async def crawl(
+        self,
+        url: str,
+        limit: int = 50,
+        max_depth: int = 1,
+        max_breadth: int = 20,
+        extract_depth: str = "basic",
+        include_favicon: bool = False,
+        instructions: str = "",
+        allow_external: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Crawl a website using the Tavily API.
+
+        Sends one POST request to ``{self.base_url}/crawl`` and wraps the
+        outcome in a result dictionary. Errors are not propagated to the
+        caller: a disabled service, a non-200 response, or any other
+        ``Exception`` raised along the way is logged and reported through
+        the returned dictionary instead.
+
+        Args:
+            url: The root URL to begin the crawl
+            limit: Total number of links the crawler will process
+            max_depth: Max depth of the crawl
+            max_breadth: Max number of links to follow per level
+            extract_depth: 'basic' or 'advanced'
+            include_favicon: Whether to include favicon
+            instructions: Natural language instructions for the crawler
+            allow_external: Whether to return external links
+
+        Returns:
+            On HTTP 200, a dict with ``success`` set to True, ``results``
+            (the ``results`` field of the JSON response, or an empty list
+            when that field is absent) and ``timestamp`` (UTC, ISO 8601).
+
+            Otherwise, a dict with ``success`` set to False, ``error`` (a
+            short message) and ``details`` (the response body for API
+            errors, a fixed explanatory string for unexpected exceptions).
+        """
+        try:
+            self._try_initialize()
+            if not self.enabled:
+                # Raised inside the try block on purpose: the handler at the
+                # bottom turns it into the standard failure dictionary.
+                raise ValueError("Tavily Service is not enabled - API key missing")
+
+            logger.info(f"Starting Tavily crawl for: {url}")
+
+            # The request body mirrors this method's parameters one-to-one.
+            # NOTE(review): the field names follow the Tavily MCP crawl tool
+            # and the /crawl path is an assumption - confirm both against the
+            # Tavily REST API reference, including whether `api_key` may be
+            # sent in the body or has to be supplied as a bearer token.
+            request_payload = {
+                "api_key": self.api_key,
+                "url": url,
+                "limit": limit,
+                "max_depth": max_depth,
+                "max_breadth": max_breadth,
+                "extract_depth": extract_depth,
+                "include_favicon": include_favicon,
+                "instructions": instructions,
+                "allow_external": allow_external
+            }
+
+            endpoint = f"{self.base_url}/crawl"
+
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    endpoint,
+                    json=request_payload,
+                    headers={"Content-Type": "application/json"},
+                    # Crawling takes longer, so the whole request is allowed
+                    # up to 300 seconds.
+                    timeout=aiohttp.ClientTimeout(total=300)
+                ) as response:
+                    if response.status == 200:
+                        # A body that cannot be decoded as JSON raises here
+                        # and is reported by the handler at the bottom.
+                        result = await response.json()
+                        logger.info("Tavily crawl completed successfully.")
+                        return {
+                            "success": True,
+                            "results": result.get("results", []),
+                            "timestamp": datetime.utcnow().isoformat()
+                        }
+
+                    # Any other status: pass the response body back to the caller.
+                    error_text = await response.text()
+                    logger.error(f"Tavily Crawl API error: {response.status} - {error_text}")
+                    return {
+                        "success": False,
+                        "error": f"Tavily API error: {response.status}",
+                        "details": error_text
+                    }
+
+        except Exception as e:
+            # Broad catch is deliberate: this method reports failures through
+            # its return value rather than by raising.
+            logger.error(f"Error in Tavily crawl: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "details": "An unexpected error occurred during crawl"
+            }
    
    async def search_industry_trends(
        self,