chore: bulk commit of local changes across blog writer, SEO dashboard, scheduler, docs-site, and frontend

2026-06-05 12:40:04 +05:30
parent b894bc0abb
commit e54aaa7a3e
74 changed files with 5667 additions and 996 deletions
--- a/backend/services/blog_writer/outline/grounding_engine.py
+++ b/backend/services/blog_writer/outline/grounding_engine.py
@@ -40,8 +40,10 @@ class GroundingContextEngine:
        }
        
        # Temporal relevance patterns
+        cy = str(datetime.now().year)
+        ny = str(datetime.now().year + 1)
        self.temporal_patterns = {
-            'recent': ['2024', '2025', 'latest', 'new', 'recent', 'current', 'updated'],
+            'recent': [cy, ny, 'latest', 'new', 'recent', 'current', 'updated'],
            'trending': ['trend', 'emerging', 'growing', 'increasing', 'rising'],
            'evergreen': ['fundamental', 'basic', 'principles', 'foundation', 'core']
        }
--- a/backend/services/blog_writer/outline/keyword_curator.py
+++ b/backend/services/blog_writer/outline/keyword_curator.py
@@ -137,6 +137,15 @@ class KeywordCurator:
            lines.append(f"### Competitive advantage signal (must weave into narrative): {content_gap[0]}")
            lines.append("   → This is your primary differentiation hook. Surface it prominently in the unique value section.")

+        lines.append("")
+        lines.append("### SUGGESTED SECTION → KEYWORD MAPPING")
+        lines.append("Map each outline section's keyword focus according to its narrative role:")
+        lines.append("- Hook / Introduction → lead with primary and trending keywords for timeliness & relevance")
+        lines.append("- Problem / Pain Point → anchor on secondary and long-tail keywords (informational intent)")
+        lines.append("- Solution / How-To → weave in primary and secondary keywords for solution-oriented search")
+        lines.append("- Comparison / Analysis → embed semantic keywords to prevent topical drift into tangents")
+        lines.append("- Case Studies / Evidence → surface content gap keywords as differentiation proof points")
+        lines.append("- Future / Trends → leverage trending and content gap keywords for forward-looking authority")
        lines.append("")
        lines.append("GUIDELINE: Treat these as the primary keyword anchors. You may include closely related")
        lines.append("intent-matching variations where natural, but avoid inserting every raw research keyword.")
@@ -176,7 +185,11 @@ class KeywordCurator:
        slot_key: Optional[str] = None,
    ) -> List[str]:
        """
-        Pick up to N items from a keyword list.
+        Pick up to N items from a keyword list with diversity sampling.
+        
+        When the raw list is more than twice the limit, selects evenly-spaced
+        entries (including the first and last when N >= 2) to capture semantic
+        diversity; shorter lists simply yield their first N entries.
        
        Args:
            data: The raw keyword_analysis dict.
@@ -184,11 +197,24 @@ class KeywordCurator:
            slot_key: The internal slot name for looking up the limit.
                      Falls back to source_key if not provided.
        Returns:
-            Sliced list of at most N strings.
+            List of at most N strings with diversity sampling.
        """
        limit_key = slot_key or source_key
        limit = self.SLOTS.get(limit_key, 5)
        raw: Any = data.get(source_key, [])
        if not isinstance(raw, list):
            return []
-        return raw[:limit]
+        if len(raw) <= limit:
+            return raw
+        if len(raw) <= limit * 2:
+            return raw[:limit]
+        indices = set()
+        if limit >= 2:
+            indices.add(0)
+            indices.add(len(raw) - 1)
+            step = (len(raw) - 1) / max(limit - 1, 1)
+            for i in range(1, limit - 1):
+                indices.add(int(round(i * step)))
+        else:
+            indices.add(0)
+        return [raw[i] for i in sorted(indices) if i < len(raw)][:limit]
--- a/backend/services/blog_writer/outline/outline_generator.py
+++ b/backend/services/blog_writer/outline/outline_generator.py
@@ -124,7 +124,8 @@ class OutlineGenerator:
        content_angle_titles = self.title_generator.extract_content_angle_titles(research)
        
        # Combine AI-generated titles with content angles (full primary keywords for title variety)
-        title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
+        research_topic = getattr(request, 'topic', '') or ''
+        title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords, research_topic)
        
        logger.info(f"Generated optimized outline with {len(balanced_sections)} sections and {len(title_options)} title options")
        
@@ -224,7 +225,8 @@ class OutlineGenerator:
        content_angle_titles = self.title_generator.extract_content_angle_titles(research)
        
        # Combine AI-generated titles with content angles (full primary keywords for title variety)
-        title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
+        research_topic = getattr(request, 'topic', '') or ''
+        title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords, research_topic)
        
        await task_manager.update_progress(task_id, "✅ Outline generation and optimization completed successfully!")
        
--- a/backend/services/blog_writer/outline/prompt_builder.py
+++ b/backend/services/blog_writer/outline/prompt_builder.py
@@ -36,12 +36,56 @@ class PromptBuilder:
        competitor_text = ', '.join(research.competitor_analysis.get('top_competitors', [])) if research and research.competitor_analysis else "Not available"
        opportunity_text = ', '.join(research.competitor_analysis.get('opportunities', [])) if research and research.competitor_analysis else "Not available"
        advantages_text = ', '.join(research.competitor_analysis.get('competitive_advantages', [])) if research and research.competitor_analysis else "Not available"
+        competitor_headings_text = ', '.join(research.competitor_analysis.get('competitor_headings', [])[:3]) if research and research.competitor_analysis and research.competitor_analysis.get('competitor_headings') else ""
        
        # Extract additional UI-mapped context fields
        analysis_insights_text = (research.keyword_analysis.get('analysis_insights', '') or '') if research and research.keyword_analysis else ''
        market_positioning_text = (research.competitor_analysis.get('market_positioning', '') or '') if research and research.competitor_analysis else ''
        difficulty_score = research.keyword_analysis.get('difficulty', None) if research and research.keyword_analysis else None

+        # Extract top 3 authoritative source excerpts as factual data points
+        source_excerpts_text = ""
+        if sources:
+            sorted_sources = sorted(
+                [s for s in sources if (s.excerpt or s.summary)],
+                key=lambda s: s.credibility_score or 0.8, reverse=True
+            )[:3]
+            excerpts = []
+            for i, src in enumerate(sorted_sources, 1):
+                excerpt = src.excerpt or src.summary or ""
+                if len(excerpt) > 300:
+                    excerpt = excerpt[:297] + "..."
+                excerpts.append(f"  {i}. \"{src.title}\" — {excerpt}")
+            if excerpts:
+                source_excerpts_text = "FACTUAL DATA POINTS FROM RESEARCH:\n" + "\n".join(excerpts)
+
+        # Extract recency: newest source publication date
+        newest_date_str = ""
+        if sources:
+            valid_dates = [s.published_at for s in sources if s.published_at]
+            if valid_dates:
+                try:
+                    parsed = [d for d in valid_dates if d[:4].isdigit()]
+                    if parsed:
+                        sorted_dates = sorted(parsed, reverse=True)
+                        newest_date_str = f"Most Recent Source: {sorted_dates[0]}"
+                except Exception:
+                    pass
+
+        # Extract the first three grounding snippets longer than 20 characters (no confidence ranking is applied)
+        grounding_evidence_text = ""
+        if research and research.grounding_metadata and research.grounding_metadata.grounding_supports:
+            supports = research.grounding_metadata.grounding_supports
+            top_supports = [s for s in supports if s.segment_text and len(s.segment_text) > 20][:3]
+            if top_supports:
+                evidence_parts = []
+                for i, s in enumerate(top_supports, 1):
+                    text = s.segment_text[:250]
+                    if len(s.segment_text) > 250:
+                        text += "..."
+                    evidence_parts.append(f"  {i}. {text}")
+                grounding_evidence_text = "VERIFIED EVIDENCE (high-confidence snippets):\n" + "\n".join(evidence_parts)
+
        # Build selected angle prominence section
        if selected_content_angle and selected_content_angle.strip():
            selected_angle_section = f"""
@@ -106,8 +150,14 @@ Top Competitors: {competitor_text}
 Market Opportunities: {opportunity_text}
 Competitive Advantages: {advantages_text}
 {f"Market Positioning: {market_positioning_text}" if market_positioning_text else ""}
+{f"Competitor Headings (AVOID duplicating): {competitor_headings_text}" if competitor_headings_text else ""}

 RESEARCH SOURCES: {len(sources)} authoritative sources available
+{newest_date_str}
+
+{source_excerpts_text}
+
+{grounding_evidence_text}

 {f"CUSTOM INSTRUCTIONS: {custom_instructions}" if custom_instructions else ""}

--- a/backend/services/blog_writer/outline/title_generator.py
+++ b/backend/services/blog_writer/outline/title_generator.py
@@ -54,58 +54,58 @@ class TitleGenerator:
        Returns:
            Formatted title string
        """
-        if not angle or len(angle.strip()) < 10:  # Too short to be a good title
+        if not angle or len(angle.strip()) < 10:
            return ""
        
-        # Clean up the angle
        cleaned_angle = angle.strip()
        
-        # Capitalize first letter of each sentence and proper nouns
-        sentences = cleaned_angle.split('. ')
-        formatted_sentences = []
-        for sentence in sentences:
-            if sentence.strip():
-                # Use title case for better formatting
-                formatted_sentence = sentence.strip().title()
-                formatted_sentences.append(formatted_sentence)
-        
-        formatted_title = '. '.join(formatted_sentences)
-        
-        # Ensure it ends with proper punctuation
-        if not formatted_title.endswith(('.', '!', '?')):
-            formatted_title += '.'
+        # Use sentence case: capitalize first letter, rest as-is
+        if cleaned_angle:
+            cleaned_angle = cleaned_angle[0].upper() + cleaned_angle[1:]
        
        # Limit length to reasonable blog title size
-        if len(formatted_title) > 200:
-            formatted_title = formatted_title[:197] + "..."
+        if len(cleaned_angle) > 120:
+            cleaned_angle = cleaned_angle[:117] + "..."
        
-        return formatted_title
+        return cleaned_angle
    
-    def combine_title_options(self, ai_titles: List[str], content_angle_titles: List[str], primary_keywords: List[str]) -> List[str]:
+    def combine_title_options(self, ai_titles: List[str], content_angle_titles: List[str], primary_keywords: List[str], research_topic: str = "") -> List[str]:
        """
        Combine AI-generated titles with content angle titles, ensuring variety and quality.
        
+        AI titles (proper SEO titles generated by the LLM) are listed first.
+        Up to three content angle titles (long-format descriptions) follow them.
+        If neither yields a title, the research topic is used; failing that, the first primary keyword.
+        
        Args:
-            ai_titles: AI-generated title options
-            content_angle_titles: Titles derived from content angles
+            ai_titles: AI-generated title options (proper blog titles, 50-65 chars)
+            content_angle_titles: Titles derived from content angles (longer, descriptive)
            primary_keywords: Primary keywords for fallback generation
+            research_topic: Original user research topic, used only when no titles were produced
            
        Returns:
            Combined list of title options (max 6 total)
        """
        all_titles = []
        
-        # Add content angle titles first (these are research-based and valuable)
-        for title in content_angle_titles[:3]:  # Limit to top 3 content angles
-            if title and title not in all_titles:
-                all_titles.append(title)
-        
-        # Add AI-generated titles
+        # 1. AI-generated titles first (proper SEO titles from LLM)
        for title in ai_titles:
            if title and title not in all_titles:
                all_titles.append(title)
        
-        # Note: Removed fallback titles as requested - only use research and AI-generated titles
+        # 2. Then up to three content angle titles (research-based, but verbose)
+        for title in content_angle_titles[:3]:
+            if title and title not in all_titles:
+                all_titles.append(title)
+        
+        # 3. Research topic as last resort when nothing was generated
+        if not all_titles and research_topic:
+            all_titles.append(research_topic)
+        
+        # 4. Primary keyword fallback as absolute last resort
+        if not all_titles and primary_keywords:
+            kw = primary_keywords[0]
+            all_titles.append(kw)
        
        # Limit to 6 titles maximum for UI usability
        final_titles = all_titles[:6]
@@ -115,9 +115,10 @@ class TitleGenerator:
    
    def generate_fallback_titles(self, primary_keywords: List[str]) -> List[str]:
        """Generate fallback titles when AI generation fails."""
+        from datetime import datetime
        primary_keyword = primary_keywords[0] if primary_keywords else "Topic"
        return [
            f"The Complete Guide to {primary_keyword}",
            f"{primary_keyword}: Everything You Need to Know",
-            f"How to Master {primary_keyword} in 2024"
+            f"How to Master {primary_keyword} in {datetime.now().year}"
        ]
--- a/backend/services/blog_writer/research/data_filter.py
+++ b/backend/services/blog_writer/research/data_filter.py
@@ -432,7 +432,7 @@ class ResearchDataFilter:
            'how to', 'guide', 'tutorial', 'steps', 'process', 'method',
            'best practices', 'tips', 'strategies', 'techniques', 'approach',
            'comparison', 'vs', 'versus', 'difference', 'pros and cons',
-            'trends', 'future', '2024', '2025', 'emerging', 'new'
+            'trends', 'future', str(datetime.now().year), str(datetime.now().year + 1), 'emerging', 'new'
        ]
        
        for indicator in actionable_indicators:
--- a/backend/services/blog_writer/research/research_service.py
+++ b/backend/services/blog_writer/research/research_service.py
@@ -720,7 +720,7 @@ class ResearchService:
                url=src.get("url", ""),
                excerpt=src.get("content", "")[:500] if src.get("content") else f"Source from {src.get('title', 'web')}",
                credibility_score=float(src.get("credibility_score", 0.8)),
-                published_at=str(src.get("publication_date", "2024-01-01")),
+                published_at=str(src.get("publication_date", f"{datetime.now().year}-01-01")),
                index=src.get("index"),
                source_type=src.get("type", "web")
            )
--- a/backend/services/blog_writer/research/research_strategies.py
+++ b/backend/services/blog_writer/research/research_strategies.py
@@ -6,6 +6,7 @@ Different strategies for executing research based on depth and focus.

 from abc import ABC, abstractmethod
 from typing import Dict, Any
+from datetime import datetime
 from loguru import logger

 from models.blog_models import BlogResearchRequest, ResearchMode, ResearchConfig
@@ -87,7 +88,7 @@ Provide analysis in this EXACT format:
 - For each: Quote/claim, source URL, published date, metric/context.

 REQUIREMENTS:
- Every claim MUST include a source URL (authoritative, recent: 2024-2025 preferred).
+- Every claim MUST include a source URL (authoritative, recent: {datetime.now().year}-{datetime.now().year + 1} preferred).
 - Use concrete numbers, dates, outcomes; avoid generic advice.
 - Keep bullets tight and scannable for spoken narration."""
        return prompt.strip()
@@ -116,7 +117,7 @@ Research Topic: "{topic}"{date_filter}{source_filter}

 Provide COMPLETE analysis in this EXACT format:

-## WHAT'S CHANGED (2024-2025)
+## WHAT'S CHANGED ({datetime.now().year}-{datetime.now().year + 1})
 [5-7 concise trend bullets with numbers + source URLs]

 ## PROOF & NUMBERS
@@ -151,7 +152,7 @@ Primary (3), Secondary (8-10), Long-tail (5-7) with intent hints.
 VERIFICATION REQUIREMENTS:
 - Minimum 2 authoritative sources per major claim.
 - Prefer industry reports > research papers > news > blogs.
- 2024-2025 data strongly preferred.
+- {datetime.now().year}-{datetime.now().year + 1} data strongly preferred.
 - All numbers must include timeframe and methodology.
 - Every bullet must be concise for spoken narration and actionable for {target_audience}."""
        return prompt.strip()
@@ -213,7 +214,7 @@ REQUIREMENTS:
 - Cite all claims with authoritative source URLs
 - Include specific numbers, dates, examples
 - Focus on actionable insights for {target_audience}
- Use 2024-2025 data when available"""
+- Use {datetime.now().year}-{datetime.now().year + 1} data when available"""
        return prompt.strip()


--- a/backend/services/database.py
+++ b/backend/services/database.py
@@ -36,6 +36,8 @@ from models.podcast_models import PodcastProject
 from models.research_models import ResearchProject
 # Video Studio models
 from models.video_models import VideoGenerationTask
+# YouTube Creator task models
+from models.youtube_task_models import YouTubeVideoTask
 # Bing Analytics models
 from models.bing_analytics_models import Base as BingAnalyticsBase

--- a/backend/services/gsc_brainstorm_service.py
+++ b/backend/services/gsc_brainstorm_service.py
@@ -47,6 +47,10 @@ class GSCBrainstormService:
        if not site_url:
            sites = self.gsc_service.get_site_list(user_id)
            if not sites:
+                logger.info(f"No GSC sites found for user {user_id} — falling back to AI-only brainstorm")
+                fallback = self._generate_ai_only_brainstorm(user_id, keywords, None, None, None)
+                if fallback:
+                    return fallback
                return {
                    "error": "No GSC sites found. Make sure your site is verified in Google Search Console.",
                    "content_opportunities": [],
@@ -70,6 +74,10 @@ class GSCBrainstormService:
        )

        if "error" in analytics:
+            logger.info(f"GSC analytics error for user {user_id}: {analytics.get('error')} — falling back to AI-only brainstorm")
+            fallback = self._generate_ai_only_brainstorm(user_id, keywords, site_url, start_date, end_date)
+            if fallback:
+                return fallback
            return {
                "error": analytics.get("error", "Failed to fetch GSC data"),
                "content_opportunities": [],
@@ -88,6 +96,10 @@ class GSCBrainstormService:
        pages_data = self._parse_page_rows(page_rows)

        if not keywords_data:
+            logger.info(f"No GSC keyword data for user {user_id} — falling back to AI-only brainstorm")
+            fallback = self._generate_ai_only_brainstorm(user_id, keywords, site_url, start_date, end_date)
+            if fallback:
+                return fallback
            return {
                "error": "No keyword data available for the selected period. This usually means your site is new to GSC or hasn't received search traffic yet.",
                "content_opportunities": [],
@@ -110,6 +122,10 @@ class GSCBrainstormService:
        logger.info(f"After topic filter: {len(keywords_data)} keywords, {len(pages_data)} pages")

        if not keywords_data:
+            logger.info(f"No GSC keywords matched topic '{keywords}' for user {user_id} — falling back to AI-only brainstorm")
+            fallback = self._generate_ai_only_brainstorm(user_id, keywords, site_url, start_date, end_date)
+            if fallback:
+                return fallback
            return {
                "error": "No GSC keywords matched your topic. Try a broader research topic or check your GSC data.",
                "content_opportunities": [],
@@ -155,6 +171,128 @@ class GSCBrainstormService:
            "summary": summary,
        }

+    # ------------------------------------------------------------------ #
+    #  AI-only fallback (when GSC has no data)
+    # ------------------------------------------------------------------ #
+
+    def _generate_ai_only_brainstorm(
+        self,
+        user_id: str,
+        keywords: str,
+        site_url: Optional[str],
+        start_date: Optional[str],
+        end_date: Optional[str],
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Generate topic ideas using AI alone when GSC data is unavailable.
+        Returns a brainstorm-shaped result with empty GSC-specific arrays
+        but populated ai_recommendations, or None when no usable AI response is obtained.
+        """
+        try:
+            prompt = f"""You are an expert content strategist helping a blog writer brainstorm topic ideas.
+
+The user is interested in writing about: "{keywords}"
+
+Since they are a new or early-stage website, there is no Google Search Console data available yet.
+Generate compelling blog post ideas they can write RIGHT NOW to start building traffic.
+
+For each suggestion include:
+1. A specific, compelling blog post TITLE (not a vague topic)
+2. The primary keyword it should target
+3. Why this topic will perform well (search demand, competition level, timing)
+4. The recommended content format (how-to, listicle, comparison, pillar page, etc.)
+5. Estimated difficulty level (Easy / Medium / Hard)
+
+Return your response in this EXACT JSON format (no markdown, no code fences):
+{{
+  "immediate_opportunities": [
+    {{
+      "title": "Specific Blog Post Title",
+      "keyword": "primary target keyword",
+      "reason": "Why this will perform well",
+      "format": "How-To Guide | Listicle | Comparison | Pillar Page | etc.",
+      "estimated_impact": "Beginner-friendly traffic opportunity"
+    }}
+  ],
+  "content_strategy": [
+    {{
+      "title": "Pillar Content Title",
+      "keyword": "target keyword",
+      "reason": "Strategic importance for building topical authority",
+      "format": "Pillar Page | Ultimate Guide | Resource",
+      "estimated_impact": "Foundation for long-term organic growth"
+    }}
+  ],
+  "long_term_strategy": [
+    {{
+      "title": "Authority Building Title",
+      "keyword": "target keyword",
+      "reason": "Establishes expertise and captures high-intent traffic over time",
+      "format": "Research-Backed Analysis | Expert Roundup | Original Study",
+      "estimated_impact": "Compound traffic growth over 6-12 months"
+    }}
+  ]
+}}
+
+IMPORTANT:
+- Provide 3-5 items in each category
+- All suggestions MUST relate to the user's interest in "{keywords}"
+- Titles should be specific, compelling, and SEO-aware
+- Prioritize topics with clear search intent and realistic ranking potential for a new site
+- Include a mix of easy wins (long-tail, low competition) and strategic pillar content
+- For estimated_impact, describe the opportunity type (not click numbers since we lack data)"""
+
+            system_prompt = (
+                "You are an expert content strategist specializing in SEO and blog topic generation. "
+                "You help new websites identify high-potential content topics even without search console data. "
+                "You always respond with valid JSON matching the requested format exactly."
+            )
+
+            result = llm_text_gen(
+                prompt=prompt,
+                system_prompt=system_prompt,
+                user_id=user_id,
+                flow_type="gsc_brainstorm_fallback",
+            )
+
+            if result:
+                parsed = self._parse_ai_response(result)
+                if parsed:
+                    return {
+                        "content_opportunities": [],
+                        "keyword_gaps": [],
+                        "quick_wins": [],
+                        "page_opportunities": [],
+                        "ai_recommendations": parsed,
+                        "summary": {
+                            "site_url": site_url or "",
+                            "date_range": {
+                                "start": start_date or "",
+                                "end": end_date or "",
+                            },
+                            "total_keywords_analyzed": 0,
+                            "total_impressions": 0,
+                            "total_clicks": 0,
+                            "avg_ctr": 0,
+                            "avg_position": 0,
+                            "ctr_vs_benchmark": 0,
+                            "health_score": 0,
+                            "keyword_distribution": {
+                                "positions_1_3": 0,
+                                "positions_4_10": 0,
+                                "positions_11_20": 0,
+                                "positions_21_plus": 0,
+                            },
+                            "top_keywords": [],
+                            "top_pages": [],
+                            "note": "AI-generated suggestions based on your topic. No GSC data was available — these are strategic recommendations, not data-driven insights."
+                        },
+                    }
+        except Exception as e:
+            logger.warning(f"AI-only brainstorm fallback failed for user {user_id}: {e}")
+
+        return None
+
    # ------------------------------------------------------------------ #
    #  Data parsing helpers
    # ------------------------------------------------------------------ #
--- a/backend/services/gsc_service.py
+++ b/backend/services/gsc_service.py
@@ -188,7 +188,6 @@ class GSCService:
                
            with sqlite3.connect(db_path) as conn:
                cursor = conn.cursor()
-                # Check if table exists first to avoid error on fresh DB
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='gsc_credentials'")
                if not cursor.fetchone():
                    return None
@@ -204,7 +203,6 @@ class GSCService:
                
                credentials_data = json.loads(result[0])
                
-                # Check for required fields, but allow connection without refresh token
                required_fields = ['token_uri', 'client_id', 'client_secret']
                missing_fields = [field for field in required_fields if not credentials_data.get(field)]
                
@@ -214,7 +212,6 @@ class GSCService:
                
                credentials = Credentials.from_authorized_user_info(credentials_data, self.scopes)
                
-                # Refresh token if needed and possible
                if credentials.expired:
                    if credentials.refresh_token:
                        try:
@@ -222,9 +219,11 @@ class GSCService:
                            self.save_user_credentials(user_id, credentials)
                        except Exception as e:
                            logger.error(f"Failed to refresh GSC token for user {user_id}: {e}")
+                            self.clear_incomplete_credentials(user_id)
                            return None
                    else:
                        logger.warning(f"GSC token expired for user {user_id} but no refresh token available - user needs to re-authorize")
+                        self.clear_incomplete_credentials(user_id)
                        return None
                
                return credentials
@@ -288,7 +287,6 @@ class GSCService:
        try:
            logger.info(f"Handling GSC OAuth callback with state: {state[:20]}...")
            
-            # Extract user_id from state
            if ':' not in state:
                logger.error(f"Invalid GSC state format: {state}")
                return False
@@ -300,17 +298,19 @@ class GSCService:
                logger.error(f"User database not found for user {user_id}")
                return False

-            # Verify state in user's DB (but don't delete yet — delete after successful token exchange)
-            with sqlite3.connect(db_path) as conn:
-                cursor = conn.cursor()
-                cursor.execute('SELECT user_id FROM gsc_oauth_states WHERE state = ?', (state,))
-                result = cursor.fetchone()
-                
-                if not result:
-                    logger.error(f"Invalid or expired GSC OAuth state for user {user_id}")
-                    return False
-            
-            # Exchange code for credentials
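+            # Verify state in user's DB (best effort — if missing, attempt code exchange anyway). NOTE(review): proceeding without a matching state removes the OAuth CSRF check — confirm this is intended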
+            state_valid = False
+            try:
+                with sqlite3.connect(db_path) as conn:
+                    cursor = conn.cursor()
+                    cursor.execute('SELECT user_id FROM gsc_oauth_states WHERE state = ?', (state,))
+                    state_valid = cursor.fetchone() is not None
+            except Exception as state_err:
+                logger.warning(f"State verification query failed, proceeding anyway: {state_err}")
+
+            if not state_valid:
+                logger.warning(f"GSC OAuth state not found in DB for user {user_id} — will attempt code exchange without state verification")
+
            if not self.client_config:
                logger.error("Cannot handle callback: Client configuration not loaded")
                return False
@@ -324,21 +324,30 @@ class GSCService:
            
            flow.fetch_token(code=authorization_code)
            credentials = flow.credentials
+
+            if not credentials or not credentials.token:
+                logger.error(f"Token exchange returned empty credentials for user {user_id}")
+                return False
            
-            # State consumed successfully — clean up
-            try:
-                with sqlite3.connect(db_path) as conn:
-                    cursor = conn.cursor()
-                    cursor.execute('DELETE FROM gsc_oauth_states WHERE state = ?', (state,))
-                    conn.commit()
-            except Exception as cleanup_err:
-                logger.warning(f"Failed to clean up OAuth state: {cleanup_err}")
+            # Clean up state if it was valid
+            if state_valid:
+                try:
+                    with sqlite3.connect(db_path) as conn:
+                        cursor = conn.cursor()
+                        cursor.execute('DELETE FROM gsc_oauth_states WHERE state = ?', (state,))
+                        conn.commit()
+                except Exception as cleanup_err:
+                    logger.warning(f"Failed to clean up OAuth state: {cleanup_err}")
            
-            # Save credentials
-            return self.save_user_credentials(user_id, credentials)
+            result = self.save_user_credentials(user_id, credentials)
+            if result:
+                logger.info(f"GSC OAuth callback succeeded for user {user_id} (state_valid={state_valid})")
+            else:
+                logger.error(f"GSC OAuth callback: token exchange succeeded but failed to save credentials for user {user_id}")
+            return result
            
        except Exception as e:
-            logger.error(f"Error handling GSC OAuth callback: {e}")
+            logger.error(f"Error handling GSC OAuth callback for user {user_id if 'user_id' in dir() else 'unknown'}: {e}")
            return False

    
@@ -726,6 +735,8 @@ class GSCService:
            with sqlite3.connect(db_path) as conn:
                cursor = conn.cursor()
                cursor.execute('DELETE FROM gsc_credentials WHERE user_id = ?', (user_id,))
+                cursor.execute('DELETE FROM gsc_data_cache WHERE user_id = ?', (user_id,))
+                cursor.execute('DELETE FROM gsc_oauth_states WHERE user_id = ?', (user_id,))
                conn.commit()
            
            logger.info(f"Cleared incomplete GSC credentials for user: {user_id}")
--- a/backend/services/integrations/wix/auth.py
+++ b/backend/services/integrations/wix/auth.py
@@ -66,12 +66,19 @@ class WixAuthService:
        response.raise_for_status()
        return response.json()

-    def get_site_info(self, access_token: str) -> Dict[str, Any]:
+    def get_site_info(self, access_token: str, meta_site_id: Optional[str] = None) -> Dict[str, Any]:
        headers = {
            'Authorization': f'Bearer {access_token}',
-            'Content-Type': 'application/json'
+            'Content-Type': 'application/json',
        }
+        if self.client_id:
+            headers['wix-client-id'] = self.client_id
+        if meta_site_id:
+            headers['wix-site-id'] = meta_site_id
        response = requests.get(f"{self.base_url}/sites/v1/site", headers=headers)
+        if response.status_code == 404:
+            logger.warning("Wix site info not found (404) — user may not have a published site or token lacks sites scope")
+            return {"_no_site": True, "error": "No Wix site found for this account"}
        response.raise_for_status()
        return response.json()

--- a/backend/services/integrations/wix/blog_publisher.py
+++ b/backend/services/integrations/wix/blog_publisher.py
@@ -295,39 +295,39 @@ def create_blog_post(
    wix_logger.log_token_info(token_length, has_blog_scope, meta_site_id)
    
    # Convert markdown to Ricos
-    ricos_content = convert_content_to_ricos(content, None)
+    # PRIMARY: Use Wix Ricos Documents API for best formatting support (tables, complex markdown, etc.)
+    # FALLBACK: Use custom parser if Wix API fails
+    ricos_content = None
+    try:
+        logger.info("Converting markdown via Wix Ricos Documents API...")
+        ricos_content = convert_via_wix_api(content, access_token, base_url)
+        logger.info(f"Wix API conversion succeeded: {len(ricos_content.get('nodes', []))} nodes")
+    except Exception as e:
+        logger.warning(f"Wix API conversion failed, falling back to custom parser: {e}")
+    
+    if not ricos_content or not isinstance(ricos_content, dict) or 'nodes' not in ricos_content:
+        logger.info("Using custom markdown parser for Ricos conversion")
+        ricos_content = convert_content_to_ricos(content, None)
+    
    nodes_count = len(ricos_content.get('nodes', []))
    wix_logger.log_ricos_conversion(nodes_count)
    
    # Validate Ricos content structure
-    # Per Wix Blog API documentation: richContent should ONLY contain 'nodes'
-    # The example in docs shows: { nodes: [...] } - no type, id, metadata, or documentStyle
    if not isinstance(ricos_content, dict):
-        logger.error(f"❌ richContent is not a dict: {type(ricos_content)}")
+        logger.error(f"richContent is not a dict: {type(ricos_content)}")
        raise ValueError("richContent must be a dictionary object")
    
    if 'nodes' not in ricos_content or not isinstance(ricos_content['nodes'], list):
-        logger.error(f"❌ richContent.nodes is missing or not a list: {ricos_content.get('nodes', 'MISSING')}")
+        logger.error(f"richContent.nodes is missing or not a list: {ricos_content.get('nodes', 'MISSING')}")
        raise ValueError("richContent must contain a 'nodes' array")
    
-    # Remove type and id fields (not expected by Blog API)
-    # NOTE: metadata is optional - Wix UPDATE endpoint example shows it, but CREATE example doesn't
-    # We'll keep it minimal (nodes only) for CREATE to match the recipe example
-    fields_to_remove = ['type', 'id']
-    for field in fields_to_remove:
+    # Remove top-level fields not expected by Blog API CREATE endpoint
+    # (Wix API converter may include type, id, metadata, documentStyle — strip them)
+    for field in ['type', 'id', 'metadata', 'documentStyle']:
        if field in ricos_content:
-            logger.debug(f"Removing '{field}' field from richContent (Blog API doesn't expect this)")
+            logger.debug(f"Removing '{field}' from richContent for Blog API compatibility")
            del ricos_content[field]
    
-    # Remove metadata and documentStyle - Blog API CREATE endpoint example shows only 'nodes'
-    # (UPDATE endpoint shows metadata, but we're using CREATE)
-    if 'metadata' in ricos_content:
-        logger.debug("Removing 'metadata' from richContent (CREATE endpoint expects only 'nodes')")
-        del ricos_content['metadata']
-    if 'documentStyle' in ricos_content:
-        logger.debug("Removing 'documentStyle' from richContent (CREATE endpoint expects only 'nodes')")
-        del ricos_content['documentStyle']
-    
    # Ensure we only have 'nodes' in richContent for CREATE endpoint
    ricos_content = {'nodes': ricos_content['nodes']}
    
--- a/backend/services/intelligence/sif_integration.py
+++ b/backend/services/intelligence/sif_integration.py
@@ -708,7 +708,48 @@ class SIFIntegrationService:
                themes = adv_insights.get('augmented_themes', [])
                if themes:
                    text_content += f"Augmented Themes: {', '.join(themes[:5])}. "
-                
+
+                freshness = adv_insights.get('freshness', {})
+                if freshness:
+                    text_content += (f"Content Freshness Score: {freshness.get('freshness_score', 'N/A')}. "
+                                     f"Publishing Velocity: {freshness.get('publishing_velocity', 0)}/week. "
+                                     f"Trend: {freshness.get('publishing_trend', 'unknown')}. "
+                                     f"Last 30d: {freshness.get('publishing_recency', {}).get('last_30d', 0)} pages. ")
+
+                link_health = adv_insights.get('link_health', {})
+                if link_health and 'error' not in link_health:
+                    text_content += (f"Internal Links: {link_health.get('internal_link_count', 0)}. "
+                                     f"External Links: {link_health.get('external_link_count', 0)}. "
+                                     f"Nofollow: {link_health.get('nofollow_link_count', 0)}. "
+                                     f"Avg Links/Page: {link_health.get('avg_links_per_page', 0)}. ")
+
+                redirects = adv_insights.get('redirect_audit', {})
+                if redirects and 'error' not in redirects:
+                    text_content += (f"Redirects: {redirects.get('total_redirects', 0)} total, "
+                                     f"{redirects.get('multi_hop_chains', 0)} multi-hop. ")
+
+                image_seo = adv_insights.get('image_seo', {})
+                if image_seo and 'error' not in image_seo:
+                    text_content += (f"Images: {image_seo.get('total_images', 0)} total, "
+                                     f"Alt Coverage: {image_seo.get('alt_coverage_percentage', 0)}%. ")
+
+                url_struct = adv_insights.get('url_structure', {})
+                if url_struct:
+                    text_content += (f"URL Structure: {url_struct.get('total_urls_analyzed', 0)} URLs, "
+                                     f"Avg Depth: {url_struct.get('directory_depth', {}).get('average_depth', 0)}. "
+                                     f"Params: {url_struct.get('parameter_usage', {}).get('percentage_with_params', 0)}%. ")
+
+                robots = adv_insights.get('robots_txt', {})
+                if robots and robots.get('success'):
+                    text_content += (f"Robots.txt: {robots.get('total_directives', 0)} directives, "
+                                     f"Compliance: {robots.get('compliance_score', 0)}/100. "
+                                     f"Issues: {len(robots.get('issues', []))}. ")
+
+                budget = adv_insights.get('crawl_budget', {})
+                if budget and budget.get('success'):
+                    text_content += (f"Crawl Budget: {budget.get('pages_crawled', 0)} crawled of {budget.get('sitemap_total_urls', 0)} URLs. "
+                                     f"Waste: {budget.get('waste_percentage', 0)}%. "
+                                     f"Score: {budget.get('optimization_score', 0)}. ")
            # Add Technical SEO overview
            tech_audit = dashboard_data.get('technical_seo_audit', {})
            if tech_audit:
--- a/backend/services/scheduler/core/failure_detection_service.py
+++ b/backend/services/scheduler/core/failure_detection_service.py
@@ -370,6 +370,136 @@ class FailureDetectionService:
                        "last_failure": task.last_failure.isoformat() if task.last_failure else None
                    })
            
+            # Check onboarding full website analysis tasks
+            from models.website_analysis_monitoring_models import OnboardingFullWebsiteAnalysisTask
+            onboarding_tasks = self.db.query(OnboardingFullWebsiteAnalysisTask).filter(
+                OnboardingFullWebsiteAnalysisTask.status == "needs_intervention"
+            )
+            if user_id:
+                onboarding_tasks = onboarding_tasks.filter(OnboardingFullWebsiteAnalysisTask.user_id == user_id)
+            
+            for task in onboarding_tasks.all():
+                pattern = self.analyze_task_failures(task.id, "onboarding_full_website_analysis", task.user_id)
+                tasks_needing_intervention.append({
+                    "task_id": task.id,
+                    "task_type": "onboarding_full_website_analysis",
+                    "user_id": task.user_id,
+                    "website_url": task.website_url,
+                    "failure_pattern": {
+                        "consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
+                        "recent_failures": pattern.recent_failures if pattern else 0,
+                        "failure_reason": pattern.failure_reason.value if pattern else "unknown",
+                        "last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
+                        "error_patterns": pattern.error_patterns if pattern else [],
+                    },
+                    "failure_reason": task.failure_reason,
+                    "last_failure": task.last_failure.isoformat() if task.last_failure else None
+                })
+            
+            # Check deep competitor analysis tasks
+            from models.website_analysis_monitoring_models import DeepCompetitorAnalysisTask
+            competitor_tasks = self.db.query(DeepCompetitorAnalysisTask).filter(
+                DeepCompetitorAnalysisTask.status == "needs_intervention"
+            )
+            if user_id:
+                competitor_tasks = competitor_tasks.filter(DeepCompetitorAnalysisTask.user_id == user_id)
+            
+            for task in competitor_tasks.all():
+                pattern = self.analyze_task_failures(task.id, "deep_competitor_analysis", task.user_id)
+                tasks_needing_intervention.append({
+                    "task_id": task.id,
+                    "task_type": "deep_competitor_analysis",
+                    "user_id": task.user_id,
+                    "website_url": task.website_url,
+                    "failure_pattern": {
+                        "consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
+                        "recent_failures": pattern.recent_failures if pattern else 0,
+                        "failure_reason": pattern.failure_reason.value if pattern else "unknown",
+                        "last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
+                        "error_patterns": pattern.error_patterns if pattern else [],
+                    },
+                    "failure_reason": task.failure_reason,
+                    "last_failure": task.last_failure.isoformat() if task.last_failure else None
+                })
+            
+            # Check SIF indexing tasks
+            from models.website_analysis_monitoring_models import SIFIndexingTask
+            sif_tasks = self.db.query(SIFIndexingTask).filter(
+                SIFIndexingTask.status == "needs_intervention"
+            )
+            if user_id:
+                sif_tasks = sif_tasks.filter(SIFIndexingTask.user_id == user_id)
+            
+            for task in sif_tasks.all():
+                pattern = self.analyze_task_failures(task.id, "sif_indexing", task.user_id)
+                tasks_needing_intervention.append({
+                    "task_id": task.id,
+                    "task_type": "sif_indexing",
+                    "user_id": task.user_id,
+                    "website_url": task.website_url,
+                    "failure_pattern": {
+                        "consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
+                        "recent_failures": pattern.recent_failures if pattern else 0,
+                        "failure_reason": pattern.failure_reason.value if pattern else "unknown",
+                        "last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
+                        "error_patterns": pattern.error_patterns if pattern else [],
+                    },
+                    "failure_reason": task.failure_reason,
+                    "last_failure": task.last_failure.isoformat() if task.last_failure else None
+                })
+            
+            # Check market trends tasks
+            from models.website_analysis_monitoring_models import MarketTrendsTask
+            trends_tasks = self.db.query(MarketTrendsTask).filter(
+                MarketTrendsTask.status == "needs_intervention"
+            )
+            if user_id:
+                trends_tasks = trends_tasks.filter(MarketTrendsTask.user_id == user_id)
+            
+            for task in trends_tasks.all():
+                pattern = self.analyze_task_failures(task.id, "market_trends", task.user_id)
+                tasks_needing_intervention.append({
+                    "task_id": task.id,
+                    "task_type": "market_trends",
+                    "user_id": task.user_id,
+                    "website_url": task.website_url,
+                    "failure_pattern": {
+                        "consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
+                        "recent_failures": pattern.recent_failures if pattern else 0,
+                        "failure_reason": pattern.failure_reason.value if pattern else "unknown",
+                        "last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
+                        "error_patterns": pattern.error_patterns if pattern else [],
+                    },
+                    "failure_reason": task.failure_reason,
+                    "last_failure": task.last_failure.isoformat() if task.last_failure else None
+                })
+            
+            # Check advertools tasks ("failed" tasks are included alongside "needs_intervention")
+            from models.website_analysis_monitoring_models import AdvertoolsTask
+            advertools_tasks = self.db.query(AdvertoolsTask).filter(
+                AdvertoolsTask.status.in_(["needs_intervention", "failed"])
+            )
+            if user_id:
+                advertools_tasks = advertools_tasks.filter(AdvertoolsTask.user_id == user_id)
+            
+            for task in advertools_tasks.all():
+                pattern = self.analyze_task_failures(task.id, "advertools", task.user_id)
+                tasks_needing_intervention.append({
+                    "task_id": task.id,
+                    "task_type": "advertools",
+                    "user_id": task.user_id,
+                    "website_url": task.website_url,
+                    "failure_pattern": {
+                        "consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
+                        "recent_failures": pattern.recent_failures if pattern else 0,
+                        "failure_reason": pattern.failure_reason.value if pattern else "unknown",
+                        "last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
+                        "error_patterns": pattern.error_patterns if pattern else [],
+                    },
+                    "failure_reason": task.failure_reason,
+                    "last_failure": task.last_failure.isoformat() if task.last_failure else None
+                })
+            
            return tasks_needing_intervention
            
        except Exception as e:
--- a/backend/services/scheduler/executors/advertools_executor.py
+++ b/backend/services/scheduler/executors/advertools_executor.py
@@ -1,6 +1,7 @@
 import asyncio
 from datetime import datetime, timedelta
 from typing import Any, Dict, List
+from urllib.parse import urlparse
 from loguru import logger
 from sqlalchemy.orm import Session
 from sqlalchemy import text
@@ -63,27 +64,66 @@ class AdvertoolsExecutor:

            result = {}
            if task_type == 'content_audit':
-                # Phase 1: Audit content themes using sample URLs from sitemap
-                # First, get the sitemap to find recent URLs
+                # Phase 1: Get sitemap analysis (freshness, URL structure, pillars)
                sitemap_result = await self.advertools_service.analyze_sitemap(effective_url)
                
                audit_urls = []
+                url_structure = {}
+                freshness = {}
                if sitemap_result.get('success'):
-                    # Use the sample URLs returned by the service
-                    audit_urls = sitemap_result.get('metrics', {}).get('audit_sample_urls', [])
+                    metrics = sitemap_result.get('metrics', {})
+                    audit_urls = metrics.get('audit_sample_urls', [])
+                    url_structure = metrics.get('url_structure', {})
+                    freshness = {
+                        "freshness_score": metrics.get('freshness_score'),
+                        "publishing_velocity": metrics.get('publishing_velocity'),
+                        "stale_content_percentage": metrics.get('stale_content_percentage'),
+                        "publishing_recency": metrics.get('publishing_recency'),
+                        "publishing_trend": metrics.get('publishing_trend'),
+                    }
                
                if not audit_urls:
-                    # Fallback to homepage if sitemap fails or empty
                    audit_urls = [website_url]
                
-                # Run the audit on the sample
-                result = await self.advertools_service.audit_content(audit_urls)
+                # Phase 2: Theme analysis via content audit
+                audit_result = await self.advertools_service.audit_content(audit_urls)
+                
+                # Phase 3: Site structure analysis (links, redirects, image SEO)
+                site_domain = urlparse(website_url).netloc or website_url
+                structure_result = await self.advertools_service.analyze_site_structure(
+                    audit_urls, site_domain=site_domain
+                )
+                
+                # Phase 4: Robots.txt compliance analysis
+                robots_result = await self.advertools_service.analyze_robots_txt(website_url)
+                
+                # Phase 5: Crawl budget analysis
+                budget_result = await self.advertools_service.analyze_crawl_budget(
+                    effective_url, site_domain
+                )
+                
+                # Merge results
+                result = {
+                    "success": audit_result.get('success', False) or structure_result.get('success', False),
+                    "themes": audit_result.get('themes', []),
+                    "page_count": audit_result.get('page_count', 0),
+                    "avg_word_count": audit_result.get('avg_word_count', 0),
+                    "link_health": structure_result.get('link_health', {}),
+                    "redirect_audit": structure_result.get('redirect_audit', {}),
+                    "image_seo": structure_result.get('image_seo', {}),
+                    "page_status": structure_result.get('page_status', {}),
+                    "url_structure": url_structure,
+                    "freshness": freshness,
+                    "robots_txt": robots_result,
+                    "crawl_budget": budget_result,
+                    "timestamp": datetime.utcnow().isoformat()
+                }
                
                if result.get('success'):
                    await self._update_persona_augmentation(user_id, website_url, result, db)
                    
            elif task_type == 'site_health':
-                # Phase 1: Check site health (freshness, velocity)
+                # Site health: freshness, velocity, URL structure
                result = await self.advertools_service.analyze_sitemap(effective_url)
                
                if result.get('success'):
@@ -157,7 +197,8 @@ class AdvertoolsExecutor:

    async def _update_persona_augmentation(self, user_id: str, website_url: str, audit_result: Dict[str, Any], db: Session):
        """
-        Updates the user's Brand Persona with discovered themes from the content audit.
+        Updates the user's Brand Persona with content-audit results: themes, link health,
+        redirects, image SEO, page status, URL structure, freshness, robots.txt, crawl budget.
        """
        try:
            session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
@@ -170,18 +211,40 @@ class AdvertoolsExecutor:
                self.logger.warning(f"No website analysis found for user {user_id}")
                return

-            # Update brand_analysis with augmented themes
            current_brand = analysis.brand_analysis or {}
            
-            # Add or update the 'augmented_themes' field
+            # Core themes
            current_brand['augmented_themes'] = audit_result.get('themes', [])
+            
+            # Link health
+            current_brand['link_health'] = audit_result.get('link_health', {})
+            
+            # Redirect audit
+            current_brand['redirect_audit'] = audit_result.get('redirect_audit', {})
+            
+            # Image SEO
+            current_brand['image_seo'] = audit_result.get('image_seo', {})
+            
+            # Page status distribution
+            current_brand['page_status'] = audit_result.get('page_status', {})
+            
+            # URL structure analysis
+            current_brand['url_structure'] = audit_result.get('url_structure', {})
+            
+            # Freshness
+            current_brand['freshness'] = audit_result.get('freshness', {})
+            
+            # Robots.txt compliance
+            current_brand['robots_txt'] = audit_result.get('robots_txt', {})
+            
+            # Crawl budget analysis
+            current_brand['crawl_budget'] = audit_result.get('crawl_budget', {})
+            
            current_brand['last_advertools_audit'] = datetime.utcnow().isoformat()
            
-            # Force SQLAlchemy to detect change in JSON field
            from sqlalchemy.orm.attributes import flag_modified
            flag_modified(analysis, "brand_analysis")
            
-            # Also update content_strategy_insights if relevant
            if 'avg_word_count' in audit_result:
                current_strategy = analysis.content_strategy_insights or {}
                current_strategy['avg_content_length'] = audit_result['avg_word_count']
@@ -196,7 +259,8 @@ class AdvertoolsExecutor:

    async def _update_site_health_metrics(self, user_id: str, website_url: str, health_result: Dict[str, Any], db: Session):
        """
-        Updates the WebsiteAnalysis with site health metrics (velocity, freshness).
+        Updates the WebsiteAnalysis with site health metrics (publishing velocity, stale
+        content, freshness score, publishing recency and trend, top pillars, URL structure).
        """
        try:
            session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
@@ -207,7 +271,6 @@ class AdvertoolsExecutor:
            if not analysis:
                return

-            # Update seo_audit with health metrics
            current_seo = analysis.seo_audit or {}
            metrics = health_result.get('metrics', {})
            
@@ -216,7 +279,11 @@ class AdvertoolsExecutor:
                "publishing_velocity": metrics.get('publishing_velocity'),
                "stale_content_count": metrics.get('stale_content_count'),
                "stale_content_percentage": metrics.get('stale_content_percentage'),
-                "top_pillars": metrics.get('top_pillars')
+                "freshness_score": metrics.get('freshness_score'),
+                "publishing_recency": metrics.get('publishing_recency'),
+                "publishing_trend": metrics.get('publishing_trend'),
+                "top_pillars": metrics.get('top_pillars'),
+                "url_structure": metrics.get('url_structure', {})
            }
            current_seo['last_advertools_health_check'] = datetime.utcnow().isoformat()
            
--- a/backend/services/seo/advertools_service.py
+++ b/backend/services/seo/advertools_service.py
@@ -1,12 +1,18 @@
 import advertools as adv
 import pandas as pd
 import asyncio
-from typing import Dict, Any, List, Optional
+from typing import Dict, Any, List, Optional, Tuple
 from datetime import datetime, timedelta
 from loguru import logger
 import json
 import os
 import tempfile
+from urllib.parse import urlparse
+from collections import Counter
+import urllib.request
+import urllib.error
+import socket
+import re

 class AdvertoolsService:
    """
@@ -19,51 +25,58 @@ class AdvertoolsService:

    async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
        """
-        Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.
+        Analyzes a website's sitemap to extract metrics on publishing velocity, freshness,
+        URL structure patterns, and topic distribution.
        """
        try:
            self.logger.info(f"Analyzing sitemap: {sitemap_url}")
            
-            # advertools sitemap_to_df is blocking, run in executor
            loop = asyncio.get_event_loop()
            df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
            
            if df is None or df.empty:
                return {"success": False, "error": "Sitemap is empty or could not be parsed."}

-            # Convert lastmod to datetime
            if 'lastmod' in df.columns:
                df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
                
            total_urls = len(df)
            
-            # Handle potential empty datetime columns
-            if 'lastmod' in df.columns and not df['lastmod'].isna().all():
-                now = datetime.now(df['lastmod'].dt.tz)
-                thirty_days_ago = now - timedelta(days=30)
-                recent_urls = df[df['lastmod'] > thirty_days_ago]
-                six_months_ago = now - timedelta(days=180)
-                stale_urls = df[df['lastmod'] < six_months_ago]
-                
-                publishing_velocity = len(recent_urls) / 4.0 # URLs per week
-                stale_count = len(stale_urls)
-            else:
-                publishing_velocity = 0
-                stale_count = 0
+            # --- Content Freshness Scoring ---
+            freshness = self._compute_freshness(df)
            
-            # Enhanced Content Pillars (Top folder patterns - 3 levels deep)
-            def extract_hierarchy(url: str):
-                try:
-                    parts = urlparse(url).path.strip('/').split('/')
-                    if not parts or not parts[0]: return "home"
-                    return "/".join(parts[:2]) # Capture top 2 segments
-                except:
-                    return "other"
+            # --- URL Structure Analysis ---
+            url_structure = {}
+            if 'loc' in df.columns:
+                url_structure = await self._analyze_url_structure(df['loc'].tolist())
+            
+            # --- Content Pillars via url_to_df ---
+            pillars = {}
+            url_df = None
+            try:
+                url_df = adv.url_to_df(df['loc'])
+                if url_df is not None and not url_df.empty:
+                    dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
+                    if dir_cols:
+                        pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
+                        for col in dir_cols[1:3]:
+                            mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan')
+                            pillar_series = pillar_series + "/" + url_df[col].where(mask, "")
+                        pillars = pillar_series.value_counts().head(15).to_dict()
+            except Exception:
+                fallback_pillars = {}
+                if 'loc' in df.columns:
+                    def extract_hierarchy(url: str):
+                        try:
+                            parts = urlparse(url).path.strip('/').split('/')
+                            if not parts or not parts[0]: return "home"
+                            return "/".join(parts[:2])
+                        except:
+                            return "other"
+                    fallback_pillars = df['loc'].apply(extract_hierarchy).value_counts().head(15).to_dict()
+                pillars = fallback_pillars

-            df['pillar'] = df['loc'].apply(extract_hierarchy)
-            pillars = df['pillar'].value_counts().head(15).to_dict()
-
-            # Return a sample of URLs for auditing (top 15 most recent if available)
+            # Sample URLs for auditing (top 15 most recent when lastmod is available)
            audit_urls = []
            if 'lastmod' in df.columns and not df['lastmod'].isna().all():
                audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
@@ -74,10 +87,14 @@ class AdvertoolsService:
                "success": True,
                "metrics": {
                    "total_urls": total_urls,
-                    "publishing_velocity": round(publishing_velocity, 2),
-                    "stale_content_count": stale_count,
-                    "stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
+                    "publishing_velocity": freshness.get("publishing_velocity"),
+                    "stale_content_count": freshness.get("stale_count"),
+                    "stale_content_percentage": freshness.get("stale_percentage"),
+                    "freshness_score": freshness.get("freshness_score"),
+                    "publishing_recency": freshness.get("publishing_recency"),
+                    "publishing_trend": freshness.get("publishing_trend"),
                    "top_pillars": pillars,
+                    "url_structure": url_structure,
                    "audit_sample_urls": audit_urls
                },
                "timestamp": datetime.utcnow().isoformat()
@@ -86,6 +103,146 @@ class AdvertoolsService:
            self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
            return {"success": False, "error": str(e)}

+    def _compute_freshness(self, df: pd.DataFrame) -> Dict[str, Any]:
+        """Summarise sitemap freshness from the ``lastmod`` column.
+
+        Args:
+            df: Sitemap frame. ``lastmod`` must be a datetime column because
+                the ``.dt`` accessor is used to read its timezone.
+
+        Returns:
+            Dict with ``publishing_velocity`` (URLs per week over the last 90
+            days), ``stale_count`` / ``stale_percentage`` (older than 180
+            days, relative to all rows), ``freshness_score`` (0-100),
+            ``publishing_recency`` (counts per time window) and
+            ``publishing_trend``. Zeroed defaults with trend ``"unknown"`` are
+            returned when no usable ``lastmod`` values exist.
+        """
+        empty_metrics = {
+            "publishing_velocity": 0,
+            "stale_count": 0,
+            "stale_percentage": 0,
+            "freshness_score": 0,
+            "publishing_recency": {},
+            "publishing_trend": "unknown"
+        }
+
+        no_dates = 'lastmod' not in df.columns or df['lastmod'].isna().all()
+        if no_dates or df['lastmod'].dropna().empty:
+            return empty_metrics
+
+        stamps = df['lastmod']
+        # Use the column's own timezone so comparisons never mix naive/aware values.
+        now = datetime.now(stamps.dropna().dt.tz)
+        cutoff_30 = now - timedelta(days=30)
+
+        def modified_within(days: int) -> int:
+            """Count rows whose lastmod is newer than ``days`` days ago."""
+            return int((stamps > (now - timedelta(days=days))).sum())
+
+        total_urls = len(df)
+        last_30 = modified_within(30)
+        last_90 = modified_within(90)
+        stale_count = int((stamps < (now - timedelta(days=180))).sum())
+
+        if total_urls > 0:
+            stale_percentage = round((stale_count / total_urls) * 100, 2)
+        else:
+            stale_percentage = 0
+
+        # 90 days is treated as 13 weeks.
+        publishing_velocity = round(last_90 / 13.0, 2) if last_90 else 0
+
+        # Weighted blend: 50% non-stale share, 30% recent share, 20% velocity
+        # (velocity saturates at 10 URLs per week).
+        fresh_share = 1.0 - (stale_percentage / 100.0)
+        recent_share = last_30 / max(total_urls, 1)
+        velocity_component = min(publishing_velocity / 10.0, 1.0)
+        freshness_score = round((fresh_share * 50 + recent_share * 30 + velocity_component * 20), 1)
+
+        # Trend compares the latest 30 days with the 30 days before them,
+        # using a 10% dead band around "stable".
+        previous_30 = int(((stamps <= cutoff_30) & (stamps > (now - timedelta(days=60)))).sum())
+        if last_30 > previous_30 * 1.1:
+            publishing_trend = "increasing"
+        elif last_30 < previous_30 * 0.9:
+            publishing_trend = "decreasing"
+        else:
+            publishing_trend = "stable"
+
+        return {
+            "publishing_velocity": publishing_velocity,
+            "stale_count": stale_count,
+            "stale_percentage": stale_percentage,
+            "freshness_score": freshness_score,
+            "publishing_recency": {
+                "last_24h": modified_within(1),
+                "last_7d": modified_within(7),
+                "last_30d": last_30,
+                "last_90d": last_90,
+            },
+            "publishing_trend": publishing_trend
+        }
+
+    async def _analyze_url_structure(self, urls: List[str]) -> Dict[str, Any]:
+        """Profile URL shape: query-parameter usage, directory depth, schemes and hosts.
+
+        The URLs are split with ``adv.url_to_df`` in the default executor so
+        the event loop is not blocked.
+
+        Args:
+            urls: URLs to analyse.
+
+        Returns:
+            Dict with ``total_urls_analyzed``, ``parameter_usage``,
+            ``directory_depth``, ``protocols`` and ``subdomains``. An empty
+            dict is returned when nothing could be parsed or on any failure
+            (the failure is logged as a warning).
+        """
+        from collections import Counter
+
+        try:
+            loop = asyncio.get_event_loop()
+            url_df = await loop.run_in_executor(None, lambda: adv.url_to_df(urls))
+
+            if url_df is None or url_df.empty:
+                return {}
+
+            total = len(url_df)
+
+            # Query-parameter usage. The column is guarded like scheme/netloc
+            # below so a missing column degrades to "no parameters".
+            if 'query' in url_df.columns:
+                has_query = url_df['query'].notna() & (url_df['query'] != '')
+            else:
+                has_query = pd.Series(False, index=url_df.index)
+            param_count = int(has_query.sum())
+            param_percentage = round((param_count / total) * 100, 2) if total > 0 else 0
+
+            # Parameter names are tallied over distinct query strings; empty
+            # names produced by "&&" or a trailing "&" are ignored.
+            param_frequency = {}
+            if param_count > 0:
+                param_names = [
+                    name
+                    for query in url_df.loc[has_query, 'query'].dropna().unique()
+                    for name in (pair.partition('=')[0] for pair in str(query).split('&'))
+                    if name
+                ]
+                param_frequency = dict(Counter(param_names).most_common(10))
+
+            # Directory depth = number of leading non-empty dir_* columns.
+            dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
+
+            def count_depth(row):
+                for i, col in enumerate(dir_cols):
+                    val = row[col]
+                    if pd.isna(val) or str(val) == 'nan' or str(val).strip() == '':
+                        return i
+                return len(dir_cols)
+
+            depths = url_df.apply(count_depth, axis=1)
+            avg_depth = round(float(depths.mean()), 1) if not depths.empty else 0
+            max_depth = int(depths.max()) if not depths.empty else 0
+            depth_distribution = depths.value_counts().sort_index().head(10).to_dict()
+            depth_distribution = {str(k): int(v) for k, v in depth_distribution.items()}
+
+            # Protocol consistency
+            schemes = url_df['scheme'].value_counts().to_dict() if 'scheme' in url_df.columns else {}
+
+            # Host analysis. value_counts() has one row per distinct host, so
+            # its length is the host count (nunique() on it would instead
+            # count distinct frequencies).
+            if 'netloc' in url_df.columns:
+                netloc_counts = url_df['netloc'].value_counts()
+            else:
+                netloc_counts = pd.Series(dtype='int64')
+            unique_subdomains = int(len(netloc_counts))
+            primary_domain = netloc_counts.index[0] if not netloc_counts.empty else ""
+
+            return {
+                "total_urls_analyzed": total,
+                "parameter_usage": {
+                    "urls_with_params": param_count,
+                    "percentage_with_params": param_percentage,
+                    "top_parameters": param_frequency
+                },
+                "directory_depth": {
+                    "average_depth": avg_depth,
+                    "max_depth": max_depth,
+                    "distribution": depth_distribution
+                },
+                "protocols": {str(k): int(v) for k, v in schemes.items()},
+                "subdomains": {
+                    "primary": primary_domain,
+                    "unique_count": unique_subdomains
+                }
+            }
+        except Exception as e:
+            self.logger.warning(f"URL structure analysis failed: {e}")
+            return {}
+
    async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Performs a shallow crawl and theme analysis using word frequency.
@@ -153,6 +310,512 @@ class AdvertoolsService:
                except Exception as e:
                    self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")

+    async def analyze_site_structure(self, url_list: List[str], site_domain: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Crawl the given pages (following links) and summarise structural SEO health.
+
+        The crawl is capped by the settings below (50 pages, depth 3) and is
+        written to a temporary JSON-lines file that is removed before returning.
+
+        Args:
+            url_list: Seed URLs for the crawl.
+            site_domain: Optional domain used to restrict the crawl
+                (``allowed_domains``) and to classify links as internal.
+
+        Returns:
+            On success, a dict with ``success``, ``page_count``,
+            ``link_health``, ``redirect_audit``, ``image_seo``, ``timestamp``
+            and, when the crawl output has the matching columns,
+            ``page_status``, ``missing_titles`` and ``missing_descriptions``.
+            Each crawlytics section degrades to an ``{"error": ...}`` entry
+            instead of failing the whole analysis. On failure,
+            ``{"success": False, "error": <message>}``.
+        """
+        import re
+        from collections import Counter
+
+        temp_file = None
+        try:
+            self.logger.info(f"Analyzing site structure for {len(url_list)} URLs, domain={site_domain}")
+
+            # Only the file name is needed; the crawler writes the content.
+            with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
+                temp_file = tf.name
+
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(None, lambda: adv.crawl(
+                url_list=url_list,
+                output_file=temp_file,
+                follow_links=True,
+                allowed_domains=[site_domain] if site_domain else None,
+                custom_settings={
+                    'LOG_LEVEL': 'WARNING',
+                    'CLOSESPIDER_PAGECOUNT': 50,
+                    'DOWNLOAD_TIMEOUT': 30,
+                    'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
+                    'DEPTH_LIMIT': 3,
+                }
+            ))
+
+            if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
+                return {"success": False, "error": "Site structure crawl produced no output."}
+
+            crawl_df = pd.read_json(temp_file, lines=True)
+            page_count = len(crawl_df)
+            result = {"success": True, "page_count": page_count}
+
+            # --- Link Health via crawlytics ---
+            try:
+                # The domain is matched as a regex, so escape it: a raw "."
+                # would match any character and misclassify look-alike hosts.
+                internal_regex = re.escape(site_domain) if site_domain else None
+                link_df = adv.crawlytics.links(crawl_df, internal_url_regex=internal_regex)
+                if link_df is not None and not link_df.empty:
+                    total_links = len(link_df)
+                    internal_links = int(link_df['internal'].sum()) if 'internal' in link_df.columns else 0
+                    external_links = total_links - internal_links
+                    nofollow_links = int(link_df['nofollow'].sum()) if 'nofollow' in link_df.columns else 0
+
+                    # Count links per page.
+                    # NOTE(review): assumes link_df keeps the crawl row index
+                    # (one index value per source page) — confirm.
+                    links_per_page = link_df.groupby(level=0).size()
+                    avg_links_per_page = round(links_per_page.mean(), 1) if not links_per_page.empty else 0
+
+                    # Most common anchor words (internal links only, words longer than 2 chars)
+                    anchor_texts = []
+                    if 'text' in link_df.columns and 'internal' in link_df.columns:
+                        internal_anchors = link_df.loc[link_df['internal'].eq(True), 'text'].dropna()
+                        for t in internal_anchors:
+                            if isinstance(t, str) and t.strip():
+                                anchor_texts.extend([w.strip() for w in t.split() if len(w.strip()) > 2])
+                    top_anchors = dict(Counter(anchor_texts).most_common(15)) if anchor_texts else {}
+
+                    result["link_health"] = {
+                        "total_links_found": total_links,
+                        "internal_link_count": internal_links,
+                        "external_link_count": external_links,
+                        "internal_link_percentage": round((internal_links / total_links) * 100, 1) if total_links > 0 else 0,
+                        "nofollow_link_count": nofollow_links,
+                        "avg_links_per_page": avg_links_per_page,
+                        "top_anchor_words": top_anchors
+                    }
+                else:
+                    result["link_health"] = {"error": "No links found in crawl data"}
+            except Exception as e:
+                self.logger.warning(f"Link analysis failed: {e}")
+                result["link_health"] = {"error": str(e)}
+
+            # --- Redirect Chain Audit via crawlytics ---
+            try:
+                redirect_df = adv.crawlytics.redirects(crawl_df)
+                if redirect_df is not None and not redirect_df.empty:
+                    has_times = 'redirect_times' in redirect_df.columns
+                    total_redirects = len(redirect_df)
+                    # NOTE(review): this counts distinct redirect_times values,
+                    # not distinct chains, and affected_pages holds index
+                    # labels rather than URLs — confirm intended semantics.
+                    redirect_chains = redirect_df['redirect_times'].nunique() if has_times else 0
+                    redirect_statuses = redirect_df['status'].value_counts().to_dict() if 'status' in redirect_df.columns else {}
+                    multi_hop = redirect_df[redirect_df['redirect_times'] > 1] if has_times else pd.DataFrame()
+
+                    result["redirect_audit"] = {
+                        "total_redirects": int(total_redirects),
+                        "unique_chains": int(redirect_chains),
+                        "status_distribution": {str(k): int(v) for k, v in redirect_statuses.items()},
+                        "multi_hop_chains": int(len(multi_hop)),
+                        "affected_pages": multi_hop.index.unique().tolist() if not multi_hop.empty else []
+                    }
+                else:
+                    result["redirect_audit"] = {"total_redirects": 0, "note": "No redirects detected"}
+            except Exception as e:
+                self.logger.warning(f"Redirect analysis failed: {e}")
+                result["redirect_audit"] = {"error": str(e)}
+
+            # --- Image SEO overview via crawlytics ---
+            # Always emits an image_seo entry, mirroring the two sections above.
+            try:
+                img_df = adv.crawlytics.images(crawl_df)
+                if img_df is not None and not img_df.empty:
+                    total_images = len(img_df)
+                    missing_alt = int(img_df['img_alt'].isna().sum()) if 'img_alt' in img_df.columns else 0
+                    alt_coverage = round(((total_images - missing_alt) / total_images) * 100, 1) if total_images > 0 else 0
+                    result["image_seo"] = {
+                        "total_images": total_images,
+                        "missing_alt_count": missing_alt,
+                        "alt_coverage_percentage": alt_coverage
+                    }
+                else:
+                    result["image_seo"] = {"total_images": 0, "note": "No images detected"}
+            except Exception as e:
+                self.logger.warning(f"Image analysis failed: {e}")
+                result["image_seo"] = {"error": str(e)}
+
+            # --- Page-level metrics ---
+            if 'status' in crawl_df.columns:
+                status_dist = crawl_df['status'].value_counts().to_dict()
+                result["page_status"] = {str(k): int(v) for k, v in status_dist.items()}
+            if 'title' in crawl_df.columns:
+                result["missing_titles"] = int(crawl_df['title'].isna().sum())
+            if 'meta_desc' in crawl_df.columns:
+                result["missing_descriptions"] = int(crawl_df['meta_desc'].isna().sum())
+
+            result["timestamp"] = datetime.utcnow().isoformat()
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Failed to analyze site structure: {str(e)}")
+            return {"success": False, "error": str(e)}
+        finally:
+            if temp_file and os.path.exists(temp_file):
+                try:
+                    os.remove(temp_file)
+                except Exception as e:
+                    self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
+
+    async def analyze_robots_txt(self, website_url: str) -> Dict[str, Any]:
+        """
+        Fetch and analyze robots.txt for compliance issues.
+
+        ``adv.robotstxt_to_df`` is tried first; if it fails or returns nothing,
+        ``_parse_robots_txt_manual`` is used instead. Both frame layouts are
+        normalised by ``_normalize_robots_rows`` before analysis.
+
+        Args:
+            website_url: Any URL on the site; only scheme and host are used.
+
+        Returns:
+            Dict with ``success``, ``url``, ``accessible``, ``total_directives``,
+            ``user_agents_found``, ``has_sitemap_directive``, ``sitemap_urls``,
+            ``has_crawl_delay``, ``disallow_rules``, ``issues`` and
+            ``compliance_score`` (100 minus 30/15/5 per critical/warning/info
+            issue, floored at 0). On an unexpected error,
+            ``{"success": False, "error": ..., "url": ...}``.
+        """
+        # Bound before the try so the error path can always report a URL.
+        robots_url = website_url
+        try:
+            self.logger.info(f"Analyzing robots.txt for {website_url}")
+            parsed = urlparse(website_url)
+            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+            result = {
+                "success": True,
+                "url": robots_url,
+                "accessible": True,
+                "total_directives": 0,
+                "user_agents_found": [],
+                "has_sitemap_directive": False,
+                "sitemap_urls": [],
+                "has_crawl_delay": False,
+                "disallow_rules": [],
+                "issues": [],
+                "compliance_score": 100,
+            }
+            loop = asyncio.get_event_loop()
+            try:
+                robots_df = await loop.run_in_executor(
+                    None, lambda: adv.robotstxt_to_df(robots_url)
+                )
+                if robots_df is None or robots_df.empty:
+                    raise ValueError("Empty result from robotstxt_to_df")
+            except Exception as adv_err:
+                self.logger.warning(f"adv.robotstxt_to_df failed, using manual fallback: {adv_err}")
+                robots_df = await loop.run_in_executor(
+                    None, lambda: self._parse_robots_txt_manual(robots_url)
+                )
+            if robots_df is None or robots_df.empty:
+                result["success"] = False
+                result["error"] = "Could not fetch or parse robots.txt"
+                result["accessible"] = False
+                return result
+
+            result["total_directives"] = len(robots_df)
+
+            rows = self._normalize_robots_rows(robots_df)
+
+            if 'user_agent' in robots_df.columns:
+                result["user_agents_found"] = robots_df['user_agent'].dropna().unique().tolist()
+            else:
+                # Without a user_agent column the agents are rows of their own.
+                result["user_agents_found"] = list(dict.fromkeys(
+                    r["user_agent"] for r in rows if r["rule"] == 'user-agent'
+                ))
+
+            if rows:
+                rules_present = {r["rule"] for r in rows}
+                result["has_sitemap_directive"] = 'sitemap' in rules_present
+                result["has_crawl_delay"] = 'crawl-delay' in rules_present
+
+                disallows = [r for r in rows if r["rule"] == 'disallow']
+                result["disallow_rules"] = [
+                    {"user_agent": r["user_agent"], "path": r["value"]}
+                    for r in disallows if r["value"]
+                ]
+                result["sitemap_urls"] = list(dict.fromkeys(
+                    r["value"] for r in rows if r["rule"] == 'sitemap' and r["value"]
+                ))
+
+                # Only a wildcard-agent "Disallow: /" blocks everyone; the same
+                # rule for one named bot is not a site-wide block.
+                if any(r["user_agent"] == '*' and r["value"] == '/' for r in disallows):
+                    result["issues"].append({
+                        "severity": "critical", "code": "DISALLOW_ALL",
+                        "detail": "robots.txt disallows all user agents from all paths (Disallow: /)"
+                    })
+
+            if not result["has_sitemap_directive"]:
+                result["issues"].append({
+                    "severity": "warning", "code": "NO_SITEMAP",
+                    "detail": "No Sitemap directive found — search engines may miss pages"
+                })
+            if not result["has_crawl_delay"]:
+                result["issues"].append({
+                    "severity": "info", "code": "NO_CRAWL_DELAY",
+                    "detail": "No Crawl-delay directive set — not critical for most sites"
+                })
+
+            penalties = {"critical": 30, "warning": 15, "info": 5}
+            for issue in result["issues"]:
+                result["compliance_score"] -= penalties.get(issue["severity"], 0)
+            result["compliance_score"] = max(result["compliance_score"], 0)
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Robots.txt analysis failed: {e}")
+            return {"success": False, "error": str(e), "url": robots_url}
+
+    @staticmethod
+    def _normalize_robots_rows(robots_df: pd.DataFrame) -> List[Dict[str, str]]:
+        """Flatten a robots.txt frame into ``{user_agent, rule, value}`` records.
+
+        Accepts the manual parser layout (``user_agent``/``rule``/``value``)
+        and a ``directive``/``content`` layout where User-agent lines are rows.
+        Rule names are lower-cased, values are stripped and NaN becomes ``""``.
+        Returns an empty list when no rule/value column pair is present.
+        """
+        columns = robots_df.columns
+        rule_col = next((c for c in ('rule', 'directive') if c in columns), None)
+        value_col = next((c for c in ('value', 'content', 'directive_value') if c in columns), None)
+        if rule_col is None or value_col is None:
+            return []
+
+        def clean(raw) -> str:
+            return '' if pd.isna(raw) else str(raw).strip()
+
+        has_ua_column = 'user_agent' in columns
+        current_ua = '*'
+        rows = []
+        for record in robots_df.to_dict('records'):
+            rule = clean(record[rule_col]).lower()
+            value = clean(record[value_col])
+            if has_ua_column:
+                agent = clean(record['user_agent']) or '*'
+            else:
+                # Rules belong to the most recent User-agent row, matching
+                # how the manual fallback parser attributes them.
+                if rule == 'user-agent':
+                    current_ua = value or '*'
+                agent = current_ua
+            rows.append({"user_agent": agent, "rule": rule, "value": value})
+        return rows
+
+    def _parse_robots_txt_manual(self, url: str) -> pd.DataFrame:
+        """Fallback: manually fetch and parse robots.txt.
+
+        Args:
+            url: Full URL of the robots.txt file.
+
+        Returns:
+            DataFrame with ``user_agent``, ``rule`` and ``value`` columns, one
+            row per non-User-agent directive, attributed to the most recent
+            ``User-agent`` line (``"*"`` before any is seen). An empty
+            DataFrame is returned when the fetch fails or nothing is parsed;
+            fetch errors are logged as warnings, not raised.
+        """
+        records = []
+        try:
+            req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
+            with urllib.request.urlopen(req, timeout=15) as resp:
+                content = resp.read().decode("utf-8", errors="replace")
+            current_ua = "*"
+            for raw_line in content.splitlines():
+                # "#" starts a comment anywhere on the line; drop it so a
+                # trailing comment never becomes part of a rule value.
+                line = raw_line.split("#", 1)[0].strip()
+                if not line:
+                    continue
+                directive, sep, value = line.partition(":")
+                if line.lower().startswith("user-agent"):
+                    current_ua = value.strip() if sep else "*"
+                    continue
+                if sep:
+                    records.append({
+                        "user_agent": current_ua,
+                        "rule": directive.strip(),
+                        "value": value.strip(),
+                    })
+        except Exception as e:
+            self.logger.warning(f"Manual robots.txt fetch failed: {e}")
+        # An empty record list yields an empty DataFrame.
+        return pd.DataFrame(records)
+
+    async def analyze_crawl_budget(self, sitemap_url: str, site_domain: str) -> Dict[str, Any]:
+        """
+        Analyze crawl budget by comparing sitemap inventory against actual crawl results.
+
+        Runs a small crawl (30 pages, depth 2) from the site root into a
+        temporary JSON-lines file that is removed before returning.
+
+        Args:
+            sitemap_url: Sitemap used to size the URL inventory.
+            site_domain: Bare host (``example.com``) or full URL of the site.
+
+        Returns:
+            On success, a dict with ``sitemap_total_urls``, ``pages_crawled``,
+            ``crawl_coverage_percentage``, ``status_distribution``,
+            ``wasted_crawl_requests`` (non-2xx responses), ``waste_percentage``,
+            ``depth_distribution``, ``urls_with_parameters`` and
+            ``optimization_score``. On failure,
+            ``{"success": False, "error": <message>}``.
+        """
+        temp_file = None
+        try:
+            self.logger.info(f"Analyzing crawl budget for {site_domain}")
+            loop = asyncio.get_event_loop()
+
+            sitemap_df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
+            sitemap_total = len(sitemap_df) if sitemap_df is not None and not sitemap_df.empty else 0
+
+            # site_domain may be a host or a URL. The crawler needs a URL to
+            # start from and a bare host for its allowed-domain filter.
+            if site_domain.startswith("http"):
+                start_url = site_domain
+                crawl_host = urlparse(site_domain).netloc or site_domain
+            else:
+                start_url = f"https://{site_domain}"
+                crawl_host = site_domain
+
+            with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
+                temp_file = tf.name
+
+            await loop.run_in_executor(None, lambda: adv.crawl(
+                url_list=[start_url],
+                output_file=temp_file,
+                follow_links=True,
+                allowed_domains=[crawl_host],
+                custom_settings={
+                    'LOG_LEVEL': 'WARNING',
+                    'CLOSESPIDER_PAGECOUNT': 30,
+                    'DOWNLOAD_TIMEOUT': 15,
+                    'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
+                    'DEPTH_LIMIT': 2,
+                }
+            ))
+
+            if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
+                return {"success": False, "error": "Crawl produced no output"}
+
+            crawl_df = pd.read_json(temp_file, lines=True)
+            crawled_count = len(crawl_df)
+
+            status_dist = {}
+            wasted = 0
+            if 'status' in crawl_df.columns:
+                raw = crawl_df['status'].value_counts().to_dict()
+                status_dist = {str(k): int(v) for k, v in raw.items()}
+                for code, count in raw.items():
+                    # Codes may arrive as floats (e.g. 200.0) when the column
+                    # contains missing values, so go through float first.
+                    try:
+                        numeric = int(float(code))
+                    except (TypeError, ValueError):
+                        continue
+                    if not 200 <= numeric < 300:
+                        wasted += int(count)
+
+            budget_usage_ratio = round(crawled_count / max(sitemap_total, 1), 3)
+            waste_ratio = round(wasted / max(crawled_count, 1), 3)
+
+            depth_dist = {}
+            if 'depth' in crawl_df.columns:
+                raw = crawl_df['depth'].value_counts().sort_index().to_dict()
+                depth_dist = {str(k): int(v) for k, v in raw.items()}
+
+            param_count = 0
+            url_col = 'url' if 'url' in crawl_df.columns else 'response_url' if 'response_url' in crawl_df.columns else None
+            if url_col:
+                # Literal match: as a regex, a bare "?" is invalid and raises.
+                param_count = int(crawl_df[url_col].astype(str).str.contains('?', regex=False).sum())
+
+            # NOTE(review): higher sitemap coverage lowers this score — confirm
+            # that penalising coverage is intended.
+            optimization_score = max(0, round(100 - (waste_ratio * 100) - (budget_usage_ratio * 20), 1))
+
+            return {
+                "success": True,
+                "sitemap_total_urls": sitemap_total,
+                "pages_crawled": crawled_count,
+                "crawl_coverage_percentage": round(budget_usage_ratio * 100, 1),
+                "status_distribution": status_dist,
+                "wasted_crawl_requests": int(wasted),
+                "waste_percentage": round(waste_ratio * 100, 1),
+                "depth_distribution": depth_dist,
+                "urls_with_parameters": int(param_count),
+                "optimization_score": optimization_score,
+            }
+
+        except Exception as e:
+            self.logger.error(f"Crawl budget analysis failed: {e}")
+            return {"success": False, "error": str(e)}
+        finally:
+            if temp_file and os.path.exists(temp_file):
+                try:
+                    os.remove(temp_file)
+                except Exception as e:
+                    self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
+
+    async def sitemap_compare(self, sitemap_a: str, sitemap_b: str) -> Dict[str, Any]:
+        """
+        Compare two sitemaps for competitive content gap analysis.
+
+        A "pillar" is the first (up to) three path segments of a URL, or
+        ``"home"`` for the site root. Pillars are counted per sitemap and the
+        two key sets are compared.
+
+        Args:
+            sitemap_a: URL of the first sitemap.
+            sitemap_b: URL of the second sitemap.
+
+        Returns:
+            Dict with per-sitemap totals, ``url_count_diff``, ``ratio``,
+            ``pillars_a`` / ``pillars_b`` (top 20 each), ``shared_pillars``,
+            ``unique_to_a`` / ``unique_to_b``, ``freshness_comparison`` and
+            ``overlap_score`` (shared keys over all keys, as a percentage).
+            If either sitemap is empty only the totals are filled in.
+            On failure, ``{"success": False, "error": <message>}``.
+        """
+        from collections import Counter
+
+        try:
+            self.logger.info(f"Comparing sitemaps: {sitemap_a} vs {sitemap_b}")
+            loop = asyncio.get_event_loop()
+
+            df_a = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_a))
+            df_b = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_b))
+
+            total_a = len(df_a) if df_a is not None and not df_a.empty else 0
+            total_b = len(df_b) if df_b is not None and not df_b.empty else 0
+            result = {
+                "success": True,
+                "sitemap_a": {"url": sitemap_a, "total_urls": total_a},
+                "sitemap_b": {"url": sitemap_b, "total_urls": total_b},
+                "url_count_diff": total_a - total_b,
+                "ratio": round(total_a / max(total_b, 1), 2),
+                "pillars_a": {},
+                "pillars_b": {},
+                "shared_pillars": [],
+                "unique_to_a": [],
+                "unique_to_b": [],
+                "freshness_comparison": {},
+                "overlap_score": 0,
+            }
+
+            if total_a == 0 or total_b == 0:
+                return result
+
+            def extract_pillars(df: pd.DataFrame, label: str):
+                """Return ``(pillar -> count, pillar keys)`` for one sitemap."""
+                if 'loc' not in df.columns:
+                    return {}, []
+                locs = df['loc'].dropna()
+                if locs.empty:
+                    return {}, []
+
+                pillars = {}
+                try:
+                    url_df = adv.url_to_df(locs)
+                    if url_df is not None and not url_df.empty:
+                        dir_cols = [c for c in url_df.columns if c.startswith('dir_')][:3]
+                        if dir_cols:
+                            def join_dirs(row) -> str:
+                                # Join only the segments that exist so a
+                                # shallow URL never yields keys like "blog//".
+                                segments = [
+                                    str(v) for v in row
+                                    if pd.notna(v) and str(v).strip() not in ('', 'nan')
+                                ]
+                                return "/".join(segments) if segments else "home"
+
+                            pillar_series = url_df[dir_cols].apply(join_dirs, axis=1)
+                            pillars = pillar_series.value_counts().head(20).to_dict()
+                except Exception as exc:
+                    self.logger.warning(f"Pillar extraction via url_to_df failed for sitemap {label}: {exc}")
+
+                if not pillars:
+                    # Fallback keys use the same three-segment depth so both
+                    # sitemaps stay comparable whichever path produced them.
+                    seen = Counter()
+                    for url in locs:
+                        segments = [p for p in urlparse(str(url)).path.split('/') if p][:3]
+                        seen["/".join(segments) if segments else "home"] += 1
+                    pillars = dict(seen.most_common(20))
+
+                return pillars, list(pillars.keys())
+
+            # url_to_df is blocking work; keep it off the event loop.
+            pillars_a, keys_a = await loop.run_in_executor(None, extract_pillars, df_a, "a")
+            pillars_b, keys_b = await loop.run_in_executor(None, extract_pillars, df_b, "b")
+            result["pillars_a"] = pillars_a
+            result["pillars_b"] = pillars_b
+
+            set_a = set(keys_a)
+            set_b = set(keys_b)
+            shared = set_a & set_b
+            result["shared_pillars"] = sorted(shared)
+            result["unique_to_a"] = sorted(set_a - set_b)
+            result["unique_to_b"] = sorted(set_b - set_a)
+
+            total_keys = max(len(set_a | set_b), 1)
+            result["overlap_score"] = round((len(shared) / total_keys) * 100, 1)
+
+            def compute_freshness_stats(df: pd.DataFrame) -> dict:
+                """Count dated URLs and those modified in the last 30 days."""
+                stats = {"has_lastmod": False, "recent_30d": 0, "total_with_dates": 0}
+                if 'lastmod' in df.columns:
+                    lm = pd.to_datetime(df['lastmod'], errors='coerce', utc=True).dropna()
+                    if not lm.empty:
+                        stats["has_lastmod"] = True
+                        stats["total_with_dates"] = int(len(lm))
+                        stats["recent_30d"] = int((lm > (datetime.now(lm.dt.tz) - timedelta(days=30))).sum())
+                return stats
+
+            result["freshness_comparison"] = {
+                "a": compute_freshness_stats(df_a),
+                "b": compute_freshness_stats(df_b),
+            }
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Sitemap comparison failed: {e}")
+            return {"success": False, "error": str(e)}
+
+    async def compare_crawl_results(self, result_a: Dict[str, Any], result_b: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Diff two crawl-analysis result dicts (``result_b`` minus ``result_a``).
+
+        Args:
+            result_a: Earlier result (baseline).
+            result_b: Later result.
+
+        Returns:
+            Dict with ``page_count_change``, ``status_distribution_changes``
+            (per status code, non-zero deltas only), ``link_health_changes``
+            and ``redirect_changes`` (non-zero deltas of numeric fields, keys
+            prefixed with ``link_`` / ``redirect_``), plus empty
+            ``new_issues`` / ``resolved_issues`` lists. On failure,
+            ``{"success": False, "error": <message>}``.
+        """
+        def numeric_deltas(before: dict, after: dict, prefix: str) -> dict:
+            """Return non-zero ``after - before`` for keys numeric on both sides."""
+            deltas = {}
+            for key in set(before) | set(after):
+                old = before.get(key, 0)
+                new = after.get(key, 0)
+                if not (isinstance(old, (int, float)) and isinstance(new, (int, float))):
+                    continue
+                delta = round(new - old, 2)
+                if delta != 0:
+                    deltas[f"{prefix}_{key}"] = delta
+            return deltas
+
+        try:
+            page_delta = result_b.get("page_count", 0) - result_a.get("page_count", 0)
+
+            status_before = result_a.get("page_status", {})
+            status_after = result_b.get("page_status", {})
+            status_changes = {}
+            for code in sorted(set(status_before) | set(status_after)):
+                delta = status_after.get(code, 0) - status_before.get(code, 0)
+                if delta != 0:
+                    status_changes[code] = delta
+
+            link_changes = numeric_deltas(
+                result_a.get("link_health", {}), result_b.get("link_health", {}), "link"
+            )
+            redirect_changes = numeric_deltas(
+                result_a.get("redirect_audit", {}), result_b.get("redirect_audit", {}), "redirect"
+            )
+
+            return {
+                "success": True,
+                "page_count_change": page_delta,
+                "status_distribution_changes": status_changes,
+                "link_health_changes": link_changes,
+                "redirect_changes": redirect_changes,
+                "new_issues": [],
+                "resolved_issues": [],
+            }
+
+        except Exception as e:
+            self.logger.error(f"Crawl comparison failed: {e}")
+            return {"success": False, "error": str(e)}
+
    async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Analyzes linking patterns and social media presence using unique temporary files.
--- a/backend/services/seo/dashboard_service.py
+++ b/backend/services/seo/dashboard_service.py
@@ -454,14 +454,12 @@ class SEODashboardService:
    def _get_advertools_insights(self, user_id: str, site_url: str) -> Dict[str, Any]:
        """Fetch Advertools-based insights from WebsiteAnalysis and AdvertoolsTasks."""
        try:
-            # 1. Get augmented persona themes from WebsiteAnalysis
            session = self.db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
            if not session:
                return {}

            analysis = self.db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
            
-            # 2. Get latest tasks status
            tasks = self.db.query(AdvertoolsTask).filter(AdvertoolsTask.user_id == user_id).all()
            
            audit_status = "pending"
@@ -479,6 +477,14 @@ class SEODashboardService:

            return {
                "augmented_themes": brand_analysis.get('augmented_themes', []),
+                "link_health": brand_analysis.get('link_health', {}),
+                "redirect_audit": brand_analysis.get('redirect_audit', {}),
+                "image_seo": brand_analysis.get('image_seo', {}),
+                "page_status": brand_analysis.get('page_status', {}),
+                "url_structure": brand_analysis.get('url_structure', {}),
+                "freshness": brand_analysis.get('freshness', {}),
+                "robots_txt": brand_analysis.get('robots_txt', {}),
+                "crawl_budget": brand_analysis.get('crawl_budget', {}),
                "last_audit": brand_analysis.get('last_advertools_audit'),
                "site_health": seo_audit.get('site_health', {}),
                "last_health_check": seo_audit.get('last_advertools_health_check'),
--- a/backend/services/sif_integration_service.py
+++ b/backend/services/sif_integration_service.py
@@ -378,7 +378,48 @@ class SIFIntegrationService:
                themes = adv_insights.get('augmented_themes', [])
                if themes:
                    text_content += f"Augmented Themes: {', '.join(themes[:5])}. "
-                
+
+                freshness = adv_insights.get('freshness', {})
+                if freshness:
+                    text_content += (f"Content Freshness Score: {freshness.get('freshness_score', 'N/A')}. "
+                                     f"Publishing Velocity: {freshness.get('publishing_velocity', 0)}/week. "
+                                     f"Trend: {freshness.get('publishing_trend', 'unknown')}. "
+                                     f"Last 30d: {freshness.get('publishing_recency', {}).get('last_30d', 0)} pages. ")
+
+                link_health = adv_insights.get('link_health', {})
+                if link_health and 'error' not in link_health:
+                    text_content += (f"Internal Links: {link_health.get('internal_link_count', 0)}. "
+                                     f"External Links: {link_health.get('external_link_count', 0)}. "
+                                     f"Nofollow: {link_health.get('nofollow_link_count', 0)}. "
+                                     f"Avg Links/Page: {link_health.get('avg_links_per_page', 0)}. ")
+
+                redirects = adv_insights.get('redirect_audit', {})
+                if redirects and 'error' not in redirects:
+                    text_content += (f"Redirects: {redirects.get('total_redirects', 0)} total, "
+                                     f"{redirects.get('multi_hop_chains', 0)} multi-hop. ")
+
+                image_seo = adv_insights.get('image_seo', {})
+                if image_seo and 'error' not in image_seo:
+                    text_content += (f"Images: {image_seo.get('total_images', 0)} total, "
+                                     f"Alt Coverage: {image_seo.get('alt_coverage_percentage', 0)}%. ")
+
+                url_struct = adv_insights.get('url_structure', {})
+                if url_struct:
+                    text_content += (f"URL Structure: {url_struct.get('total_urls_analyzed', 0)} URLs, "
+                                     f"Avg Depth: {url_struct.get('directory_depth', {}).get('average_depth', 0)}. "
+                                     f"Params: {url_struct.get('parameter_usage', {}).get('percentage_with_params', 0)}%. ")
+
+                robots = adv_insights.get('robots_txt', {})
+                if robots and robots.get('success'):
+                    text_content += (f"Robots.txt: {robots.get('total_directives', 0)} directives, "
+                                     f"Compliance: {robots.get('compliance_score', 0)}/100. "
+                                     f"Issues: {len(robots.get('issues', []))}. ")
+
+                budget = adv_insights.get('crawl_budget', {})
+                if budget and budget.get('success'):
+                    text_content += (f"Crawl Budget: {budget.get('pages_crawled', 0)} crawled of {budget.get('sitemap_total_urls', 0)} URLs. "
+                                     f"Waste: {budget.get('waste_percentage', 0)}%. "
+                                     f"Score: {budget.get('optimization_score', 0)}. ")
            # Add Technical SEO overview
            tech_audit = dashboard_data.get('technical_seo_audit', {})
            if tech_audit:
--- a/backend/services/wix_service.py
+++ b/backend/services/wix_service.py
@@ -143,16 +143,18 @@ class WixService:
            access_token: Valid access token
            
        Returns:
-            Site information
+            Site information (or {_no_site: True} if no site exists)
        """
        token_str = normalize_token_string(access_token)
        if not token_str:
-            raise ValueError("Invalid access token format for create_blog_post")
+            return {"_no_site": True, "error": "Invalid access token format"}
+        meta = extract_meta_from_token(token_str)
+        meta_site_id = meta.get("metaSiteId")
        try:
-            return self.auth_service.get_site_info(token_str)
+            return self.auth_service.get_site_info(token_str, meta_site_id=meta_site_id)
        except requests.RequestException as e:
-            logger.error(f"Failed to get site info: {e}")
-            raise
+            logger.warning(f"Failed to get site info: {e}")
+            return {"_no_site": True, "error": str(e)}
    
    def get_current_member(self, access_token: str) -> Dict[str, Any]:
        """
--- a/backend/services/youtube/youtube_task_manager.py
+++ b/backend/services/youtube/youtube_task_manager.py
@@ -0,0 +1,455 @@
+"""
+YouTube Creator Task Manager
+
+Hybrid DB-backed + in-memory task manager for YouTube video operations.
+Writes task state to PostgreSQL so renders/combines/publishes survive
+server restarts. Falls back to in-memory dict when DB is unavailable.
+
+API surface matches Story Writer's TaskManager for drop-in compatibility.
+"""
+
+import uuid
+from datetime import datetime, timedelta, timezone
+from typing import Any, Dict, List, Optional
+from loguru import logger
+from sqlalchemy.orm import Session
+
+from models.youtube_task_models import YouTubeVideoTask, YouTubeTaskType, YouTubeTaskStatus
+from services.database import get_session_for_user, get_engine_for_user
+from models.subscription_models import Base as SubscriptionBase
+
+
+class YouTubeTaskManager:
+    """Hybrid persistent + in-memory task manager for YouTube Creator.
+
+    Every task is kept in ``self.task_storage`` and, when an owning user id
+    is known, mirrored to a ``YouTubeVideoTask`` row through that user's DB
+    session. DB work is best-effort: failures are logged, never raised.
+    """
+
+    def __init__(self):
+        # task_id -> in-memory task record (shape defined in create_task).
+        self.task_storage: Dict[str, Dict[str, Any]] = {}
+        self._ensure_tables()
+
+    def _ensure_tables(self) -> None:
+        """Ensure youtube_video_tasks table exists for all initialised users.
+
+        Best-effort: runs at construction time, so failures are only logged.
+        """
+        try:
+            from services.database import _user_engines
+            engines = list(_user_engines.items())
+        except Exception as e:
+            logger.debug(f"[YouTubeTaskManager] Skipping table bootstrap: {e}")
+            return
+
+        for user_id, engine in engines:
+            try:
+                SubscriptionBase.metadata.create_all(bind=engine, checkfirst=True)
+            except Exception as e:
+                logger.debug(f"[YouTubeTaskManager] Could not ensure tables for user {user_id}: {e}")
+
+    def _get_db(self, user_id: str) -> Optional[Session]:
+        """Get a DB session for the given user. Returns None on failure.
+
+        Also runs ``create_all`` on the user's engine. If that step raises,
+        the session that was already opened is closed before returning None.
+        """
+        if not user_id:
+            return None
+        session = None
+        try:
+            session = get_session_for_user(user_id)
+            if session:
+                engine = get_engine_for_user(user_id)
+                SubscriptionBase.metadata.create_all(bind=engine, checkfirst=True)
+            return session
+        except Exception as e:
+            logger.warning(f"[YouTubeTaskManager] DB unavailable for user {user_id}: {e}")
+            if session is not None:
+                # Do not leak the session when a later step failed after it was opened.
+                try:
+                    session.close()
+                except Exception as close_err:
+                    logger.debug(f"[YouTubeTaskManager] Failed to close session: {close_err}")
+            return None
+
+    def _map_task_type(self, task_type_str: str) -> YouTubeTaskType:
+        """Map a string task type to the enum (unknown strings map to RENDER)."""
+        mapping = {
+            "youtube_video_render": YouTubeTaskType.RENDER,
+            "youtube_scene_video_render": YouTubeTaskType.SCENE_RENDER,
+            "youtube_video_combine": YouTubeTaskType.COMBINE,
+            "youtube_combine_video": YouTubeTaskType.COMBINE,
+            "youtube_publish": YouTubeTaskType.PUBLISH,
+            "youtube_image_generation": YouTubeTaskType.IMAGE_GENERATION,
+            "youtube_audio_generation": YouTubeTaskType.AUDIO_GENERATION,
+        }
+        return mapping.get(task_type_str, YouTubeTaskType.RENDER)
+
+    def _map_status_to_enum(self, status: str) -> YouTubeTaskStatus:
+        """Map a frontend status string to the DB enum (unknown strings map to PENDING)."""
+        mapping = {
+            "pending": YouTubeTaskStatus.PENDING,
+            "processing": YouTubeTaskStatus.PROCESSING,
+            "running": YouTubeTaskStatus.PROCESSING,
+            "completed": YouTubeTaskStatus.COMPLETED,
+            "failed": YouTubeTaskStatus.FAILED,
+        }
+        return mapping.get(status, YouTubeTaskStatus.PENDING)
+
+    def _map_status_from_enum(self, status: YouTubeTaskStatus) -> str:
+        """Map DB enum to frontend status string (unknown values map to "pending")."""
+        mapping = {
+            YouTubeTaskStatus.PENDING: "pending",
+            YouTubeTaskStatus.PROCESSING: "processing",
+            YouTubeTaskStatus.COMPLETED: "completed",
+            YouTubeTaskStatus.FAILED: "failed",
+        }
+        return mapping.get(status, "pending")
+
+    def create_task(
+        self,
+        task_type: str = "youtube_video_render",
+        metadata: Optional[Dict[str, Any]] = None,
+        user_id: Optional[str] = None,
+    ) -> str:
+        """Create a new task and return its id.
+
+        The task is always stored in memory. It is also persisted to the DB
+        when ``user_id`` or ``metadata["owner_user_id"]`` identifies an owner.
+        """
+        task_id = str(uuid.uuid4())
+        task_metadata = metadata or {}
+        now = datetime.now(timezone.utc)
+        effective_user_id = user_id or task_metadata.get("owner_user_id")
+
+        # Always write to in-memory for fast lookups
+        self.task_storage[task_id] = {
+            "status": "pending",
+            "created_at": now,
+            "updated_at": now,
+            "result": None,
+            "error": None,
+            "progress_messages": [],
+            "task_type": task_type,
+            "progress": 0.0,
+            "metadata": task_metadata,
+            # Remembered so later status updates reach the same DB row even
+            # when the owner was passed as ``user_id`` and not in metadata.
+            "user_id": effective_user_id,
+        }
+
+        # Persist to DB
+        if effective_user_id:
+            db = self._get_db(effective_user_id)
+            if db:
+                try:
+                    db_task = YouTubeVideoTask(
+                        task_id=task_id,
+                        user_id=effective_user_id,
+                        task_type=self._map_task_type(task_type),
+                        status=YouTubeTaskStatus.PENDING,
+                        progress=0.0,
+                        request_data=task_metadata or None,
+                        created_at=now,
+                        updated_at=now,
+                    )
+                    db.add(db_task)
+                    db.commit()
+                    logger.debug(f"[YouTubeTaskManager] Persisted task {task_id} to DB for user {effective_user_id}")
+                except Exception as e:
+                    logger.warning(f"[YouTubeTaskManager] Failed to persist task {task_id} to DB: {e}")
+                    db.rollback()
+                finally:
+                    db.close()
+
+        logger.info(f"[YouTubeTaskManager] Created task: {task_id} (type: {task_type})")
+        return task_id
+
+    def get_task_status(self, task_id: str, requester_user_id: Optional[str] = None) -> Optional[Dict[str, Any]]:
+        """Get task status. Checks in-memory first, then DB.
+
+        Returns None when the task is not found, when ``requester_user_id``
+        does not match the task owner, or when the DB lookup fails.
+        """
+        # Check in-memory first (fast path)
+        task = self.task_storage.get(task_id)
+        if task is not None:
+            metadata = task.get("metadata", {}) or {}
+            owner_user_id = metadata.get("owner_user_id")
+
+            if requester_user_id is not None and owner_user_id is not None and requester_user_id != owner_user_id:
+                logger.warning(f"[YouTubeTaskManager] Task access denied for task {task_id}")
+                return None
+
+            messages = task.get("progress_messages") or []
+            created_at = task.get("created_at")
+            # A present-but-None updated_at still falls back to created_at.
+            updated_at = task.get("updated_at") or created_at
+            response = {
+                "task_id": task_id,
+                "status": task["status"],
+                "progress": task.get("progress", 0.0),
+                "message": messages[-1] if messages else None,
+                "created_at": created_at.isoformat() if created_at else None,
+                "updated_at": updated_at.isoformat() if updated_at else None,
+            }
+            if task["status"] == "completed" and task.get("result"):
+                response["result"] = task["result"]
+            if task["status"] == "failed" and task.get("error"):
+                response["error"] = task["error"]
+                if task.get("error_status") is not None:
+                    response["error_status"] = task["error_status"]
+                if task.get("error_data") is not None:
+                    response["error_data"] = task["error_data"]
+            return response
+
+        # Fall back to DB
+        if not requester_user_id:
+            return None
+        db = self._get_db(requester_user_id)
+        if not db:
+            return None
+
+        try:
+            # Scope the lookup to the requester so the DB path applies the
+            # same ownership rule as the in-memory path above.
+            db_task = db.query(YouTubeVideoTask).filter(
+                YouTubeVideoTask.task_id == task_id,
+                YouTubeVideoTask.user_id == requester_user_id,
+            ).first()
+            if not db_task:
+                return None
+
+            response = {
+                "task_id": db_task.task_id,
+                "status": self._map_status_from_enum(db_task.status),
+                "progress": db_task.progress or 0.0,
+                "message": db_task.message,
+                "created_at": db_task.created_at.isoformat() if db_task.created_at else None,
+                "updated_at": db_task.updated_at.isoformat() if db_task.updated_at else None,
+            }
+            if db_task.result:
+                response["result"] = db_task.result
+            if db_task.error:
+                response["error"] = db_task.error
+                # error_status / error_data are persisted inside the result payload.
+                if isinstance(db_task.result, dict):
+                    for field in ("error_status", "error_data"):
+                        if db_task.result.get(field) is not None:
+                            response[field] = db_task.result[field]
+            return response
+        except Exception as e:
+            logger.warning(f"[YouTubeTaskManager] DB lookup failed for task {task_id}: {e}")
+            return None
+        finally:
+            db.close()
+
+    def update_task_status(
+        self,
+        task_id: str,
+        status: str,
+        progress: Optional[float] = None,
+        message: Optional[str] = None,
+        result: Optional[Dict[str, Any]] = None,
+        error: Optional[str] = None,
+        error_status: Optional[int] = None,
+        error_data: Optional[Dict[str, Any]] = None,
+    ):
+        """Update task status. Writes to both in-memory and DB.
+
+        Only tasks present in memory are updated; otherwise a warning is logged.
+        """
+        task = self.task_storage.get(task_id)
+        if task is None:
+            logger.warning(f"[YouTubeTaskManager] Cannot update non-existent task: {task_id}")
+            return
+
+        now = datetime.now(timezone.utc)
+
+        # Update in-memory
+        task["status"] = status
+        task["updated_at"] = now
+        if progress is not None:
+            task["progress"] = progress
+        if message:
+            task.setdefault("progress_messages", []).append(message)
+            logger.info(f"[YouTubeTaskManager] Task {task_id}: {message} (progress: {progress}%)")
+        if result is not None:
+            task["result"] = result
+        if error is not None:
+            task["error"] = error
+            logger.error(f"[YouTubeTaskManager] Task {task_id} error: {error}")
+        if error_status is not None:
+            task["error_status"] = error_status
+        if error_data is not None:
+            task["error_data"] = error_data
+
+        # Try DB update
+        metadata = task.get("metadata", {}) or {}
+        user_id = metadata.get("owner_user_id") or task.get("user_id")
+        self._update_db_task(
+            task_id, user_id, status, progress, message, result, error, now,
+            error_status=error_status, error_data=error_data,
+        )
+
+    def _update_db_task(
+        self,
+        task_id: str,
+        user_id: Optional[str],
+        status: str,
+        progress: Optional[float],
+        message: Optional[str],
+        result: Optional[Dict[str, Any]],
+        error: Optional[str],
+        now: datetime,
+        error_status: Optional[int] = None,
+        error_data: Optional[Dict[str, Any]] = None,
+    ):
+        """Update task in DB (no-op without a user id, session or matching row).
+
+        ``error_status`` / ``error_data`` are merged into the stored result
+        payload, which is where get_task_status reads them back from.
+        """
+        if not user_id:
+            return
+
+        db = self._get_db(user_id)
+        if not db:
+            return
+
+        try:
+            db_task = db.query(YouTubeVideoTask).filter(YouTubeVideoTask.task_id == task_id).first()
+            if not db_task:
+                logger.debug(f"[YouTubeTaskManager] Task {task_id} not found in DB for update")
+                return
+
+            db_task.status = self._map_status_to_enum(status)
+            db_task.updated_at = now
+            if progress is not None:
+                db_task.progress = progress
+            if message:
+                db_task.message = message[:500]
+
+            result_patch = dict(result) if result else {}
+            if error_status is not None:
+                result_patch["error_status"] = error_status
+            if error_data is not None:
+                result_patch["error_data"] = error_data
+            if result_patch:
+                existing_result = db_task.result if isinstance(db_task.result, dict) else {}
+                # Assign a fresh dict: updating the loaded value in place and
+                # re-assigning the same object may not be seen as a change by
+                # the ORM when the column is a plain (non-mutable-tracked) JSON type.
+                db_task.result = {**existing_result, **result_patch}
+
+            if error:
+                db_task.error = error
+            if status in ("completed", "failed"):
+                db_task.completed_at = now
+            db.commit()
+            logger.debug(f"[YouTubeTaskManager] Persisted status update for task {task_id}")
+        except Exception as e:
+            logger.warning(f"[YouTubeTaskManager] Failed to update DB task {task_id}: {e}")
+            db.rollback()
+        finally:
+            db.close()
+
+    def recover_stale_tasks(self, user_id: str) -> int:
+        """Mark in-flight tasks that were interrupted by server restart as failed.
+
+        Called on startup for each user to handle tasks that were 'pending' or
+        'processing' when the server went down. Returns the number of tasks marked.
+        """
+        db = self._get_db(user_id)
+        if not db:
+            return 0
+
+        count = 0
+        try:
+            stale_tasks = db.query(YouTubeVideoTask).filter(
+                YouTubeVideoTask.user_id == user_id,
+                YouTubeVideoTask.status.in_([
+                    YouTubeTaskStatus.PENDING,
+                    YouTubeTaskStatus.PROCESSING,
+                ]),
+            ).all()
+
+            now = datetime.now(timezone.utc)
+            for task in stale_tasks:
+                task.status = YouTubeTaskStatus.FAILED
+                task.error = "Task interrupted by server restart"
+                task.message = "Marked as failed on server restart"
+                task.completed_at = now
+                task.updated_at = now
+                count += 1
+                logger.info(f"[YouTubeTaskManager] Recovered stale task {task.task_id} for user {user_id}")
+
+            if count > 0:
+                db.commit()
+                logger.info(f"[YouTubeTaskManager] Recovered {count} stale tasks for user {user_id}")
+        except Exception as e:
+            logger.warning(f"[YouTubeTaskManager] Failed to recover stale tasks: {e}")
+            db.rollback()
+            count = 0  # nothing was persisted
+        finally:
+            db.close()
+
+        return count
+
+    def cleanup_old_tasks(self):
+        """Remove in-memory tasks created more than 1 hour ago. DB rows are not touched here."""
+        cutoff = datetime.now(timezone.utc).timestamp() - 3600  # 1 hour
+
+        tasks_to_remove = []
+        for task_id, task_data in self.task_storage.items():
+            created_at = task_data.get("created_at")
+            if not created_at:
+                continue
+            ts = created_at.timestamp() if hasattr(created_at, "timestamp") else 0
+            if ts < cutoff:
+                tasks_to_remove.append(task_id)
+
+        for task_id in tasks_to_remove:
+            del self.task_storage[task_id]
+            logger.debug(f"[YouTubeTaskManager] Cleaned up old in-memory task: {task_id}")
+
+    def cleanup_old_db_tasks(self, days: int = 7, user_id: Optional[str] = None) -> int:
+        """Delete completed/failed DB tasks older than N days. Returns the number deleted."""
+        if not user_id:
+            return 0
+
+        db = self._get_db(user_id)
+        if not db:
+            return 0
+
+        count = 0
+        try:
+            cutoff = datetime.now(timezone.utc) - timedelta(days=days)
+            old_tasks = db.query(YouTubeVideoTask).filter(
+                YouTubeVideoTask.user_id == user_id,
+                YouTubeVideoTask.status.in_([YouTubeTaskStatus.COMPLETED, YouTubeTaskStatus.FAILED]),
+                YouTubeVideoTask.created_at < cutoff,
+            ).all()
+
+            for task in old_tasks:
+                db.delete(task)
+                count += 1
+
+            if count > 0:
+                db.commit()
+                logger.info(f"[YouTubeTaskManager] Cleaned up {count} old DB tasks for user {user_id}")
+        except Exception as e:
+            logger.warning(f"[YouTubeTaskManager] Failed to cleanup old DB tasks: {e}")
+            db.rollback()
+            count = 0  # nothing was deleted
+        finally:
+            db.close()
+
+        return count
+
+
+# Global singleton instance
+task_manager = YouTubeTaskManager()