chore: bulk commit of local changes across blog writer, SEO dashboard, scheduler, docs-site, and frontend

2026-06-05 12:40:04 +05:30
parent b894bc0abb
commit e54aaa7a3e
74 changed files with 5667 additions and 996 deletions
--- a/backend/services/seo/advertools_service.py
+++ b/backend/services/seo/advertools_service.py
@@ -1,12 +1,18 @@
 import advertools as adv
 import pandas as pd
 import asyncio
-from typing import Dict, Any, List, Optional
+from typing import Dict, Any, List, Optional, Tuple
 from datetime import datetime, timedelta
 from loguru import logger
 import json
 import os
 import tempfile
+from urllib.parse import urlparse
+from collections import Counter
+import urllib.request
+import urllib.error
+import socket
+import re

 class AdvertoolsService:
    """
@@ -19,51 +25,58 @@ class AdvertoolsService:

    async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
        """
-        Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.
+        Analyzes a website's sitemap to extract metrics on publishing velocity, freshness,
+        URL structure patterns, and topic distribution.
        """
        try:
            self.logger.info(f"Analyzing sitemap: {sitemap_url}")
            
-            # advertools sitemap_to_df is blocking, run in executor
            loop = asyncio.get_event_loop()
            df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
            
            if df is None or df.empty:
                return {"success": False, "error": "Sitemap is empty or could not be parsed."}

-            # Convert lastmod to datetime
            if 'lastmod' in df.columns:
                df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
                
            total_urls = len(df)
            
-            # Handle potential empty datetime columns
-            if 'lastmod' in df.columns and not df['lastmod'].isna().all():
-                now = datetime.now(df['lastmod'].dt.tz)
-                thirty_days_ago = now - timedelta(days=30)
-                recent_urls = df[df['lastmod'] > thirty_days_ago]
-                six_months_ago = now - timedelta(days=180)
-                stale_urls = df[df['lastmod'] < six_months_ago]
-                
-                publishing_velocity = len(recent_urls) / 4.0 # URLs per week
-                stale_count = len(stale_urls)
-            else:
-                publishing_velocity = 0
-                stale_count = 0
+            # --- Content Freshness Scoring ---
+            freshness = self._compute_freshness(df)
            
-            # Enhanced Content Pillars (Top folder patterns - 3 levels deep)
-            def extract_hierarchy(url: str):
-                try:
-                    parts = urlparse(url).path.strip('/').split('/')
-                    if not parts or not parts[0]: return "home"
-                    return "/".join(parts[:2]) # Capture top 2 segments
-                except:
-                    return "other"
+            # --- URL Structure Analysis ---
+            url_structure = {}
+            if 'loc' in df.columns:
+                url_structure = await self._analyze_url_structure(df['loc'].tolist())
+            
+            # --- Content Pillars via url_to_df ---
+            pillars = {}
+            url_df = None
+            try:
+                url_df = adv.url_to_df(df['loc'])
+                if url_df is not None and not url_df.empty:
+                    dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
+                    if dir_cols:
+                        pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
+                        for col in dir_cols[1:3]:
+                            mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan')
+                            pillar_series = pillar_series + "/" + url_df[col].where(mask, "")
+                        pillars = pillar_series.value_counts().head(15).to_dict()
+            except Exception:
+                fallback_pillars = {}
+                if 'loc' in df.columns:
+                    def extract_hierarchy(url: str):
+                        try:
+                            parts = urlparse(url).path.strip('/').split('/')
+                            if not parts or not parts[0]: return "home"
+                            return "/".join(parts[:2])
+                        except:
+                            return "other"
+                    fallback_pillars = df['loc'].apply(extract_hierarchy).value_counts().head(15).to_dict()
+                pillars = fallback_pillars

-            df['pillar'] = df['loc'].apply(extract_hierarchy)
-            pillars = df['pillar'].value_counts().head(15).to_dict()
-
-            # Return a sample of URLs for auditing (top 15 most recent if available)
+            # Sample URLs for auditing (top 15 most recent)
            audit_urls = []
            if 'lastmod' in df.columns and not df['lastmod'].isna().all():
                audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
@@ -74,10 +87,14 @@ class AdvertoolsService:
                "success": True,
                "metrics": {
                    "total_urls": total_urls,
-                    "publishing_velocity": round(publishing_velocity, 2),
-                    "stale_content_count": stale_count,
-                    "stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
+                    "publishing_velocity": freshness.get("publishing_velocity"),
+                    "stale_content_count": freshness.get("stale_count"),
+                    "stale_content_percentage": freshness.get("stale_percentage"),
+                    "freshness_score": freshness.get("freshness_score"),
+                    "publishing_recency": freshness.get("publishing_recency"),
+                    "publishing_trend": freshness.get("publishing_trend"),
                    "top_pillars": pillars,
+                    "url_structure": url_structure,
                    "audit_sample_urls": audit_urls
                },
                "timestamp": datetime.utcnow().isoformat()
@@ -86,6 +103,146 @@ class AdvertoolsService:
            self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
            return {"success": False, "error": str(e)}

+    def _compute_freshness(self, df: pd.DataFrame) -> Dict[str, Any]:
+        """Summarise sitemap ``lastmod`` values into freshness metrics.
+
+        Reports weekly publishing velocity (90-day window), the count and share
+        of URLs older than 180 days, a 0-100 freshness score, recency buckets
+        and a 30-day trend. Rows without ``lastmod`` still count toward totals.
+        """
+        has_dates = 'lastmod' in df.columns and not df['lastmod'].isna().all()
+        stamps = df['lastmod'].dropna() if has_dates else None
+        if stamps is None or stamps.empty:
+            # No usable dates: report neutral defaults instead of guessing.
+            return {
+                "publishing_velocity": 0,
+                "stale_count": 0,
+                "stale_percentage": 0,
+                "freshness_score": 0,
+                "publishing_recency": {},
+                "publishing_trend": "unknown"
+            }
+
+        now = datetime.now(stamps.dt.tz)
+        modified = df['lastmod']
+
+        def cutoff(days: int) -> datetime:  # instant `days` days before `now`
+            return now - timedelta(days=days)
+
+        def newer_than(days: int) -> int:  # rows modified strictly after the cutoff
+            return int((modified > cutoff(days)).sum())
+
+        row_total = len(df)
+        in_30d = newer_than(30)
+        in_90d = newer_than(90)
+        older_than_180d = int((modified < cutoff(180)).sum())
+        stale_share = round((older_than_180d / row_total) * 100, 2) if row_total > 0 else 0
+
+        # Velocity: URLs per week, averaged over the 13 weeks of the 90-day window.
+        weekly_rate = round(in_90d / 13.0, 2) if in_90d else 0
+
+        # Score weights: 50 for the non-stale share, 30 for 30-day recency and 20
+        # for velocity, which saturates at 10 URLs per week.
+        fresh_part = 1.0 - (stale_share / 100.0)
+        recent_part = in_30d / max(row_total, 1)
+        rate_part = min(weekly_rate / 10.0, 1.0)
+        score = round((fresh_part * 50 + recent_part * 30 + rate_part * 20), 1)
+
+        # Trend: the latest 30 days against the 30 days before them (+/-10% band).
+        earlier_30d = int(((modified <= cutoff(30)) & (modified > cutoff(60))).sum())
+        if in_30d > earlier_30d * 1.1:
+            direction = "increasing"
+        elif in_30d < earlier_30d * 0.9:
+            direction = "decreasing"
+        else:
+            direction = "stable"
+
+        return {
+            "publishing_velocity": weekly_rate,
+            "stale_count": older_than_180d,
+            "stale_percentage": stale_share,
+            "freshness_score": score,
+            "publishing_recency": {
+                "last_24h": newer_than(1),
+                "last_7d": newer_than(7),
+                "last_30d": in_30d,
+                "last_90d": in_90d,
+            },
+            "publishing_trend": direction
+        }
+
+    async def _analyze_url_structure(self, urls: List[str]) -> Dict[str, Any]:
+        """Analyze URL patterns for parameter bloat, directory depth, protocols and hosts.
+
+        Returns ``{}`` when ``adv.url_to_df`` yields nothing or any step fails (logged as a warning).
+        """
+        try:
+            loop = asyncio.get_event_loop()
+            url_df = await loop.run_in_executor(None, lambda: adv.url_to_df(urls))
+
+            if url_df is None or url_df.empty:
+                return {}
+
+            total = len(url_df)
+
+            # Query param analysis
+            has_query = url_df['query'].notna() & (url_df['query'] != '')
+            param_count = has_query.sum()
+            param_percentage = round((param_count / total) * 100, 2) if total > 0 else 0
+
+            # Parameter names are tallied once per distinct query string; empty pairs ("a=1&&b=2") are skipped.
+            param_frequency = {}
+            if param_count > 0:
+                distinct_queries = url_df.loc[has_query, 'query'].dropna().unique()
+                names = Counter(p.partition('=')[0] for q in distinct_queries for p in q.split('&') if p)
+                param_frequency = dict(names.most_common(10))
+
+            # Directory depth analysis
+            dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
+            def count_depth(row):
+                for i, col in enumerate(dir_cols):
+                    val = row[col]
+                    if pd.isna(val) or str(val) == 'nan' or str(val).strip() == '':
+                        return i
+                return len(dir_cols)
+
+            depths = url_df.apply(count_depth, axis=1)
+            avg_depth = round(depths.mean(), 1) if not depths.empty else 0
+            max_depth = int(depths.max()) if not depths.empty else 0
+            depth_distribution = depths.value_counts().sort_index().head(10).to_dict()
+            depth_distribution = {str(k): int(v) for k, v in depth_distribution.items()}
+
+            # Protocol consistency
+            schemes = url_df['scheme'].value_counts().to_dict() if 'scheme' in url_df.columns else {}
+
+            # Host analysis: value_counts() yields one row per distinct netloc, so its
+            # length (not nunique(), which counts distinct *frequencies*) is the host count.
+            netloc_counts = url_df['netloc'].value_counts() if 'netloc' in url_df.columns else None
+            unique_subdomains = len(netloc_counts) if netloc_counts is not None else 0
+            primary_domain = netloc_counts.index[0] if netloc_counts is not None and not netloc_counts.empty else ""
+
+            return {
+                "total_urls_analyzed": total,
+                "parameter_usage": {
+                    "urls_with_params": int(param_count),
+                    "percentage_with_params": param_percentage,
+                    "top_parameters": param_frequency
+                },
+                "directory_depth": {
+                    "average_depth": avg_depth,
+                    "max_depth": max_depth,
+                    "distribution": depth_distribution
+                },
+                "protocols": {str(k): int(v) for k, v in schemes.items()},
+                "subdomains": {
+                    "primary": primary_domain,
+                    "unique_count": unique_subdomains
+                }
+            }
+        except Exception as e:
+            self.logger.warning(f"URL structure analysis failed: {e}")
+            return {}
+
    async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Performs a shallow crawl and theme analysis using word frequency.
@@ -153,6 +310,512 @@ class AdvertoolsService:
                except Exception as e:
                    self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")

+    async def analyze_site_structure(self, url_list: List[str], site_domain: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Crawl ``url_list`` (following links; capped at 50 pages, depth 3) and summarise
+        link health, redirects, image alt coverage, status codes and missing titles/descriptions.
+        ``site_domain`` limits the crawl and doubles as the internal-link regex when given.
+        Each crawlytics section fails independently; the temp crawl file is removed afterwards.
+        """
+        temp_file = None
+        try:
+            self.logger.info(f"Analyzing site structure for {len(url_list)} URLs, domain={site_domain}")
+            
+            with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
+                temp_file = tf.name
+
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(None, lambda: adv.crawl(
+                url_list=url_list,
+                output_file=temp_file,
+                follow_links=True,
+                allowed_domains=[site_domain] if site_domain else None,
+                custom_settings={
+                    'LOG_LEVEL': 'WARNING',
+                    'CLOSESPIDER_PAGECOUNT': 50,
+                    'DOWNLOAD_TIMEOUT': 30,
+                    'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
+                    'DEPTH_LIMIT': 3,
+                }
+            ))
+            
+            if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
+                return {"success": False, "error": "Site structure crawl produced no output."}
+
+            crawl_df = pd.read_json(temp_file, lines=True)
+            page_count = len(crawl_df)
+            result = {"success": True, "page_count": page_count}
+
+            # --- Link health via crawlytics; NOTE(review): site_domain is passed as an unescaped regex ---
+            try:
+                internal_regex = site_domain if site_domain else None
+                link_df = adv.crawlytics.links(crawl_df, internal_url_regex=internal_regex)
+                if link_df is not None and not link_df.empty:
+                    total_links = len(link_df)
+                    internal_links = int(link_df['internal'].sum()) if 'internal' in link_df.columns else 0
+                    external_links = total_links - internal_links
+                    nofollow_links = int(link_df['nofollow'].sum()) if 'nofollow' in link_df.columns else 0
+
+                    # Links per page, grouped on the index; assumes links() keeps the crawl row index - TODO confirm
+                    links_per_page = link_df.groupby(level=0).size()
+                    avg_links_per_page = round(links_per_page.mean(), 1) if not links_per_page.empty else 0
+
+                    # Most common anchor words (longer than 2 characters), taken from internal links only
+                    anchor_texts = []
+                    if 'text' in link_df.columns and 'internal' in link_df.columns:
+                        internal_anchors = link_df[link_df['internal'] == True]['text'].dropna()
+                        for t in internal_anchors:
+                            if isinstance(t, str) and t.strip():
+                                anchor_texts.extend([w.strip() for w in t.split() if len(w.strip()) > 2])
+                    from collections import Counter
+                    top_anchors = dict(Counter(anchor_texts).most_common(15)) if anchor_texts else {}
+
+                    result["link_health"] = {
+                        "total_links_found": total_links,
+                        "internal_link_count": internal_links,
+                        "external_link_count": external_links,
+                        "internal_link_percentage": round((internal_links / total_links) * 100, 1) if total_links > 0 else 0,
+                        "nofollow_link_count": nofollow_links,
+                        "avg_links_per_page": avg_links_per_page,
+                        "top_anchor_words": top_anchors
+                    }
+                else:
+                    result["link_health"] = {"error": "No links found in crawl data"}
+            except Exception as e:
+                self.logger.warning(f"Link analysis failed: {e}")
+                result["link_health"] = {"error": str(e)}
+
+            # --- Redirect audit; NOTE(review): "unique_chains" is nunique() of redirect_times - confirm intent ---
+            try:
+                redirect_df = adv.crawlytics.redirects(crawl_df)
+                if redirect_df is not None and not redirect_df.empty:
+                    total_redirects = len(redirect_df)
+                    redirect_chains = redirect_df['redirect_times'].nunique() if 'redirect_times' in redirect_df.columns else 0
+                    redirect_statuses = redirect_df['status'].value_counts().to_dict() if 'status' in redirect_df.columns else {}
+                    multi_hop = redirect_df[redirect_df['redirect_times'] > 1] if 'redirect_times' in redirect_df.columns else pd.DataFrame()
+
+                    result["redirect_audit"] = {
+                        "total_redirects": int(total_redirects),
+                        "unique_chains": int(redirect_chains),
+                        "status_distribution": {str(k): int(v) for k, v in redirect_statuses.items()},
+                        "multi_hop_chains": int(len(multi_hop)),
+                        "affected_pages": multi_hop.index.unique().tolist() if not multi_hop.empty else []
+                    }
+                else:
+                    result["redirect_audit"] = {"total_redirects": 0, "note": "No redirects detected"}
+            except Exception as e:
+                self.logger.warning(f"Redirect analysis failed: {e}")
+                result["redirect_audit"] = {"error": str(e)}
+
+            # --- Image alt coverage via crawlytics; "image_seo" is omitted when there are no images or on failure ---
+            try:
+                img_df = adv.crawlytics.images(crawl_df)
+                if img_df is not None and not img_df.empty:
+                    total_images = len(img_df)
+                    missing_alt = int(img_df['img_alt'].isna().sum()) if 'img_alt' in img_df.columns else 0
+                    alt_coverage = round(((total_images - missing_alt) / total_images) * 100, 1) if total_images > 0 else 0
+                    result["image_seo"] = {
+                        "total_images": total_images,
+                        "missing_alt_count": missing_alt,
+                        "alt_coverage_percentage": alt_coverage
+                    }
+            except Exception as e:
+                self.logger.warning(f"Image analysis failed: {e}")
+
+            # --- Page-level metrics: status-code histogram plus counts of null title / meta_desc values ---
+            if 'status' in crawl_df.columns:
+                status_dist = crawl_df['status'].value_counts().to_dict()
+                result["page_status"] = {str(k): int(v) for k, v in status_dist.items()}
+            if 'title' in crawl_df.columns:
+                missing_titles = int(crawl_df['title'].isna().sum())
+                result["missing_titles"] = missing_titles
+            if 'meta_desc' in crawl_df.columns:
+                missing_descriptions = int(crawl_df['meta_desc'].isna().sum())
+                result["missing_descriptions"] = missing_descriptions
+
+            result["timestamp"] = datetime.utcnow().isoformat()
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Failed to analyze site structure: {str(e)}")
+            return {"success": False, "error": str(e)}
+        finally:
+            if temp_file and os.path.exists(temp_file):
+                try:
+                    os.remove(temp_file)
+                except Exception as e:
+                    self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
+
+    async def analyze_robots_txt(self, website_url: str) -> Dict[str, Any]:
+        """
+        Fetch and analyze robots.txt for compliance issues.
+
+        Understands both advertools' ``directive``/``content`` columns and the manual
+        fallback's ``rule``/``value``/``user_agent`` columns. Returns the parsed rules,
+        detected ``issues`` and a 0-100 ``compliance_score``; on failure ``success`` is
+        False and ``error`` holds the reason.
+        """
+        robots_url = website_url  # reported as "url" if we fail before deriving the real one
+        try:
+            self.logger.info(f"Analyzing robots.txt for {website_url}")
+            parsed = urlparse(website_url)
+            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+            result = {
+                "success": True,
+                "url": robots_url,
+                "accessible": True,
+                "total_directives": 0,
+                "user_agents_found": [],
+                "has_sitemap_directive": False,
+                "sitemap_urls": [],
+                "has_crawl_delay": False,
+                "disallow_rules": [],
+                "issues": [],
+                "compliance_score": 100,
+            }
+            loop = asyncio.get_event_loop()
+            try:
+                robots_df = await loop.run_in_executor(
+                    None, lambda: adv.robotstxt_to_df(robots_url)
+                )
+                if robots_df is None or robots_df.empty:
+                    raise ValueError("Empty result from robotstxt_to_df")
+            except Exception as adv_err:
+                self.logger.warning(f"adv.robotstxt_to_df failed, using manual fallback: {adv_err}")
+                robots_df = await loop.run_in_executor(
+                    None, lambda: self._parse_robots_txt_manual(robots_url)
+                )
+            if robots_df is None or robots_df.empty:
+                result["success"] = False
+                result["error"] = "Could not fetch or parse robots.txt"
+                result["accessible"] = False
+                return result
+
+            result["total_directives"] = len(robots_df)
+
+            columns = robots_df.columns
+            if 'user_agent' in columns:
+                result["user_agents_found"] = robots_df['user_agent'].dropna().unique().tolist()
+
+            rule_col = next((c for c in ('rule', 'directive') if c in columns), None)
+            # advertools names the value column "content"; the manual parser uses "value".
+            value_col = next((c for c in ('value', 'directive_value', 'content') if c in columns), None)
+
+            if rule_col and value_col:
+                rules_lower = robots_df[rule_col].astype(str).str.strip().str.lower()
+                values = robots_df[value_col].fillna('').astype(str).str.strip()
+                if 'user_agent' in columns:
+                    agents = robots_df['user_agent'].fillna('*').astype(str)
+                else:
+                    # advertools keeps User-agent as ordinary rows: carry each one down to the rules below it.
+                    agents = values.where(rules_lower == 'user-agent').ffill().fillna('*')
+                    result["user_agents_found"] = values[rules_lower == 'user-agent'].unique().tolist()
+
+                result["has_sitemap_directive"] = bool((rules_lower == 'sitemap').any())
+                result["has_crawl_delay"] = bool((rules_lower == 'crawl-delay').any())
+
+                disallow_mask = (rules_lower == 'disallow') & (values != '')
+                result["disallow_rules"] = [
+                    {"user_agent": ua, "path": path}
+                    for ua, path in zip(agents[disallow_mask], values[disallow_mask])
+                ]
+                sitemap_mask = (rules_lower == 'sitemap') & (values != '')
+                result["sitemap_urls"] = values[sitemap_mask].unique().tolist()
+                # "Disallow: /" only blocks every crawler when it sits in the wildcard (*) group.
+                if (disallow_mask & (values == '/') & (agents.str.strip() == '*')).any():
+                    result["issues"].append({
+                        "severity": "critical", "code": "DISALLOW_ALL",
+                        "detail": "robots.txt disallows all user agents from all paths (Disallow: /)"
+                    })
+
+            if not result["has_sitemap_directive"]:
+                result["issues"].append({
+                    "severity": "warning", "code": "NO_SITEMAP",
+                    "detail": "No Sitemap directive found — search engines may miss pages"
+                })
+            if not result["has_crawl_delay"]:
+                result["issues"].append({
+                    "severity": "info", "code": "NO_CRAWL_DELAY",
+                    "detail": "No Crawl-delay directive set — not critical for most sites"
+                })
+
+            penalties = {"critical": 30, "warning": 15, "info": 5}
+            deducted = sum(penalties.get(issue["severity"], 0) for issue in result["issues"])
+            result["compliance_score"] = max(result["compliance_score"] - deducted, 0)
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Robots.txt analysis failed: {e}")
+            return {"success": False, "error": str(e), "url": robots_url}
+
+    def _parse_robots_txt_manual(self, url: str) -> pd.DataFrame:
+        """Download robots.txt with urllib and parse it into user_agent/rule/value rows.
+
+        Any fetch or parse error is logged as a warning; an empty DataFrame means nothing was parsed.
+        """
+        rows = []
+        try:
+            request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
+            with urllib.request.urlopen(request, timeout=15) as response:
+                body = response.read().decode("utf-8", errors="replace")
+            agent = "*"  # rules seen before any User-agent line are attributed to the wildcard
+            for raw_line in body.splitlines():
+                text = raw_line.strip()
+                if text == "" or text[0] == "#":
+                    continue
+                name, colon, rest = text.partition(":")
+                if text.lower().startswith("user-agent"):
+                    # A User-agent line without a colon resets the group to the wildcard.
+                    agent = rest.strip() if colon else "*"
+                elif colon:
+                    rows.append({
+                        "user_agent": agent,
+                        "rule": name.strip(),
+                        "value": rest.strip(),
+                    })
+        except Exception as exc:
+            self.logger.warning(f"Manual robots.txt fetch failed: {exc}")
+        return pd.DataFrame(rows) if rows else pd.DataFrame()
+
+    async def analyze_crawl_budget(self, sitemap_url: str, site_domain: str) -> Dict[str, Any]:
+        """
+        Analyze crawl budget by comparing sitemap inventory against actual crawl results.
+        Estimates budget utilization, waste from redirects/errors, and optimization score.
+        ``site_domain`` may be a bare host or a full URL; the crawl itself is capped at
+        30 pages and depth 2, so coverage describes a sample rather than the whole site.
+        """
+        temp_file = None
+        try:
+            self.logger.info(f"Analyzing crawl budget for {site_domain}")
+            loop = asyncio.get_event_loop()
+
+            sitemap_df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
+            sitemap_total = len(sitemap_df) if sitemap_df is not None and not sitemap_df.empty else 0
+
+            start_url = f"https://{site_domain}" if not site_domain.startswith("http") else site_domain
+            # allowed_domains expects bare hosts; passing a full URL would filter out followed links.
+            crawl_host = urlparse(start_url).hostname or site_domain
+
+            with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
+                temp_file = tf.name
+            await loop.run_in_executor(None, lambda: adv.crawl(
+                url_list=[start_url],
+                output_file=temp_file,
+                follow_links=True,
+                allowed_domains=[crawl_host],
+                custom_settings={
+                    'LOG_LEVEL': 'WARNING',
+                    'CLOSESPIDER_PAGECOUNT': 30,
+                    'DOWNLOAD_TIMEOUT': 15,
+                    'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
+                    'DEPTH_LIMIT': 2,
+                }
+            ))
+
+            if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
+                return {"success": False, "error": "Crawl produced no output"}
+
+            crawl_df = pd.read_json(temp_file, lines=True)
+            crawled_count = len(crawl_df)
+
+            status_dist = {}
+            wasted = 0
+            if 'status' in crawl_df.columns:
+                # Coerce to int first: one missing status turns the column float ("200.0") and broke int().
+                codes = pd.to_numeric(crawl_df['status'], errors='coerce').dropna().astype(int)
+                status_dist = {str(k): int(v) for k, v in codes.value_counts().items()}
+                wasted = int(((codes < 200) | (codes >= 300)).sum())  # every non-2xx response is waste
+
+            budget_usage_ratio = round(crawled_count / max(sitemap_total, 1), 3)
+            waste_ratio = round(wasted / max(crawled_count, 1), 3)
+
+            depth_dist = {}
+            if 'depth' in crawl_df.columns:
+                raw = crawl_df['depth'].value_counts().sort_index().to_dict()
+                depth_dist = {str(k): int(v) for k, v in raw.items()}
+
+            param_count = 0
+            url_col = 'url' if 'url' in crawl_df.columns else 'response_url' if 'response_url' in crawl_df.columns else None
+            if url_col:
+                # regex=False: "?" alone is an invalid regular expression and would raise re.error.
+                param_count = int(crawl_df[url_col].astype(str).str.contains('?', regex=False).sum())
+
+            optimization_score = max(0, round(100 - (waste_ratio * 100) - (budget_usage_ratio * 20), 1))
+
+            return {
+                "success": True,
+                "sitemap_total_urls": sitemap_total,
+                "pages_crawled": crawled_count,
+                "crawl_coverage_percentage": round(budget_usage_ratio * 100, 1),
+                "status_distribution": status_dist,
+                "wasted_crawl_requests": int(wasted),
+                "waste_percentage": round(waste_ratio * 100, 1),
+                "depth_distribution": depth_dist,
+                "urls_with_parameters": int(param_count),
+                "optimization_score": optimization_score,
+            }
+
+        except Exception as e:
+            self.logger.error(f"Crawl budget analysis failed: {e}")
+            return {"success": False, "error": str(e)}
+        finally:
+            if temp_file and os.path.exists(temp_file):
+                try: os.remove(temp_file)
+                except OSError as e: self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
+
+    async def sitemap_compare(self, sitemap_a: str, sitemap_b: str) -> Dict[str, Any]:
+        """
+        Compare two sitemaps for competitive content gap analysis.
+
+        Both sitemaps are fetched concurrently, then compared on URL count,
+        directory "pillars" (up to the first three path segments of each URL,
+        top 20 by frequency) and ``lastmod`` freshness.
+
+        Args:
+            sitemap_a: URL of the first sitemap.
+            sitemap_b: URL of the second sitemap.
+
+        Returns:
+            On success, a dict with ``success`` set to True plus:
+            ``sitemap_a`` / ``sitemap_b`` (url and total_urls),
+            ``url_count_diff`` (a minus b), ``ratio`` (a divided by b, with b
+            floored at 1), ``pillars_a`` / ``pillars_b`` (pillar -> URL count),
+            ``shared_pillars``, ``unique_to_a``, ``unique_to_b`` (sorted lists),
+            ``freshness_comparison`` and ``overlap_score`` (shared pillars as a
+            percentage of all distinct pillars). When either sitemap yields no
+            rows, only the count fields are populated. On any exception,
+            ``{"success": False, "error": <message>}``.
+        """
+        try:
+            self.logger.info(f"Comparing sitemaps: {sitemap_a} vs {sitemap_b}")
+            loop = asyncio.get_event_loop()
+
+            # The two downloads are independent blocking calls, so run them
+            # side by side in the executor instead of one after the other.
+            df_a, df_b = await asyncio.gather(
+                loop.run_in_executor(None, adv.sitemap_to_df, sitemap_a),
+                loop.run_in_executor(None, adv.sitemap_to_df, sitemap_b),
+            )
+
+            total_a = len(df_a) if df_a is not None and not df_a.empty else 0
+            total_b = len(df_b) if df_b is not None and not df_b.empty else 0
+            result = {
+                "success": True,
+                "sitemap_a": {"url": sitemap_a, "total_urls": total_a},
+                "sitemap_b": {"url": sitemap_b, "total_urls": total_b},
+                "url_count_diff": total_a - total_b,
+                "ratio": round(total_a / max(total_b, 1), 2),
+                "pillars_a": {},
+                "pillars_b": {},
+                "shared_pillars": [],
+                "unique_to_a": [],
+                "unique_to_b": [],
+                "freshness_comparison": {},
+                "overlap_score": 0,
+            }
+
+            # Nothing meaningful to compare when one side has no URLs.
+            if total_a == 0 or total_b == 0:
+                return result
+
+            def extract_pillars(df: pd.DataFrame) -> Dict[str, int]:
+                """Count URLs per directory pillar, keeping the 20 most common."""
+                if 'loc' not in df.columns:
+                    # Without a URL column there is nothing to group.
+                    return {}
+                locs = df['loc'].dropna().astype(str).tolist()
+                if not locs:
+                    return {}
+
+                try:
+                    url_df = adv.url_to_df(locs)
+                    if url_df is not None and not url_df.empty:
+                        dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
+                        if dir_cols:
+                            pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
+                            for col in dir_cols[1:3]:
+                                segment = url_df[col]
+                                mask = segment.notna() & (segment.astype(str) != 'nan')
+                                # Append "/segment" only where the segment exists, so
+                                # shallow URLs do not end up as "blog/" or "blog//".
+                                suffix = ("/" + segment.astype(str)).where(mask, "")
+                                pillar_series = pillar_series + suffix
+                            counts = pillar_series.value_counts().head(20)
+                            return {str(k): int(v) for k, v in counts.items()}
+                except Exception as exc:
+                    # Best effort: fall through to plain path parsing below.
+                    self.logger.warning(f"url_to_df failed, falling back to path parsing: {exc}")
+
+                # Fallback: group by the first path segment only.
+                seen = Counter()
+                for url in locs:
+                    first_segment = urlparse(url).path.strip('/').split('/')[0]
+                    seen[first_segment or "home"] += 1
+                return dict(seen.most_common(20))
+
+            pillars_a = extract_pillars(df_a)
+            pillars_b = extract_pillars(df_b)
+            result["pillars_a"] = pillars_a
+            result["pillars_b"] = pillars_b
+
+            set_a = set(pillars_a)
+            set_b = set(pillars_b)
+            shared = set_a & set_b
+            result["shared_pillars"] = sorted(shared)
+            result["unique_to_a"] = sorted(set_a - set_b)
+            result["unique_to_b"] = sorted(set_b - set_a)
+
+            # Shared pillars as a percentage of all distinct pillars; the
+            # max() guards the division when neither side produced a pillar.
+            total_keys = max(len(set_a | set_b), 1)
+            result["overlap_score"] = round((len(shared) / total_keys) * 100, 1)
+
+            def compute_freshness_stats(df: pd.DataFrame) -> dict:
+                """Summarise how many URLs carry a parseable, recent lastmod."""
+                stats = {"has_lastmod": False, "recent_30d": 0, "total_with_dates": 0}
+                if 'lastmod' in df.columns:
+                    # Unparseable dates are coerced to NaT and dropped.
+                    lm = pd.to_datetime(df['lastmod'], errors='coerce', utc=True).dropna()
+                    if not lm.empty:
+                        cutoff = datetime.now(lm.dt.tz) - timedelta(days=30)
+                        stats["has_lastmod"] = True
+                        stats["total_with_dates"] = int(len(lm))
+                        stats["recent_30d"] = int((lm > cutoff).sum())
+                return stats
+
+            result["freshness_comparison"] = {
+                "a": compute_freshness_stats(df_a),
+                "b": compute_freshness_stats(df_b),
+            }
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Sitemap comparison failed: {e}")
+            return {"success": False, "error": str(e)}
+
+    async def compare_crawl_results(self, result_a: Dict[str, Any], result_b: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Compare two crawl analysis result dicts to surface changes over time.
+        Useful for tracking SEO improvements between scheduled executions.
+
+        Every reported change is ``result_b`` minus ``result_a``; a key missing
+        from one side counts as 0 and unchanged values are omitted. Entries
+        whose values are not numeric on both sides are skipped rather than
+        failing the whole comparison.
+
+        Args:
+            result_a: The earlier (baseline) crawl result.
+            result_b: The later crawl result.
+
+        Returns:
+            A dict with ``success`` True, ``page_count_change``,
+            ``status_distribution_changes`` (keyed by the original status keys),
+            ``link_health_changes`` (keys prefixed ``link_``),
+            ``redirect_changes`` (keys prefixed ``redirect_``) and the
+            ``new_issues`` / ``resolved_issues`` lists, which this method
+            currently always leaves empty. On any exception,
+            ``{"success": False, "error": <message>}``.
+        """
+        def _is_number(value: Any) -> bool:
+            return isinstance(value, (int, float))
+
+        def _number_or_zero(value: Any) -> Any:
+            """Treat a missing or non-numeric metric as 0."""
+            return value if _is_number(value) else 0
+
+        def _section(result: Dict[str, Any], key: str) -> dict:
+            """Return result[key] when it is a dict, otherwise an empty dict."""
+            value = result.get(key)
+            return value if isinstance(value, dict) else {}
+
+        def _numeric_changes(before: dict, after: dict, prefix: str = "", ndigits: Optional[int] = None) -> dict:
+            """Return after-minus-before for every key that is numeric on both sides."""
+            changes = {}
+            # key=str keeps the sort from raising when int and str keys are
+            # mixed (e.g. a JSON-stored result compared with a fresh one).
+            for key in sorted(before.keys() | after.keys(), key=str):
+                old = before.get(key, 0)
+                new = after.get(key, 0)
+                if not (_is_number(old) and _is_number(new)):
+                    continue
+                delta = new - old
+                if ndigits is not None:
+                    delta = round(delta, ndigits)
+                if delta != 0:
+                    changes[f"{prefix}_{key}" if prefix else key] = delta
+            return changes
+
+        try:
+            diff = {
+                "success": True,
+                "page_count_change": 0,
+                "status_distribution_changes": {},
+                "link_health_changes": {},
+                "redirect_changes": {},
+                "new_issues": [],
+                "resolved_issues": [],
+            }
+
+            diff["page_count_change"] = (
+                _number_or_zero(result_b.get("page_count", 0))
+                - _number_or_zero(result_a.get("page_count", 0))
+            )
+
+            # Status counts keep their original keys and are not rounded.
+            diff["status_distribution_changes"] = _numeric_changes(
+                _section(result_a, "page_status"),
+                _section(result_b, "page_status"),
+            )
+
+            diff["link_health_changes"] = _numeric_changes(
+                _section(result_a, "link_health"),
+                _section(result_b, "link_health"),
+                prefix="link",
+                ndigits=2,
+            )
+
+            diff["redirect_changes"] = _numeric_changes(
+                _section(result_a, "redirect_audit"),
+                _section(result_b, "redirect_audit"),
+                prefix="redirect",
+                ndigits=2,
+            )
+
+            return diff
+
+        except Exception as e:
+            self.logger.error(f"Crawl comparison failed: {e}")
+            return {"success": False, "error": str(e)}
+
    async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Analyzes linking patterns and social media presence using unique temporary files.
--- a/backend/services/seo/dashboard_service.py
+++ b/backend/services/seo/dashboard_service.py
@@ -454,14 +454,12 @@ class SEODashboardService:
    def _get_advertools_insights(self, user_id: str, site_url: str) -> Dict[str, Any]:
        """Fetch Advertools-based insights from WebsiteAnalysis and AdvertoolsTasks."""
        try:
-            # 1. Get augmented persona themes from WebsiteAnalysis
            session = self.db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
            if not session:
                return {}

            analysis = self.db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
            
-            # 2. Get latest tasks status
            tasks = self.db.query(AdvertoolsTask).filter(AdvertoolsTask.user_id == user_id).all()
            
            audit_status = "pending"
@@ -479,6 +477,14 @@ class SEODashboardService:

            return {
                "augmented_themes": brand_analysis.get('augmented_themes', []),
+                "link_health": brand_analysis.get('link_health', {}),
+                "redirect_audit": brand_analysis.get('redirect_audit', {}),
+                "image_seo": brand_analysis.get('image_seo', {}),
+                "page_status": brand_analysis.get('page_status', {}),
+                "url_structure": brand_analysis.get('url_structure', {}),
+                "freshness": brand_analysis.get('freshness', {}),
+                "robots_txt": brand_analysis.get('robots_txt', {}),
+                "crawl_budget": brand_analysis.get('crawl_budget', {}),
                "last_audit": brand_analysis.get('last_advertools_audit'),
                "site_health": seo_audit.get('site_health', {}),
                "last_health_check": seo_audit.get('last_advertools_health_check'),