Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts

2026-02-08 13:56:57 +05:30
parent 1db10ccd0f
commit e404a86502
333 changed files with 42223 additions and 10875 deletions
--- a/backend/services/seo_tools/content_strategy_service.py
+++ b/backend/services/seo_tools/content_strategy_service.py
@@ -5,10 +5,19 @@ AI-powered content strategy analyzer that provides insights into
 content gaps, opportunities, and competitive positioning.
 """

-from typing import Dict, Any, List, Optional
+import json
+import re
+import asyncio
+from typing import Dict, Any, List, Optional, Tuple
 from datetime import datetime
+import statistics
 from loguru import logger

+from ..llm_providers.main_text_generation import llm_text_gen
+from middleware.logging_middleware import seo_logger
+
+from .sitemap_service import SitemapService
+
 class ContentStrategyService:
    """Service for AI-powered content strategy analysis"""
    
@@ -22,30 +31,540 @@ class ContentStrategyService:
        website_url: str,
        competitors: List[str] = None,
        target_keywords: List[str] = None,
-        custom_parameters: Dict[str, Any] = None
+        custom_parameters: Dict[str, Any] = None,
+        user_id: Optional[str] = None
    ) -> Dict[str, Any]:
-        """Analyze content strategy and opportunities"""
-        # Placeholder implementation
-        return {
+        start_time = datetime.utcnow()
+
+        competitors = competitors or []
+        target_keywords = target_keywords or []
+        custom_parameters = custom_parameters or {}
+
+        sitemap_service = SitemapService()
+
+        discovered_user_sitemap = await sitemap_service.discover_sitemap_url(website_url)
+        user_sitemap_result = None
+        if discovered_user_sitemap:
+            user_sitemap_result = await sitemap_service.analyze_sitemap(
+                sitemap_url=discovered_user_sitemap,
+                analyze_content_trends=True,
+                analyze_publishing_patterns=True,
+                include_ai_insights=False
+            )
+
+        competitor_sitemaps: Dict[str, Optional[str]] = {}
+        competitor_results: Dict[str, Dict[str, Any]] = {}
+
+        for competitor_url in competitors[:5]:
+            sitemap_url = await sitemap_service.discover_sitemap_url(competitor_url)
+            competitor_sitemaps[competitor_url] = sitemap_url
+            if sitemap_url:
+                try:
+                    competitor_results[competitor_url] = await sitemap_service.analyze_sitemap(
+                        sitemap_url=sitemap_url,
+                        analyze_content_trends=True,
+                        analyze_publishing_patterns=True,
+                        include_ai_insights=False
+                    )
+                except Exception as e:
+                    competitor_results[competitor_url] = {"error": str(e)}
+
+        deterministic = self._build_deterministic_insights(
+            website_url=website_url,
+            user_sitemap_url=discovered_user_sitemap,
+            user_sitemap_result=user_sitemap_result,
+            competitor_sitemaps=competitor_sitemaps,
+            competitor_results=competitor_results,
+            target_keywords=target_keywords
+        )
+
+        ai_strategy = None
+        ai_error = None
+        if user_id:
+            try:
+                prompt = self._build_ai_prompt(
+                    website_url=website_url,
+                    target_keywords=target_keywords,
+                    custom_parameters=custom_parameters,
+                    deterministic_summary=deterministic
+                )
+                ai_response = llm_text_gen(
+                    prompt=prompt,
+                    system_prompt=self._get_system_prompt(),
+                    user_id=user_id
+                )
+                ai_strategy = self._parse_json_response(ai_response)
+
+                await seo_logger.log_ai_analysis(
+                    tool_name=self.service_name,
+                    prompt=prompt,
+                    response=ai_response,
+                    model_used="gemini-2.0-flash-001"
+                )
+            except Exception as e:
+                ai_error = str(e)
+
+        execution_time = (datetime.utcnow() - start_time).total_seconds()
+
+        result = {
            "website_url": website_url,
            "analysis_type": "content_strategy",
-            "competitors_analyzed": len(competitors) if competitors else 0,
-            "content_gaps": [
-                {"topic": "SEO best practices", "opportunity_score": 85, "difficulty": "Medium"},
-                {"topic": "Content marketing", "opportunity_score": 78, "difficulty": "Low"}
-            ],
-            "opportunities": [
-                {"type": "Trending topics", "count": 15, "potential_traffic": "High"},
-                {"type": "Long-tail keywords", "count": 45, "potential_traffic": "Medium"}
-            ],
-            "content_performance": {"top_performing": 12, "underperforming": 8},
-            "recommendations": [
-                "Create content around trending SEO topics",
-                "Optimize existing content for long-tail keywords",
-                "Develop content series for better engagement"
-            ],
-            "competitive_analysis": {"content_leadership": "moderate", "gaps_identified": 8}
+            "timestamp": datetime.utcnow().isoformat(),
+            "execution_time": execution_time,
+            "inputs": {
+                "competitors": competitors[:5],
+                "target_keywords": target_keywords,
+                "custom_parameters": custom_parameters
+            },
+            "data_sources": {
+                "user_sitemap_url": discovered_user_sitemap,
+                "competitor_sitemaps": competitor_sitemaps
+            },
+            "deterministic_insights": deterministic,
+            "ai_strategy": ai_strategy,
+            "ai_error": ai_error
        }
+
+        await seo_logger.log_tool_usage(
+            tool_name=self.service_name,
+            input_data={
+                "website_url": website_url,
+                "competitors_count": len(competitors),
+                "target_keywords_count": len(target_keywords),
+                "has_user_sitemap": bool(discovered_user_sitemap)
+            },
+            output_data={
+                "website_url": website_url,
+                "has_ai_strategy": bool(ai_strategy),
+                "has_ai_error": bool(ai_error),
+                "execution_time": execution_time
+            },
+            success=True if (ai_strategy is not None or deterministic is not None) else False
+        )
+
+        return result
+
+    async def analyze_competitive_sitemap_benchmarking(
+        self,
+        website_url: str,
+        competitors: List[str],
+        max_competitors: Optional[int] = None,
+        user_id: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Benchmark a site's sitemap against its competitors' sitemaps.
+
+        Competitor sitemap discovery and analysis run concurrently; competitor failures are
+        collected under ``competitors.errors`` in the result. ``competitors`` may be None or empty.
+        """
+        start_time = datetime.utcnow()
+        # Normalize before the first log line: len() on a None argument would raise TypeError.
+        competitors = [c for c in (competitors or []) if isinstance(c, str) and c.strip()]
+        if max_competitors:
+            competitors = competitors[: max(0, int(max_competitors))]
+        # Using WARNING level to ensure visibility in production logs as requested by user
+        logger.warning(f"🚀 [START] Competitive sitemap benchmarking for {website_url} with {len(competitors)} competitors")
+
+        if not competitors:
+            logger.warning(f"No competitors provided for benchmarking {website_url}")
+
+        sitemap_service = SitemapService()
+
+        logger.warning(f"🔍 [PROGRESS] Discovering user sitemap for {website_url}")
+        discovered_user_sitemap = await sitemap_service.discover_sitemap_url(website_url)
+        user_sitemap_result = None
+        user_error = None
+        if discovered_user_sitemap:
+            try:
+                logger.warning(f"⚡ [PROGRESS] Analyzing user sitemap: {discovered_user_sitemap}")
+                user_sitemap_result = await sitemap_service.analyze_sitemap(
+                    sitemap_url=discovered_user_sitemap,
+                    analyze_content_trends=True,
+                    analyze_publishing_patterns=True,
+                    include_ai_insights=False,
+                    user_id=user_id
+                )
+            except Exception as e:
+                user_error = str(e)
+                logger.error(f"Error analyzing user sitemap {discovered_user_sitemap}: {e}")
+        else:
+            user_error = "No sitemap discovered for your website. Please ensure your site has a valid sitemap.xml."
+            logger.warning(f"⚠️ No sitemap found for user website {website_url}")
+
+        competitor_sitemaps: Dict[str, Optional[str]] = {}
+        competitor_results: Dict[str, Dict[str, Any]] = {}
+        competitor_errors: Dict[str, str] = {}
+
+        logger.warning(f"🔍 [PROGRESS] Discovering sitemaps for {len(competitors)} competitors")
+        discovery_tasks = [sitemap_service.discover_sitemap_url(u) for u in competitors]
+        discovery_results = await asyncio.gather(*discovery_tasks, return_exceptions=True)
+        for url, res in zip(competitors, discovery_results):
+            if isinstance(res, Exception):
+                competitor_sitemaps[url] = None
+                competitor_errors[url] = str(res)
+                logger.warning(f"Error discovering sitemap for competitor {url}: {res}")
+            else:
+                competitor_sitemaps[url] = res
+                if not res:
+                    competitor_errors[url] = "No sitemap found"
+                    logger.info(f"ℹ️ No sitemap found for competitor {url}")
+                else:
+                    logger.info(f"✅ Found sitemap for competitor {url}: {res}")
+
+        to_analyze = [(url, competitor_sitemaps.get(url)) for url in competitors if competitor_sitemaps.get(url)]
+        logger.warning(f"⚡ [PROGRESS] Analyzing {len(to_analyze)} competitor sitemaps")
+
+        # Helper for safe analysis with timeout
+        async def analyze_with_timeout(url, sm):
+            try:
+                logger.warning(f"🕒 [START] Analyzing {url} with 300s timeout")
+                # 5 minute timeout per competitor to prevent total blocking
+                result = await asyncio.wait_for(
+                    sitemap_service.analyze_sitemap(
+                        sitemap_url=sm,
+                        analyze_content_trends=True,
+                        analyze_publishing_patterns=True,
+                        include_ai_insights=False,
+                        user_id=user_id
+                    ),
+                    timeout=300.0
+                )
+                logger.warning(f"✅ [DONE] Analysis finished for {url}")
+                return result
+            except asyncio.TimeoutError:
+                logger.error(f"⏱️ Analysis timed out for competitor {url} (limit: 300s)")
+                return TimeoutError("Analysis timed out after 300s")
+            except Exception as e:
+                msg = str(e)
+                if "URL returned a webpage" in msg or "Failed to parse sitemap XML" in msg or "no element found" in msg:
+                    logger.warning(f"⚠️ Analysis skipped for {url}: Invalid sitemap ({msg})")
+                else:
+                    logger.error(f"❌ Analysis failed for {url}: {e}")
+                return e
+
+        analysis_tasks = [analyze_with_timeout(url, sm) for (url, sm) in to_analyze]
+        analysis_results = await asyncio.gather(*analysis_tasks, return_exceptions=True)
+        for (url, _), res in zip(to_analyze, analysis_results):
+            if isinstance(res, Exception):
+                competitor_errors[url] = str(res)
+                if "URL returned a webpage" not in str(res) and "Failed to parse sitemap XML" not in str(res) and "no element found" not in str(res):
+                    logger.error(f"Error analyzing sitemap for competitor {url}: {res}")
+            else:
+                competitor_results[url] = res
+
+        user_summary = self._summarize_sitemap(user_sitemap_result)
+        competitor_summaries: Dict[str, Dict[str, Any]] = {}
+        for competitor_url, result in competitor_results.items():
+            if result and isinstance(result, dict) and "error" not in result:
+                competitor_summaries[competitor_url] = self._summarize_sitemap(result)
+
+        benchmark = self._build_competitive_sitemap_benchmark(
+            website_url=website_url,
+            user_summary=user_summary,
+            competitor_summaries=competitor_summaries
+        )
+
+        execution_time = (datetime.utcnow() - start_time).total_seconds()
+
+        return {
+            "analysis_type": "competitive_sitemap_benchmarking",
+            "timestamp": datetime.utcnow().isoformat(),
+            "execution_time": execution_time,
+            "inputs": {
+                "website_url": website_url,
+                "competitors": competitors,
+                "max_competitors": max_competitors
+            },
+            "data_sources": {
+                "user_sitemap_url": discovered_user_sitemap,
+                "competitor_sitemaps": competitor_sitemaps
+            },
+            "user": {
+                "summary": user_summary,
+                "error": user_error
+            },
+            "competitors": {
+                "summaries": competitor_summaries,
+                "errors": competitor_errors
+            },
+            "benchmark": benchmark
+        }
+
+    def _safe_ratio(self, numerator: Any, denominator: Any) -> Optional[float]:
+        """Return numerator/denominator rounded to 4 places; None if unconvertible or denominator <= 0."""
+        try:
+            top, bottom = float(numerator), float(denominator)
+        except Exception:
+            return None
+        if bottom <= 0:
+            return None
+        return round(top / bottom, 4)
+
+    def _as_float(self, value: Any) -> Optional[float]:
+        """Convert value to float; None and unconvertible values yield None."""
+        # float(None) raises TypeError, so None needs no separate branch.
+        try:
+            return float(value)
+        except Exception:
+            return None
+
+    def _median(self, values: List[Optional[float]]) -> Optional[float]:
+        """Return the median of the int/float entries in values as a float, or None if there are none."""
+        numeric = list(filter(lambda item: isinstance(item, (int, float)), values))
+        try:
+            # An empty selection is reported as None instead of reaching statistics.median.
+            return float(statistics.median(numeric)) if numeric else None
+        except Exception:
+            return None
+
+    def _build_competitive_sitemap_benchmark(
+        self,
+        website_url: str,
+        user_summary: Dict[str, Any],
+        competitor_summaries: Dict[str, Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """Compare the user's sitemap summary with the competitors' summaries.
+
+        Returns per-competitor metrics, sections the user lacks, and volume/velocity/depth opportunities."""
+        user_patterns = user_summary.get("top_url_patterns") or {}
+        user_sections = set(user_patterns.keys())
+
+        competitor_section_stats: Dict[str, Dict[str, Any]] = {}
+        competitor_metrics: List[Dict[str, Any]] = []
+
+        for competitor_url, summary in competitor_summaries.items():
+            patterns = summary.get("top_url_patterns") or {}
+            total_urls = summary.get("total_urls") or 0
+            span_days = (summary.get("date_range") or {}).get("span_days")
+            competitor_metrics.append({
+                "competitor_url": competitor_url,
+                "total_urls": summary.get("total_urls"),
+                "sections_count": len(patterns.keys()),
+                "average_path_depth": summary.get("average_path_depth"),
+                "max_path_depth": summary.get("max_path_depth"),
+                "publishing_velocity": summary.get("publishing_velocity"),
+                "lastmod_coverage": self._safe_ratio(summary.get("total_dated_urls"), total_urls) if isinstance(summary.get("total_dated_urls"), (int, float)) else None,
+                "span_days": span_days
+            })
+
+            for section, count in patterns.items():
+                if not section:
+                    continue
+                if section not in competitor_section_stats:
+                    competitor_section_stats[section] = {
+                        "competitor_presence": 0,
+                        "total_url_count": 0
+                    }
+                competitor_section_stats[section]["competitor_presence"] += 1
+                competitor_section_stats[section]["total_url_count"] += int(count or 0)
+
+        competitor_count = len(competitor_summaries)
+        missing_sections = []
+        for section, stats in sorted(
+            competitor_section_stats.items(),
+            key=lambda x: (x[1].get("competitor_presence", 0), x[1].get("total_url_count", 0)),
+            reverse=True
+        ):
+            # Filter out known non-content patterns:
+            # 1. Sections present in user site
+            # 2. Short sections <= 3 chars (likely language codes like /en, /es, /fr)
+            # 3. Common technical paths (wp-content, wp-includes, cgi-bin)
+            if section in user_sections:
+                continue
+            if len(section) <= 3: # e.g., /es, /fr, /pt
+                continue
+            if any(tech in section.lower() for tech in ['wp-content', 'wp-includes', 'cgi-bin', 'assets', 'static']):
+                continue
+            # Needs presence at >= 40% of competitors and never fewer than 2, so one competitor yields no gaps.
+            if competitor_count > 0 and stats.get("competitor_presence", 0) >= max(2, int(round(0.4 * competitor_count))):
+                missing_sections.append({
+                    "section": section,
+                    # Ensure presence is a normalized ratio (0.0 - 1.0)
+                    "competitor_presence": self._safe_ratio(stats.get("competitor_presence", 0), competitor_count) or 0,
+                    "competitor_count": stats.get("competitor_presence"),
+                    "total_url_count": stats.get("total_url_count", 0)
+                })
+        missing_sections = missing_sections[:15]
+
+        velocity_values = [self._as_float(s.get("publishing_velocity")) for s in competitor_summaries.values()]
+        depth_values = [self._as_float(s.get("average_path_depth")) for s in competitor_summaries.values()]
+        competitor_velocity_median = self._median(velocity_values)
+        competitor_depth_median = self._median(depth_values)
+
+        user_velocity = self._as_float(user_summary.get("publishing_velocity"))
+        user_depth = self._as_float(user_summary.get("average_path_depth"))
+        user_total_urls = user_summary.get("total_urls") or 0
+
+        opportunities = []
+        # Note: 'missing_sections' opportunity removed to avoid duplication with 'Competitor Content Strategy Patterns' section
+
+        # Insight 1: Content Volume Gap (user has under 80% of the competitor median URL count)
+        competitor_total_urls_list = [m["total_urls"] for m in competitor_metrics if m.get("total_urls")]
+        competitor_urls_median = self._median(competitor_total_urls_list)
+        if competitor_urls_median and user_total_urls < competitor_urls_median * 0.8:
+             opportunities.append({
+                "type": "content_volume_gap",
+                "title": "Competitors have significantly more content",
+                "metrics": {
+                    "user_total_pages": user_total_urls,
+                    "competitor_median_total_pages": int(competitor_urls_median)
+                }
+            })
+
+        # Insight 2: Publishing Velocity Gap (user is below 75% of the competitor median)
+        if competitor_velocity_median is not None and user_velocity is not None:
+            if user_velocity < competitor_velocity_median * 0.75:
+                opportunities.append({
+                    "type": "publishing_velocity_gap",
+                    "title": "Competitors appear to publish more frequently",
+                    "metrics": {
+                        "user_publishing_velocity": user_velocity,
+                        "competitor_median_publishing_velocity": competitor_velocity_median
+                    }
+                })
+
+        # Insight 3: Architecture Depth Gap (user is more than 0.5 levels shallower than the median)
+        if competitor_depth_median is not None and user_depth is not None:
+            if user_depth < competitor_depth_median - 0.5:
+                opportunities.append({
+                    "type": "architecture_depth_gap",
+                    "title": "Competitors have deeper site structure",
+                    "metrics": {
+                        "user_average_path_depth": user_depth,
+                        "competitor_median_average_path_depth": competitor_depth_median
+                    }
+                })
+
+        competitor_metrics_sorted = sorted(
+            competitor_metrics,
+            key=lambda x: (x.get("total_urls") or 0),
+            reverse=True
+        )
+
+        return {
+            "website_url": website_url,
+            "competitors_analyzed": competitor_count,
+            "user_sections_count": len(user_sections),
+            "competitor_section_leaders": competitor_metrics_sorted[:10],
+            "gaps": {
+                "missing_sections": missing_sections
+            },
+            "opportunities": opportunities
+        }
+
+    def _summarize_sitemap(self, sitemap_result: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+        """Flatten a sitemap analysis result into the fields used for comparisons ({} if unusable)."""
+        if not (sitemap_result and isinstance(sitemap_result, dict)):
+            return {}
+        structure_info = sitemap_result.get("structure_analysis") or {}
+        trend_info = sitemap_result.get("content_trends") or {}
+        pattern_info = sitemap_result.get("publishing_patterns") or {}
+        return {
+            "total_urls": sitemap_result.get("total_urls"),
+            "top_url_patterns": structure_info.get("url_patterns") or {},
+            "file_types": structure_info.get("file_types") or {},
+            **{key: structure_info.get(key) for key in ("average_path_depth", "max_path_depth")},
+            "publishing_velocity": trend_info.get("publishing_velocity"),
+            "date_range": trend_info.get("date_range") or {},
+            "total_dated_urls": trend_info.get("total_dated_urls"),
+            "priority_distribution": pattern_info.get("priority_distribution") or {},
+            "changefreq_distribution": pattern_info.get("changefreq_distribution") or {},
+        }
+
+    def _build_deterministic_insights(
+        self,
+        website_url: str,
+        user_sitemap_url: Optional[str],
+        user_sitemap_result: Optional[Dict[str, Any]],
+        competitor_sitemaps: Dict[str, Optional[str]],
+        competitor_results: Dict[str, Dict[str, Any]],
+        target_keywords: List[str]
+    ) -> Dict[str, Any]:
+        """Build sitemap summaries, sections only competitors have, and keyword hints (competitor_sitemaps is unused)."""
+        user_summary = self._summarize_sitemap(user_sitemap_result)
+        competitor_summaries: Dict[str, Dict[str, Any]] = {
+            comp_url: self._summarize_sitemap(comp_result)
+            for comp_url, comp_result in competitor_results.items()
+            if comp_result and isinstance(comp_result, dict) and "error" not in comp_result
+        }
+
+        user_sections = set((user_summary.get("top_url_patterns") or {}).keys())
+        section_totals: Dict[str, int] = {}
+        for summary in competitor_summaries.values():
+            for section_name, url_count in (summary.get("top_url_patterns") or {}).items():
+                section_totals[section_name] = section_totals.get(section_name, 0) + int(url_count or 0)
+
+        ranked_sections = sorted(section_totals.items(), key=lambda item: item[1], reverse=True)
+        missing_vs_competitors = [
+            {"section": section_name, "competitor_url_count": total}
+            for section_name, total in ranked_sections
+            if section_name not in user_sections and section_name
+        ][:10]
+
+        keyword_hints = []
+        if target_keywords:
+            haystack = " ".join(sorted(user_sections)).lower()
+            for raw_keyword in target_keywords[:25]:
+                keyword = (raw_keyword or "").strip()
+                if keyword:
+                    keyword_hints.append({"keyword": keyword, "seen_in_url_patterns": keyword.lower() in haystack})
+
+        return {
+            "website_url": website_url,
+            "sitemap_found": bool(user_sitemap_url),
+            "user_sitemap_summary": user_summary,
+            "competitor_sitemap_summaries": competitor_summaries,
+            "gaps_vs_competitors": {
+                "missing_sections": missing_vs_competitors
+            },
+            "keyword_hints": keyword_hints
+        }
+
+    def _get_system_prompt(self) -> str:
+        """Return the fixed system prompt: an SEO-expert persona plus the JSON-only output rule."""
+        audience = "non-technical content creators, digital marketers, and solopreneurs"
+        persona = f"You are an SEO and content strategy expert for {audience}. "
+        return persona + "Return ONLY valid minified JSON."
+
+    def _build_ai_prompt(
+        self,
+        website_url: str,
+        target_keywords: List[str],
+        custom_parameters: Dict[str, Any],
+        deterministic_summary: Dict[str, Any]
+    ) -> str:
+        """Assemble the LLM user prompt from output rules, request inputs and sitemap-derived data."""
+        # Only the top-level key names are sent to the model, in this exact order.
+        schema_keys = [
+            "positioning_summary", "content_gaps", "topic_clusters",
+            "publishing_recommendations", "quick_wins", "risks", "meta",
+        ]
+
+        segments = [
+            "RULES:\n",
+            "- Return ONE single-line MINIFIED JSON object only.\n",
+            "- No markdown, code fences, or prose.\n",
+            "- Use EXACTLY the top-level keys from this schema: ",
+            f"{schema_keys}.\n",
+            "- For arrays of objects, keep objects small and consistent.\n\n",
+            f"WEBSITE: {website_url}\n",
+            f"TARGET_KEYWORDS: {target_keywords[:25]}\n",
+            f"CUSTOM_PARAMETERS: {custom_parameters}\n\n",
+            # The serialized summary is cut at 12000 characters.
+            f"SITEMAP_DERIVED_DATA (compact): {json.dumps(deterministic_summary, ensure_ascii=False)[:12000]}\n\n",
+            "Now produce the strategy JSON.",
+        ]
+
+        return "".join(segments)
+
+    def _parse_json_response(self, text: str) -> Dict[str, Any]:
+        """Parse the LLM reply into a dict, tolerating code fences and surrounding prose.
+        Raises ValueError (incl. json.JSONDecodeError) when no JSON object can be decoded."""
+        cleaned = (text or "").replace("```json", "").replace("```", "").strip()
+        match = re.search(r"\{.*\}", cleaned, flags=re.DOTALL)  # first "{" through last "}"
+        parsed = json.loads(match.group(0) if match else cleaned)
+        if not isinstance(parsed, dict):
+            raise ValueError(f"Expected a JSON object from the model, got {type(parsed).__name__}")
+        return parsed
    
    async def health_check(self) -> Dict[str, Any]:
        """Health check for the content strategy service"""
@@ -53,4 +572,4 @@ class ContentStrategyService:
            "status": "operational",
            "service": self.service_name,
            "last_check": datetime.utcnow().isoformat()
-        }
+        }
--- a/backend/services/seo_tools/meta_description_service.py
+++ b/backend/services/seo_tools/meta_description_service.py
@@ -27,7 +27,8 @@ class MetaDescriptionService:
        tone: str = "General",
        search_intent: str = "Informational Intent",
        language: str = "English",
-        custom_prompt: Optional[str] = None
+        custom_prompt: Optional[str] = None,
+        user_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Generate AI-powered meta descriptions based on keywords and parameters
@@ -65,7 +66,8 @@ class MetaDescriptionService:
            
            ai_response = llm_text_gen(
                prompt=prompt,
-                system_prompt=self._get_system_prompt(language)
+                system_prompt=self._get_system_prompt(language),
+                user_id=user_id
            )
            
            # Parse and structure the response
@@ -417,4 +419,4 @@ Focus on creating descriptions that will improve click-through rates for content
                "service": self.service_name,
                "error": str(e),
                "last_check": datetime.utcnow().isoformat()
-            }
+            }
--- a/backend/services/seo_tools/on_page_seo_service.py
+++ b/backend/services/seo_tools/on_page_seo_service.py
@@ -5,9 +5,13 @@ Comprehensive on-page SEO analyzer with AI-enhanced insights
 for content optimization and technical improvements.
 """

+import aiohttp
+from bs4 import BeautifulSoup
 from typing import Dict, Any, List, Optional
 from datetime import datetime
 from loguru import logger
+import re
+from urllib.parse import urlparse

 class OnPageSEOService:
    """Service for comprehensive on-page SEO analysis"""
@@ -17,6 +21,155 @@ class OnPageSEOService:
        self.service_name = "on_page_seo_analyzer"
        logger.info(f"Initialized {self.service_name}")
    
+    async def _fetch_page(self, url: str) -> tuple[Optional[str], int]:
+        """Fetch url and return (html, 200) on success, else (None, status); any exception gives (None, 500)."""
+        headers = {'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +https://alwrity.com)'}
+        # aiohttp documents ``timeout`` as a ClientTimeout object; 10 s covers the whole request.
+        timeout = aiohttp.ClientTimeout(total=10)
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, headers=headers, timeout=timeout) as response:
+                    if response.status == 200:
+                        return await response.text(), 200
+                    return None, response.status
+        except Exception as e:
+            logger.error(f"Error fetching {url}: {str(e)}")
+            return None, 500
+
+    def _analyze_meta_tags(self, soup: BeautifulSoup) -> Dict[str, Any]:
+        """Score title, description, viewport and social meta tags; tags lacking the attribute count as missing."""
+        title = (soup.title.string or "").strip() if soup.title else None  # length excludes padding
+        meta_desc = soup.find('meta', attrs={'name': 'description'})
+        viewport = soup.find('meta', attrs={'name': 'viewport'})
+        robots = soup.find('meta', attrs={'name': 'robots'})
+        charset = soup.find('meta', attrs={'charset': True})
+
+        # Social Tags
+        og_title = soup.find('meta', property='og:title')
+        og_desc = soup.find('meta', property='og:description')
+        og_image = soup.find('meta', property='og:image')
+        twitter_card = soup.find('meta', attrs={'name': 'twitter:card'})
+
+        issues = []
+        score = 100
+
+        # Title Analysis
+        if not title:
+            issues.append("Missing title tag")
+            score -= 20
+        elif len(title) < 30 or len(title) > 60:
+            issues.append(f"Title length ({len(title)} chars) should be 30-60 chars")
+            score -= 10
+
+        # Description Analysis (.get avoids KeyError on a <meta> without a content attribute)
+        desc_content = meta_desc.get('content') if meta_desc else None
+        if not desc_content:
+            issues.append("Missing meta description")
+            score -= 20
+        elif len(desc_content) < 70 or len(desc_content) > 160:
+            issues.append(f"Description length ({len(desc_content)} chars) should be 70-160 chars")
+            score -= 10
+
+        # Viewport
+        if not viewport:
+            issues.append("Missing viewport meta tag")
+            score -= 20
+
+        og_found = list(filter(None, ['Title' if og_title else '', 'Desc' if og_desc else '', 'Image' if og_image else '']))
+
+        return {
+            "title_length": f"{len(title)} chars" if title else "Missing",
+            "meta_description_length": f"{len(desc_content)} chars" if desc_content else "Missing",
+            "has_viewport": bool(viewport),
+            "charset": charset['charset'] if charset else "Missing",
+            "robots_meta": (robots.get('content') if robots else None) or "Missing (Default: index, follow)",
+            "og_tags": f"Found: {', '.join(og_found)}" if og_found else "None",
+            "twitter_card": (twitter_card.get('content') if twitter_card else None) or "Missing",
+            "score": max(0, score),
+            "issues": issues
+        }
+
+    def _analyze_technical(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
+        """Score canonical tag and H1 usage and count JSON-LD schema blocks (url is currently unused)."""
+        canonical = soup.find('link', attrs={'rel': 'canonical'})
+        # A canonical link without an href is treated as missing; .get() avoids a KeyError.
+        canonical_href = canonical.get('href') if canonical else None
+        schema = soup.find_all('script', type='application/ld+json')
+        h1_tags = soup.find_all('h1')
+
+        issues = []
+        score = 100
+
+        if not canonical_href:
+            issues.append("Missing canonical tag")
+            score -= 10
+        if not h1_tags:
+            issues.append("Missing H1 tag")
+            score -= 20
+        elif len(h1_tags) > 1:
+            issues.append(f"Multiple H1 tags found ({len(h1_tags)})")
+            score -= 10
+
+        return {
+            "canonical_tag": canonical_href or "Missing",
+            "schema_markup": f"Found {len(schema)} schema objects",
+            "h1_count": len(h1_tags),
+            "score": max(0, score),
+            "issues": issues
+        }
+
+    def _analyze_content(self, soup: BeautifulSoup) -> Dict[str, Any]:
+        """Score word count and image alt text; removes <script>/<style> from the soup it is given."""
+        # NOTE(review): extract() mutates the caller's soup, so run script-dependent checks before this.
+        for script in soup(["script", "style"]):
+            script.extract()
+
+        text = soup.get_text()
+        words = len(re.findall(r'\w+', text))
+
+        images = soup.find_all('img')
+        images_without_alt = sum(1 for img in images if not img.get('alt'))
+
+        issues = []
+        score = 100
+
+        if words < 300:
+            issues.append(f"Low word count ({words} words)")
+            score -= 20
+
+        if images_without_alt > 0:
+            issues.append(f"{images_without_alt} images missing alt text")
+            score -= 10
+
+        return {
+            "word_count": words,
+            "total_images": len(images),
+            "images_without_alt": images_without_alt,
+            "readability": "Good" if words >= 300 else "Needs Improvement", # Placeholder; same 300-word cut-off as the issue check
+            "score": max(0, score),
+            "issues": issues
+        }
+
+    def _analyze_url_structure(self, url: str) -> Dict[str, Any]:
+        """Split url into protocol, domain, HTTPS flag and path depth (count of non-empty path segments)."""
+        parsed = urlparse(url)
+        return {
+            "protocol": parsed.scheme, "domain": parsed.netloc,
+            "path_depth": len([segment for segment in parsed.path.split('/') if segment]),
+            "is_https": parsed.scheme == 'https'
+        }
+
+    def _calculate_overall_score(self, *analyses) -> int:
+        """Return the rounded mean of each analysis dict's 'score' (missing counts as 0); 0 if none given."""
+        return round(sum(a.get('score', 0) for a in analyses) / len(analyses)) if analyses else 0
+
+    def _generate_summary(self, *analyses) -> Dict[str, Any]:
+        """Gather every analysis' 'issues' into one 'critical_issues' list, each tagged critical/SEO."""
+        return {"critical_issues": [
+            {"message": text, "severity": "critical", "category": "SEO"}
+            for report in analyses for text in report.get('issues', [])
+        ]}
+
    async def analyze_on_page_seo(
        self,
        url: str,
@@ -25,18 +178,53 @@ class OnPageSEOService:
        analyze_content_quality: bool = True
    ) -> Dict[str, Any]:
        """Analyze on-page SEO factors"""
-        # Placeholder implementation
-        return {
-            "url": url,
-            "overall_score": 75,
-            "title_analysis": {"score": 80, "issues": [], "recommendations": []},
-            "meta_description": {"score": 70, "issues": [], "recommendations": []},
-            "heading_structure": {"score": 85, "issues": [], "recommendations": []},
-            "content_analysis": {"score": 75, "word_count": 1500, "readability": "Good"},
-            "keyword_analysis": {"target_keywords": target_keywords or [], "optimization": "Moderate"},
-            "image_analysis": {"total_images": 10, "missing_alt": 2} if analyze_images else {},
-            "recommendations": ["Optimize meta description", "Add more target keywords"]
-        }
+        try:
+            # Add protocol if missing
+            if not url.startswith(('http://', 'https://')):
+                url = 'https://' + url
+
+            html_content, status_code = await self._fetch_page(url)
+            
+            if not html_content:
+                # Return error structure
+                return {
+                    "url": url,
+                    "overall_score": 0,
+                    "summary": {"critical_issues": [{"message": f"Failed to fetch URL (Status: {status_code})", "severity": "critical", "category": "Connectivity"}]},
+                    "meta": {}, "technical": {}, "content_health": {}, "url_structure": {}, "performance": {}, "accessibility": {}, "ux": {}
+                }
+            
+            soup = BeautifulSoup(html_content, 'html.parser')
+            
+            # Run Analyses
+            meta_analysis = self._analyze_meta_tags(soup)
+            technical_analysis = self._analyze_technical(soup, url)
+            content_analysis = self._analyze_content(soup)
+            url_analysis = self._analyze_url_structure(url)
+            
+            result = {
+                "url": url,
+                "overall_score": self._calculate_overall_score(meta_analysis, technical_analysis, content_analysis),
+                "meta": meta_analysis,
+                "technical": technical_analysis,
+                "content_health": content_analysis,
+                "url_structure": url_analysis,
+                "performance": {"load_time": "Real-time check pending"},
+                "accessibility": {"images_without_alt": content_analysis["images_without_alt"]},
+                "ux": {"viewport": meta_analysis["has_viewport"], "mobile_friendly": bool(meta_analysis["has_viewport"])},
+                "summary": self._generate_summary(meta_analysis, technical_analysis, content_analysis)
+            }
+            
+            return result
+            
+        except Exception as e:
+            logger.error(f"Error analyzing {url}: {str(e)}")
+            return {
+                "url": url,
+                "overall_score": 0,
+                "summary": {"critical_issues": [{"message": str(e), "severity": "critical", "category": "System"}]},
+                "meta": {}, "technical": {}, "content_health": {}, "url_structure": {}, "performance": {}, "accessibility": {}, "ux": {}
+            }
    
    async def health_check(self) -> Dict[str, Any]:
        """Health check for the on-page SEO service"""
@@ -44,4 +232,4 @@ class OnPageSEOService:
            "status": "operational",
            "service": self.service_name,
            "last_check": datetime.utcnow().isoformat()
-        }
+        }
--- a/backend/services/seo_tools/pagespeed_service.py
+++ b/backend/services/seo_tools/pagespeed_service.py
@@ -31,7 +31,8 @@ class PageSpeedService:
        url: str,
        strategy: str = "DESKTOP",
        locale: str = "en",
-        categories: List[str] = None
+        categories: List[str] = None,
+        user_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Analyze website performance using Google PageSpeed Insights
@@ -70,7 +71,7 @@ class PageSpeedService:
            structured_results = self._structure_pagespeed_results(pagespeed_data)
            
            # Generate AI-enhanced insights
-            ai_insights = await self._generate_ai_insights(structured_results, url, strategy)
+            ai_insights = await self._generate_ai_insights(structured_results, url, strategy, user_id=user_id)
            
            # Calculate optimization priority
            optimization_plan = self._create_optimization_plan(structured_results)
@@ -281,7 +282,8 @@ class PageSpeedService:
        self,
        structured_results: Dict[str, Any],
        url: str,
-        strategy: str
+        strategy: str,
+        user_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Generate AI-powered insights and recommendations"""
        
@@ -299,7 +301,8 @@ class PageSpeedService:
            # Generate AI insights
            ai_response = llm_text_gen(
                prompt=prompt,
-                system_prompt=self._get_system_prompt()
+                system_prompt=self._get_system_prompt(),
+                user_id=user_id
            )
            
            # Parse AI response
@@ -598,4 +601,4 @@ Focus on practical advice that content creators and digital marketers can unders
                "service": self.service_name,
                "error": str(e),
                "last_check": datetime.utcnow().isoformat()
-            }
+            }
--- a/backend/services/seo_tools/sitemap_service.py
+++ b/backend/services/seo_tools/sitemap_service.py
@@ -8,12 +8,14 @@ content distribution, and publishing patterns for SEO optimization.
 import aiohttp
 import asyncio
 import re
+import json
 from typing import Dict, Any, List, Optional
 from datetime import datetime, timedelta
 from loguru import logger
 import xml.etree.ElementTree as ET
 from urllib.parse import urlparse, urljoin
 import pandas as pd
+import gzip

 from ..llm_providers.main_text_generation import llm_text_gen
 from middleware.logging_middleware import seo_logger
@@ -52,7 +54,9 @@ class SitemapService:
        self,
        sitemap_url: str,
        analyze_content_trends: bool = True,
-        analyze_publishing_patterns: bool = True
+        analyze_publishing_patterns: bool = True,
+        include_ai_insights: bool = True,
+        user_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Analyze website sitemap for structure and patterns
@@ -92,10 +96,11 @@ class SitemapService:
            if analyze_publishing_patterns and sitemap_data.get("urls"):
                publishing_patterns = self._analyze_publishing_patterns(sitemap_data["urls"])
            
-            # Generate AI insights
-            ai_insights = await self._generate_ai_insights(
-                structure_analysis, content_trends, publishing_patterns, sitemap_url
-            )
+            ai_insights = {}
+            if include_ai_insights:
+                ai_insights = await self._generate_ai_insights(
+                    structure_analysis, content_trends, publishing_patterns, sitemap_url, user_id=user_id
+                )
            
            execution_time = (datetime.utcnow() - start_time).total_seconds()
            
@@ -119,7 +124,8 @@ class SitemapService:
                input_data={
                    "sitemap_url": sitemap_url,
                    "analyze_content_trends": analyze_content_trends,
-                    "analyze_publishing_patterns": analyze_publishing_patterns
+                    "analyze_publishing_patterns": analyze_publishing_patterns,
+                    "include_ai_insights": include_ai_insights
                },
                output_data=result,
                success=True
@@ -145,19 +151,88 @@ class SitemapService:
            
            raise
    
-    async def _fetch_sitemap_data(self, sitemap_url: str) -> Dict[str, Any]:
+    async def _fetch_sitemap_data(self, sitemap_url: str, depth: int = 0, session: aiohttp.ClientSession = None) -> Dict[str, Any]:
        """Fetch and parse sitemap data"""
        
+        # Reduced max depth from 3 to 2 to prevent infinite recursion/hanging on massive sites
+        if depth > 2:
+            logger.info(f"🛑 Max recursion depth (2) reached for sitemap {sitemap_url}")
+            return {"urls": [], "sitemaps": [], "total_urls": 0}
+
+        # Use passed session or create a new local one if it's the top-level call
+        local_session = False
+        if session is None:
+            local_session = True
+            # Limit pool size and set strict timeouts
+            connector = aiohttp.TCPConnector(limit_per_host=5, force_close=True)
+            # Increased total timeout to 60s for slow sitemaps, but kept connect/read strict
+            timeout = aiohttp.ClientTimeout(total=60, connect=10, sock_read=30)
+            session = aiohttp.ClientSession(connector=connector, timeout=timeout)
+
        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(sitemap_url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+            logger.info(f"🔍 Fetching sitemap: {sitemap_url} (depth={depth})")
+            # 10MB limit for sitemaps
+            MAX_SITEMAP_SIZE = 10 * 1024 * 1024 
+            
+            try:
+                async with session.get(sitemap_url) as response:
                    if response.status != 200:
                        raise Exception(f"Failed to fetch sitemap: HTTP {response.status}")
                    
-                    content = await response.text()
-                    
-                    # Parse XML
-                    root = ET.fromstring(content)
+                    # Check Content-Type header
+                    content_type = response.headers.get("Content-Type", "").lower()
+                    if "text/html" in content_type:
+                        raise Exception("URL returned a webpage (HTML), not a valid XML sitemap")
+
+                    # Check Content-Length header if available
+                    content_length = response.headers.get("Content-Length")
+                    if content_length and int(content_length) > MAX_SITEMAP_SIZE:
+                        raise Exception(f"Sitemap too large: {content_length} bytes")
+
+                    # Read with size limit (safe read)
+                    raw = await response.content.read(MAX_SITEMAP_SIZE + 1)
+                    if len(raw) > MAX_SITEMAP_SIZE:
+                        raise Exception(f"Sitemap size exceeds limit of {MAX_SITEMAP_SIZE} bytes")
+
+                    if sitemap_url.lower().endswith(".gz") or (len(raw) >= 2 and raw[0] == 0x1F and raw[1] == 0x8B):
+                        try:
+                            raw = gzip.decompress(raw)
+                        except Exception:
+                            pass
+
+                    try:
+                        content = raw.decode(response.charset or "utf-8", errors="replace")
+                    except Exception:
+                        content = raw.decode("utf-8", errors="replace")
+
+                    content_stripped = content.lstrip()
+
+                    if not content_stripped.startswith("<"):
+                        urls = []
+                        # Limit text sitemaps to 50k lines
+                        lines = content.splitlines()[:50000]
+                        for line in lines:
+                            line_clean = (line or "").strip()
+                            if not line_clean or line_clean.startswith("#"):
+                                continue
+                            if line_clean.startswith("http://") or line_clean.startswith("https://"):
+                                urls.append({"loc": line_clean})
+                        return {
+                            "urls": urls,
+                            "sitemaps": [],
+                            "total_urls": len(urls)
+                        }
+
+                    # Check for HTML content disguised as XML
+                    if content.strip().lower().startswith(("<!doctype html", "<html")):
+                        raise Exception("URL returned a webpage (HTML), not a valid XML sitemap")
+
+                    # Use defusedxml for safety if available, otherwise standard ET
+                    try:
+                        import defusedxml.ElementTree as DET
+                        root = DET.fromstring(content)
+                    except ImportError:
+                        root = ET.fromstring(content)
                    
                    # Handle different sitemap formats
                    urls = []
@@ -172,17 +247,28 @@ class SitemapService:
                                if loc is not None:
                                    sitemaps.append(loc.text)
                        
-                        # Fetch and parse nested sitemaps
-                        for nested_url in sitemaps[:10]:  # Limit to 10 sitemaps
-                            try:
-                                nested_data = await self._fetch_sitemap_data(nested_url)
-                                urls.extend(nested_data.get("urls", []))
-                            except Exception as e:
-                                logger.warning(f"Failed to fetch nested sitemap {nested_url}: {e}")
+                        # Fetch and parse nested sitemaps in parallel
+                        nested_tasks = []
+                        # Reduced nested limit from 10 to 5 to prevent fan-out explosion
+                        for nested_url in sitemaps[:5]: 
+                            nested_tasks.append(self._fetch_sitemap_data(nested_url, depth + 1, session))
+                        
+                        if nested_tasks:
+                            nested_results = await asyncio.gather(*nested_tasks, return_exceptions=True)
+                            for res in nested_results:
+                                if isinstance(res, Exception):
+                                    logger.warning(f"Failed to fetch nested sitemap: {res}")
+                                elif isinstance(res, dict):
+                                    urls.extend(res.get("urls", []))
                    
                    else:
                        # Regular sitemap with URLs
+                        # Limit to first 10k URLs per sitemap file to prevent memory issues
+                        url_count = 0
                        for url_element in root:
+                            if url_count >= 10000:
+                                break
+                                
                            if url_element.tag.endswith('url'):
                                url_data = {}
                                
@@ -192,18 +278,42 @@ class SitemapService:
                                
                                if 'loc' in url_data:
                                    urls.append(url_data)
+                                    url_count += 1
                    
                    return {
                        "urls": urls,
                        "sitemaps": sitemaps,
                        "total_urls": len(urls)
                    }
-                    
+            except Exception as e:
+                 # Re-raise to be caught by outer try/except
+                 raise e
+
        except ET.ParseError as e:
+            # Check if content is empty
+            if not content or not content.strip():
+                logger.warning(f"Sitemap is empty: {sitemap_url}")
+                return {"urls": [], "sitemaps": [], "total_urls": 0}
+
+            # Check if content looks like HTML to give a better error message
+            try:
+                if "content" in locals() and ("<html" in content.lower() or "<body" in content.lower() or "<div" in content.lower()):
+                    raise Exception("URL returned a webpage (HTML), not a valid XML sitemap")
+            except Exception:
+                pass
+            
+            logger.warning(f"Failed to parse sitemap XML: {e}")
            raise Exception(f"Failed to parse sitemap XML: {e}")
        except Exception as e:
-            logger.error(f"Error fetching sitemap data: {e}")
+            if "no element found" in str(e) or "not a valid XML sitemap" in str(e):
+                logger.warning(f"⚠️ Sitemap parsing failed for {sitemap_url}: {e}")
+            else:
+                logger.error(f"Error fetching sitemap data for {sitemap_url}: {e}")
            raise
+        finally:
+            # Only close the session if we created it
+            if local_session and session:
+                await session.close()
    
    def _analyze_sitemap_structure(self, sitemap_data: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze the structure of the sitemap"""
@@ -239,14 +349,60 @@ class SitemapService:
        # Calculate statistics
        avg_path_depth = sum(path_levels) / len(path_levels) if path_levels else 0
        
+        # Enhancement: Keyword Clustering & Strategic Pillar Mapping
+        keyword_clusters = self._cluster_keywords_from_urls(urls)
+        strategic_pillars = self._map_strategic_pillars(urls)
+        
        return {
            "total_urls": len(urls),
            "url_patterns": dict(sorted(url_patterns.items(), key=lambda x: x[1], reverse=True)[:10]),
            "file_types": dict(sorted(file_types.items(), key=lambda x: x[1], reverse=True)),
            "average_path_depth": round(avg_path_depth, 2),
            "max_path_depth": max(path_levels) if path_levels else 0,
+            "keyword_clusters": keyword_clusters,
+            "strategic_pillars": strategic_pillars,
            "structure_quality": self._assess_structure_quality(url_patterns, avg_path_depth)
        }
+
+    def _cluster_keywords_from_urls(self, urls: List[Dict[str, Any]]) -> Dict[str, int]:
+        """Extract and cluster keywords from URL slugs to identify content strategy focus.
+
+        Only the first 1000 entries are sampled. Each URL path is split on
+        every non-alphanumeric character; a piece is counted when it is longer
+        than 3 characters, is not a stop word and is not purely numeric.
+
+        Args:
+            urls: Sitemap entries; the URL is read from each entry's ``loc`` key.
+
+        Returns:
+            The 15 most frequent lower-cased keywords mapped to their counts,
+            ordered by descending count.
+        """
+        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'from', 'category', 'tag', 'blog', 'posts', 'archive'}
+        keywords: Dict[str, int] = {}
+
+        for u in urls[:1000]: # Sample 1000 for performance
+            # Guard against a missing or None ``loc`` so the str-based split
+            # below always receives text instead of failing on a non-str path.
+            path = urlparse(u.get('loc') or '').path
+            # Split on every non-alphanumeric character (underscores included)
+            parts = re.split(r'[^a-zA-Z0-9]', path)
+            for part in parts:
+                p = part.lower()
+                if len(p) > 3 and p not in stop_words and not p.isdigit():
+                    keywords[p] = keywords.get(p, 0) + 1
+
+        # Return top 15 clusters
+        return dict(sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:15])
+
+    def _map_strategic_pillars(self, urls: List[Dict[str, Any]]) -> Dict[str, int]:
+        """Categorize URLs into strategic content pillars based on common path patterns.
+
+        Each URL is counted for at most one pillar: the first one, in the
+        order declared below, that has a token occurring as a substring of
+        the lower-cased URL path. URLs matching no pillar are not counted.
+
+        Args:
+            urls: Sitemap entries; the URL is read from each entry's ``loc`` key.
+
+        Returns:
+            Mapping of every pillar name to the number of URLs assigned to it.
+        """
+        pillars = {
+            "Educational": ["blog", "guides", "how-to", "learn", "academy", "resource", "documentation", "docs"],
+            "Transactional": ["product", "features", "pricing", "plans", "solutions", "buy", "checkout", "cart"],
+            "Comparison": ["vs", "alternative", "comparison", "reviews", "best-of"],
+            "Company": ["about", "careers", "press", "contact", "team", "legal", "privacy", "terms"],
+            "Tools": ["calculator", "tool", "generator", "checker", "analyzer"]
+        }
+
+        results = {k: 0 for k in pillars}
+        for u in urls:
+            # Match on the path only. The host is the same for every URL of a
+            # site, so a token inside the domain (e.g. "tool" in toolbox.com)
+            # would otherwise pull every otherwise-unmatched page into that
+            # pillar. ``or ''`` tolerates a missing or None ``loc``.
+            path = urlparse(u.get('loc') or '').path.lower()
+            for pillar, tokens in pillars.items():
+                if any(token in path for token in tokens):
+                    results[pillar] += 1
+                    break
+            # Optional: Add "Other" category if needed
+
+        return results
    
    def _analyze_content_trends(self, urls: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze content publishing trends"""
@@ -334,7 +490,9 @@ class SitemapService:
        competitors: List[str] = None,
        industry_context: str = None,
        analyze_content_trends: bool = True,
-        analyze_publishing_patterns: bool = True
+        analyze_publishing_patterns: bool = True,
+        include_ai_insights: bool = True,
+        user_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Enhanced sitemap analysis specifically for onboarding Step 3 competitive analysis"""
        
@@ -343,7 +501,9 @@ class SitemapService:
            analysis_result = await self.analyze_sitemap(
                sitemap_url=sitemap_url,
                analyze_content_trends=analyze_content_trends,
-                analyze_publishing_patterns=analyze_publishing_patterns
+                analyze_publishing_patterns=analyze_publishing_patterns,
+                include_ai_insights=include_ai_insights,
+                user_id=user_id
            )
            
            # Enhance with onboarding-specific insights
@@ -351,7 +511,8 @@ class SitemapService:
                analysis_result,
                user_url,
                competitors,
-                industry_context
+                industry_context,
+                user_id=user_id
            )
            
            # Combine results
@@ -374,7 +535,8 @@ class SitemapService:
        analysis_result: Dict[str, Any],
        user_url: str,
        competitors: List[str] = None,
-        industry_context: str = None
+        industry_context: str = None,
+        user_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Generate onboarding-specific insights for competitive analysis"""
        
@@ -389,10 +551,37 @@ class SitemapService:
                user_url, competitors, industry_context
            )
            
+            # Define JSON schema for structured output
+            json_struct = {
+                "type": "object",
+                "properties": {
+                    "competitive_positioning": {"type": "string"},
+                    "content_gaps": {
+                        "type": "array",
+                        "items": {"type": "string"}
+                    },
+                    "growth_opportunities": {
+                        "type": "array",
+                        "items": {"type": "string"}
+                    },
+                    "industry_benchmarks": {
+                        "type": "array",
+                        "items": {"type": "string"}
+                    },
+                    "strategic_recommendations": {
+                        "type": "array",
+                        "items": {"type": "string"}
+                    }
+                },
+                "required": ["competitive_positioning", "content_gaps", "growth_opportunities", "industry_benchmarks", "strategic_recommendations"]
+            }
+
            # Generate AI insights
            ai_response = llm_text_gen(
                prompt=prompt,
-                system_prompt=self._get_onboarding_system_prompt()
+                system_prompt=self._get_onboarding_system_prompt(),
+                json_struct=json_struct,
+                user_id=user_id
            )
            
            # Parse and structure insights
@@ -402,7 +591,7 @@ class SitemapService:
            await seo_logger.log_ai_analysis(
                tool_name=f"{self.service_name}_onboarding",
                prompt=prompt,
-                response=ai_response,
+                response=ai_response if isinstance(ai_response, str) else str(ai_response),
                model_used="gemini-2.0-flash-001"
            )
            
@@ -422,7 +611,8 @@ class SitemapService:
        structure_analysis: Dict[str, Any],
        content_trends: Dict[str, Any],
        publishing_patterns: Dict[str, Any],
-        sitemap_url: str
+        sitemap_url: str,
+        user_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Generate AI-powered insights for sitemap analysis"""
        
@@ -435,7 +625,8 @@ class SitemapService:
            # Generate AI insights
            ai_response = llm_text_gen(
                prompt=prompt,
-                system_prompt=self._get_system_prompt()
+                system_prompt=self._get_system_prompt(),
+                user_id=user_id
            )
            
            # Parse and structure insights
@@ -697,7 +888,12 @@ Focus on actionable insights for content creators and digital marketing professi
        try:
            # Test with a simple sitemap
            test_url = "https://www.google.com/sitemap.xml"
-            result = await self.analyze_sitemap(test_url, False, False)
+            result = await self.analyze_sitemap(
+                sitemap_url=test_url,
+                analyze_content_trends=False,
+                analyze_publishing_patterns=False,
+                include_ai_insights=False
+            )
            
            return {
                "status": "operational",
@@ -731,7 +927,7 @@ Focus on actionable insights for content creators and digital marketing professi
        
        competitor_info = ""
        if competitors:
-            competitor_info = f"\nCompetitors to consider: {', '.join(competitors[:5])}"
+            competitor_info = f"\nCompetitors to consider: {', '.join(competitors)}"
        
        industry_info = ""
        if industry_context:
@@ -753,12 +949,12 @@ Content Publishing Patterns:
 - Publishing Rate: {publishing_velocity:.2f} pages per day
 - Content Categories: {len(url_patterns)} main categories identified

-Please provide competitive analysis insights focusing on:
+Please provide competitive analysis insights focusing on the following sections:

-1. **COMPETITIVE POSITIONING**: How does this site's content structure compare to industry standards?
-2. **CONTENT GAPS**: What content categories or topics are missing based on the URL structure?
-3. **GROWTH OPPORTUNITIES**: Specific content expansion opportunities to compete better
-4. **INDUSTRY BENCHMARKS**: How does publishing frequency and content depth compare to competitors?
+1. **COMPETITIVE POSITIONING**: How does this site's content structure compare to industry standards? (Provide a brief paragraph)
+2. **CONTENT GAPS**: What content categories or topics are missing based on the URL structure? (List 3-5 specific gaps)
+3. **GROWTH OPPORTUNITIES**: Specific content expansion opportunities to compete better (List 3-5 opportunities)
+4. **INDUSTRY BENCHMARKS**: How does publishing frequency and content depth compare to competitors? (List 3 key comparisons)
 5. **STRATEGIC RECOMMENDATIONS**: 3-5 actionable steps for content strategy improvement

 Focus on actionable insights that help content creators understand their competitive position and identify growth opportunities.
@@ -783,69 +979,61 @@ Provide practical, data-driven insights that help content creators make informed

 Format your response as structured insights that can be easily parsed and displayed in a user interface."""

-    def _parse_onboarding_insights(self, ai_response: str) -> Dict[str, Any]:
+    def _parse_onboarding_insights(self, ai_response: Any) -> Dict[str, Any]:
        """Parse AI response for onboarding-specific insights"""
        
        try:
-            # Initialize structured response
-            insights = {
-                "competitive_positioning": "Analysis in progress...",
-                "content_gaps": [],
-                "growth_opportunities": [],
-                "industry_benchmarks": [],
-                "strategic_recommendations": []
+            insights = {}
+            
+            # If it's already a dict (structured output), use it
+            if isinstance(ai_response, dict):
+                insights = ai_response
+            elif isinstance(ai_response, str):
+                # Try to parse JSON string
+                try:
+                    insights = json.loads(ai_response)
+                except json.JSONDecodeError:
+                    # Try to extract JSON from markdown block
+                    json_match = re.search(r'```json\s*(.*?)\s*```', ai_response, re.DOTALL)
+                    if json_match:
+                        try:
+                            insights = json.loads(json_match.group(1))
+                        except json.JSONDecodeError:
+                            pass
+            
+            # Ensure all required keys exist
+            required_keys = [
+                "competitive_positioning", 
+                "content_gaps", 
+                "growth_opportunities", 
+                "industry_benchmarks", 
+                "strategic_recommendations"
+            ]
+            
+            # Validate and fill missing keys
+            validated_insights = {
+                "competitive_positioning": insights.get("competitive_positioning", "Analysis in progress..."),
+                "content_gaps": insights.get("content_gaps", []),
+                "growth_opportunities": insights.get("growth_opportunities", []),
+                "industry_benchmarks": insights.get("industry_benchmarks", []),
+                "strategic_recommendations": insights.get("strategic_recommendations", [])
            }
            
-            # Simple parsing logic - look for structured sections
-            lines = ai_response.split('\n')
-            current_section = None
-            
-            for line in lines:
-                line = line.strip()
-                if not line:
-                    continue
-                
-                # Detect sections
-                if any(keyword in line.lower() for keyword in ['competitive positioning', 'market position']):
-                    current_section = 'competitive_positioning'
-                    insights[current_section] = line
-                elif any(keyword in line.lower() for keyword in ['content gaps', 'missing content']):
-                    current_section = 'content_gaps'
-                elif any(keyword in line.lower() for keyword in ['growth opportunities', 'expansion']):
-                    current_section = 'growth_opportunities'
-                elif any(keyword in line.lower() for keyword in ['industry benchmarks', 'benchmarks']):
-                    current_section = 'industry_benchmarks'
-                elif any(keyword in line.lower() for keyword in ['strategic recommendations', 'recommendations']):
-                    current_section = 'strategic_recommendations'
-                elif line.startswith('-') or line.startswith('•'):
-                    # This is a list item
-                    if current_section and current_section in insights:
-                        if isinstance(insights[current_section], str):
-                            insights[current_section] = [insights[current_section]]
-                        insights[current_section].append(line[1:].strip())
-                elif current_section == 'competitive_positioning':
-                    # Append to competitive positioning text
-                    if insights[current_section] == "Analysis in progress...":
-                        insights[current_section] = line
+            # Ensure lists are actually lists
+            for key in required_keys[1:]:
+                if not isinstance(validated_insights[key], list):
+                    if isinstance(validated_insights[key], str):
+                        validated_insights[key] = [validated_insights[key]]
                    else:
-                        insights[current_section] += " " + line
-            
-            # Fallback: if no structured parsing worked, use the full response
-            if insights["competitive_positioning"] == "Analysis in progress...":
-                insights["competitive_positioning"] = ai_response[:500] + "..." if len(ai_response) > 500 else ai_response
-            
-            # Ensure lists are properly formatted
-            for key in ['content_gaps', 'growth_opportunities', 'industry_benchmarks', 'strategic_recommendations']:
-                if isinstance(insights[key], str):
-                    insights[key] = [insights[key]] if insights[key] else []
-            
-            return insights
+                        validated_insights[key] = []
+                        
+            return validated_insights
            
        except Exception as e:
            logger.error(f"Error parsing onboarding insights: {e}")
            return {
-                "competitive_positioning": ai_response[:300] + "..." if len(ai_response) > 300 else ai_response,
-                "content_gaps": ["Analysis parsing error - see full response above"],
+                "competitive_positioning": "Analysis unavailable",
+                "content_gaps": [],
                "growth_opportunities": [],
                "industry_benchmarks": [],
                "strategic_recommendations": []
@@ -889,6 +1077,48 @@ Format your response as structured insights that can be easily parsed and displa
            logger.error(f"Error discovering sitemap for {website_url}: {e}")
            return None

+    async def _find_sitemap_on_homepage(self, base_url: str) -> Optional[str]:
+        """
+        Check homepage HTML for sitemap links and return the first one that verifies.
+
+        Args:
+            base_url: Base URL of the website; also used to resolve relative hrefs
+
+        Returns:
+            First candidate accepted by _check_sitemap_url, None otherwise
+            (non-200 homepage, no matching link, or any exception along the way)
+        """
+        try:
+            logger.debug(f"Checking homepage for sitemap links: {base_url}")
+            async with aiohttp.ClientSession() as session:
+                async with session.get(base_url, timeout=aiohttp.ClientTimeout(total=15), headers={"User-Agent": "ALwrity-SEO-Bot/1.0"}) as response:
+                    if response.status != 200:
+                        return None
+                    content = await response.text()
+
+            # The homepage session is closed above; candidates are probed afterwards.
+            # Matches: href="...sitemap.xml..." or href='...sitemap.xml...'
+            sitemap_matches = re.findall(r'href=["\']([^"\']*[sS]itemap[^"\']*\.xml[^"\']*)["\']', content)
+
+            seen = set()  # a page may link the same sitemap more than once
+            for match in sitemap_matches:
+                potential_url = match.strip()
+                # Handle relative URLs
+                if not potential_url.startswith(('http://', 'https://')):
+                    potential_url = urljoin(base_url, potential_url)
+                if potential_url in seen:
+                    continue  # already probed; skip the duplicate network check
+                seen.add(potential_url)
+                logger.debug(f"Found potential sitemap link on homepage: {potential_url}")
+                # Verify accessibility
+                if await self._check_sitemap_url(potential_url, "homepage link"):
+                    return potential_url
+
+            return None
+        except Exception as e:
+            logger.debug(f"Error checking homepage for sitemap: {e}")
+            return None
+
    async def _find_sitemap_in_robots_txt(self, base_url: str) -> Optional[str]:
        """
        Check robots.txt for sitemap directives.
@@ -1027,4 +1257,4 @@ Format your response as structured insights that can be easily parsed and displa
                    return response.status == 200
                    
        except Exception:
-            return False
+            return False
--- a/backend/services/seo_tools/technical_seo_service.py
+++ b/backend/services/seo_tools/technical_seo_service.py
@@ -5,8 +5,12 @@ Comprehensive technical SEO crawler and analyzer with AI-enhanced
 insights for website optimization and search engine compatibility.
 """

+import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse, urljoin
+import time
 from typing import Dict, Any, List, Optional
-from datetime import datetime
 from loguru import logger

 class TechnicalSEOService:
@@ -16,6 +20,9 @@ class TechnicalSEOService:
        """Initialize the technical SEO service"""
        self.service_name = "technical_seo_analyzer"
        logger.info(f"Initialized {self.service_name}")
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +http://alwrity.com/bot)'
+        }
    
    async def analyze_technical_seo(
        self,
@@ -25,20 +32,115 @@ class TechnicalSEOService:
        analyze_performance: bool = True
    ) -> Dict[str, Any]:
        """Analyze technical SEO factors"""
-        # Placeholder implementation
-        return {
-            "url": url,
-            "pages_crawled": 25,
-            "crawl_depth": crawl_depth,
-            "technical_issues": [
-                {"type": "Missing robots.txt", "severity": "Medium", "pages_affected": 1},
-                {"type": "Slow loading pages", "severity": "High", "pages_affected": 3}
-            ],
-            "site_structure": {"internal_links": 150, "external_links": 25 if include_external_links else 0},
-            "performance_metrics": {"avg_load_time": 2.5, "largest_contentful_paint": 1.8} if analyze_performance else {},
-            "recommendations": ["Implement robots.txt", "Optimize page load speed"],
-            "crawl_summary": {"successful": 23, "errors": 2, "redirects": 5}
-        }
+        try:
+            start_time = time.time()
+            async with aiohttp.ClientSession(headers=self.headers) as session:
+                async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+                    load_time = time.time() - start_time  # measured before the body is read
+                    status_code = response.status
+                    content = await response.text()
+
+                    # Basic parsing
+                    soup = BeautifulSoup(content, 'html.parser')
+
+                    # 1. Meta Tags Analysis
+                    title = soup.title.string if soup.title else None
+                    meta_desc = soup.find('meta', attrs={'name': 'description'})
+                    # .get(): a description tag without a content attribute must not abort the crawl
+                    meta_desc_content = meta_desc.get('content') if meta_desc else None
+
+                    # 2. Heading Structure
+                    h1_tags = soup.find_all('h1')
+                    h2_tags = soup.find_all('h2')
+                    h3_tags = soup.find_all('h3')
+
+                    # 3. Image Analysis
+                    images = soup.find_all('img')
+                    # .get(): an <img> without a src attribute (e.g. lazy-loaded) must not raise KeyError
+                    images_without_alt = [img.get('src', '') for img in images if not img.get('alt')]
+
+                    # 4. Link Analysis
+                    internal_links = []
+                    external_links = []
+                    domain = urlparse(url).netloc.lower()
+
+                    for link in soup.find_all('a'):
+                        href = link.get('href')
+                        if not href:
+                            continue
+                        if href.startswith(('http', '//')):
+                            # Match on host (subdomains included); a substring test also hits the domain inside paths/queries
+                            absolute = urljoin(url, href)
+                            host = urlparse(absolute).netloc.lower()
+                            if host == domain or host.endswith('.' + domain):
+                                internal_links.append(absolute)
+                            else:
+                                external_links.append(absolute)
+                        elif href.startswith('/'):
+                            internal_links.append(urljoin(url, href))
+
+                    # 5. Technical Issues Detection
+                    issues = []
+                    # Status Code Issues
+                    if status_code != 200:
+                        issues.append({"type": f"Status Code {status_code}", "severity": "High", "pages_affected": 1})
+
+                    # Performance Issues
+                    if load_time > 2.0:
+                        issues.append({"type": "Slow Server Response", "severity": "Medium", "pages_affected": 1})
+
+                    # Meta Issues
+                    if not title:
+                        issues.append({"type": "Missing Title Tag", "severity": "High", "pages_affected": 1})
+                    elif len(title) > 60:
+                        issues.append({"type": "Title Tag Too Long", "severity": "Low", "pages_affected": 1})
+                    if not meta_desc_content:
+                        issues.append({"type": "Missing Meta Description", "severity": "High", "pages_affected": 1})
+
+                    # Content Structure Issues
+                    if not h1_tags:
+                        issues.append({"type": "Missing H1 Tag", "severity": "High", "pages_affected": 1})
+                    elif len(h1_tags) > 1:
+                        issues.append({"type": "Multiple H1 Tags", "severity": "Medium", "pages_affected": 1})
+
+                    # Image Issues
+                    if images_without_alt:
+                        issues.append({"type": "Images Missing Alt Text", "severity": "Medium", "pages_affected": len(images_without_alt)})
+
+                    # Security Issues
+                    if url.startswith('http:'):
+                        issues.append({"type": "Insecure Protocol (HTTP)", "severity": "High", "pages_affected": 1})
+                    return {
+                        "url": url,
+                        "pages_crawled": 1, # Currently single page
+                        "crawl_depth": 1,
+                        "technical_issues": issues,
+                        "site_structure": {
+                            "internal_links": len(internal_links),
+                            "external_links": len(external_links) if include_external_links else 0,
+                            "h1_count": len(h1_tags),
+                            "h2_count": len(h2_tags),
+                            "h3_count": len(h3_tags)
+                        },
+                        "performance_metrics": {
+                            "response_time": round(load_time, 3),
+                            "content_size": len(content)
+                        } if analyze_performance else {},
+                        "recommendations": [issue['type'] for issue in issues],
+                        "crawl_summary": {
+                            "successful": 1 if status_code == 200 else 0,
+                            "errors": 1 if status_code >= 400 else 0,
+                            "redirects": 1 if 300 <= status_code < 400 else 0
+                        }
+                    }
+
+        except Exception as e:
+            logger.error(f"Error in technical SEO analysis: {e}")
+            return {
+                "url": url,
+                "error": str(e),
+                "technical_issues": [{"type": "Crawl Failed", "severity": "High", "pages_affected": 1}]
+            }
    
    async def health_check(self) -> Dict[str, Any]:
        """Health check for the technical SEO service"""