ALwrity/backend/services/seo_tools/pagespeed_service.py

"""
Google PageSpeed Insights Service

AI-enhanced PageSpeed analysis service that provides comprehensive
performance insights with actionable recommendations for optimization.
"""

import aiohttp
import asyncio
from typing import Dict, Any, List, Optional
from datetime import datetime
from loguru import logger
import os

from ..llm_providers.main_text_generation import llm_text_gen
from middleware.logging_middleware import seo_logger


class PageSpeedService:
    """Service for Google PageSpeed Insights analysis with AI enhancement"""

    def __init__(self):
        """Initialize the PageSpeed service"""
        self.service_name = "pagespeed_analyzer"
        self.api_key = os.getenv("GOOGLE_PAGESPEED_API_KEY")
        self.base_url = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
        logger.info(f"Initialized {self.service_name}")

    async def analyze_pagespeed(
        self,
        url: str,
        strategy: str = "DESKTOP",
        locale: str = "en",
        categories: List[str] = None
    ) -> Dict[str, Any]:
        """
        Analyze website performance using Google PageSpeed Insights

        Args:
            url: URL to analyze
            strategy: Analysis strategy (DESKTOP/MOBILE)
            locale: Locale for analysis
            categories: Categories to analyze

        Returns:
            Dictionary containing performance analysis and AI insights
        """
        try:
            start_time = datetime.utcnow()

            if categories is None:
                categories = ["performance", "accessibility", "best-practices", "seo"]

            # Validate inputs
            if not url:
                raise ValueError("URL is required")

            if strategy not in ["DESKTOP", "MOBILE"]:
                raise ValueError("Strategy must be DESKTOP or MOBILE")

            logger.info(f"Analyzing PageSpeed for URL: {url} (Strategy: {strategy})")

            # Fetch PageSpeed data
            pagespeed_data = await self._fetch_pagespeed_data(url, strategy, locale, categories)

            if not pagespeed_data:
                raise Exception("Failed to fetch PageSpeed data")

            # Extract and structure the data
            structured_results = self._structure_pagespeed_results(pagespeed_data)

            # Generate AI-enhanced insights
            ai_insights = await self._generate_ai_insights(structured_results, url, strategy)

            # Calculate optimization priority
            optimization_plan = self._create_optimization_plan(structured_results)

            execution_time = (datetime.utcnow() - start_time).total_seconds()

            result = {
                "url": url,
                "strategy": strategy,
                "analysis_date": datetime.utcnow().isoformat(),
                "core_web_vitals": structured_results.get("core_web_vitals", {}),
                "category_scores": structured_results.get("category_scores", {}),
                "metrics": structured_results.get("metrics", {}),
                "opportunities": structured_results.get("opportunities", []),
                "diagnostics": structured_results.get("diagnostics", []),
                "ai_insights": ai_insights,
                "optimization_plan": optimization_plan,
                "raw_data": {
                    "lighthouse_version": pagespeed_data.get("lighthouseResult", {}).get("lighthouseVersion"),
                    "fetch_time": pagespeed_data.get("analysisUTCTimestamp"),
                    "categories_analyzed": categories
                },
                "execution_time": execution_time
            }

            # Log the operation
            await seo_logger.log_tool_usage(
                tool_name=self.service_name,
                input_data={
                    "url": url,
                    "strategy": strategy,
                    "locale": locale,
                    "categories": categories
                },
                output_data=result,
                success=True
            )

            await seo_logger.log_external_api_call(
                api_name="Google PageSpeed Insights",
                endpoint=self.base_url,
                response_code=200,
                response_time=execution_time,
                request_data={"url": url, "strategy": strategy}
            )

            logger.info(f"PageSpeed analysis completed for {url}")
            return result

        except Exception as e:
            logger.error(f"Error analyzing PageSpeed for {url}: {e}")

            # Log the error
            await seo_logger.log_tool_usage(
                tool_name=self.service_name,
                input_data={
                    "url": url,
                    "strategy": strategy,
                    "locale": locale,
                    "categories": categories
                },
                output_data={"error": str(e)},
                success=False
            )

            raise

    async def _fetch_pagespeed_data(
        self,
        url: str,
        strategy: str,
        locale: str,
        categories: List[str]
    ) -> Dict[str, Any]:
        """Fetch data from Google PageSpeed Insights API"""

        # Build API URL
        api_url = f"{self.base_url}?url={url}&strategy={strategy}&locale={locale}"

        # Add categories
        for category in categories:
            api_url += f"&category={category}"

        # Add API key if available
        if self.api_key:
            api_url += f"&key={self.api_key}"

        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(api_url, timeout=aiohttp.ClientTimeout(total=60)) as response:
                    if response.status == 200:
                        data = await response.json()
                        return data
                    else:
                        error_text = await response.text()
                        logger.error(f"PageSpeed API error {response.status}: {error_text}")

                        if response.status == 429:
                            raise Exception("PageSpeed API rate limit exceeded")
                        elif response.status == 400:
                            raise Exception(f"Invalid URL or parameters: {error_text}")
                        else:
                            raise Exception(f"PageSpeed API error: {response.status}")

        except asyncio.TimeoutError:
            raise Exception("PageSpeed API request timed out")
        except Exception as e:
            logger.error(f"Error fetching PageSpeed data: {e}")
            raise

    def _structure_pagespeed_results(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Structure PageSpeed results into organized format"""

        lighthouse_result = data.get("lighthouseResult", {})
        categories = lighthouse_result.get("categories", {})
        audits = lighthouse_result.get("audits", {})

        # Extract category scores
        category_scores = {}
        for category_name, category_data in categories.items():
            category_scores[category_name] = {
                "score": round(category_data.get("score", 0) * 100),
                "title": category_data.get("title", ""),
                "description": category_data.get("description", "")
            }

        # Extract Core Web Vitals
        core_web_vitals = {}
        cwv_metrics = ["largest-contentful-paint", "first-input-delay", "cumulative-layout-shift"]

        for metric in cwv_metrics:
            if metric in audits:
                audit_data = audits[metric]
                core_web_vitals[metric] = {
                    "score": audit_data.get("score"),
                    "displayValue": audit_data.get("displayValue"),
                    "numericValue": audit_data.get("numericValue"),
                    "title": audit_data.get("title"),
                    "description": audit_data.get("description")
                }

        # Extract key metrics
        key_metrics = {}
        important_metrics = [
            "first-contentful-paint",
            "speed-index",
            "largest-contentful-paint",
            "interactive",
            "total-blocking-time",
            "cumulative-layout-shift"
        ]

        for metric in important_metrics:
            if metric in audits:
                audit_data = audits[metric]
                key_metrics[metric] = {
                    "score": audit_data.get("score"),
                    "displayValue": audit_data.get("displayValue"),
                    "numericValue": audit_data.get("numericValue"),
                    "title": audit_data.get("title")
                }

        # Extract opportunities (performance improvements)
        opportunities = []
        for audit_id, audit_data in audits.items():
            if (audit_data.get("scoreDisplayMode") == "numeric" and
                audit_data.get("score") is not None and
                audit_data.get("score") < 1 and
                audit_data.get("details", {}).get("overallSavingsMs", 0) > 0):

                opportunities.append({
                    "id": audit_id,
                    "title": audit_data.get("title", ""),
                    "description": audit_data.get("description", ""),
                    "score": audit_data.get("score", 0),
                    "savings_ms": audit_data.get("details", {}).get("overallSavingsMs", 0),
                    "savings_bytes": audit_data.get("details", {}).get("overallSavingsBytes", 0),
                    "displayValue": audit_data.get("displayValue", "")
                })

        # Sort opportunities by potential savings
        opportunities.sort(key=lambda x: x["savings_ms"], reverse=True)

        # Extract diagnostics
        diagnostics = []
        for audit_id, audit_data in audits.items():
            if (audit_data.get("scoreDisplayMode") == "informative" or
                (audit_data.get("score") is not None and audit_data.get("score") < 1)):

                if audit_id not in [op["id"] for op in opportunities]:
                    diagnostics.append({
                        "id": audit_id,
                        "title": audit_data.get("title", ""),
                        "description": audit_data.get("description", ""),
                        "score": audit_data.get("score"),
                        "displayValue": audit_data.get("displayValue", "")
                    })

        return {
            "category_scores": category_scores,
            "core_web_vitals": core_web_vitals,
            "metrics": key_metrics,
            "opportunities": opportunities[:10],  # Top 10 opportunities
            "diagnostics": diagnostics[:10]  # Top 10 diagnostics
        }

    async def _generate_ai_insights(
        self,
        structured_results: Dict[str, Any],
        url: str,
        strategy: str
    ) -> Dict[str, Any]:
        """Generate AI-powered insights and recommendations"""

        try:
            # Prepare data for AI analysis
            performance_score = structured_results.get("category_scores", {}).get("performance", {}).get("score", 0)
            opportunities = structured_results.get("opportunities", [])
            core_web_vitals = structured_results.get("core_web_vitals", {})

            # Build AI prompt
            prompt = self._build_ai_analysis_prompt(
                url, strategy, performance_score, opportunities, core_web_vitals
            )

            # Generate AI insights
            ai_response = llm_text_gen(
                prompt=prompt,
                system_prompt=self._get_system_prompt()
            )

            # Parse AI response
            insights = self._parse_ai_insights(ai_response)

            # Log AI analysis
            await seo_logger.log_ai_analysis(
                tool_name=self.service_name,
                prompt=prompt,
                response=ai_response,
                model_used="gemini-2.0-flash-001"
            )

            return insights

        except Exception as e:
            logger.error(f"Error generating AI insights: {e}")
            return {
                "summary": "AI analysis unavailable",
                "priority_actions": [],
                "technical_recommendations": [],
                "business_impact": "Analysis could not be completed"
            }

    def _build_ai_analysis_prompt(
        self,
        url: str,
        strategy: str,
        performance_score: int,
        opportunities: List[Dict],
        core_web_vitals: Dict
    ) -> str:
        """Build AI prompt for performance analysis"""

        opportunities_text = "\n".join([
            f"- {opp['title']}: {opp['displayValue']} (Potential savings: {opp['savings_ms']}ms)"
            for opp in opportunities[:5]
        ])

        cwv_text = "\n".join([
            f"- {metric.replace('-', ' ').title()}: {data.get('displayValue', 'N/A')}"
            for metric, data in core_web_vitals.items()
        ])

        prompt = f"""
Analyze this website performance data and provide actionable insights for digital marketers and content creators:

Website: {url}
Device: {strategy}
Performance Score: {performance_score}/100

Core Web Vitals:
{cwv_text}

Top Performance Opportunities:
{opportunities_text}

Please provide:
1. Executive Summary (2-3 sentences for non-technical users)
2. Top 3 Priority Actions (specific, actionable steps)
3. Technical Recommendations (for developers)
4. Business Impact Assessment (how performance affects conversions, SEO, user experience)
5. Quick Wins (easy improvements that can be implemented immediately)

Focus on practical advice that content creators and digital marketers can understand and act upon.
"""

        return prompt

    def _get_system_prompt(self) -> str:
        """Get system prompt for AI analysis"""
        return """You are a web performance expert specializing in translating technical PageSpeed data into actionable business insights.
        Your audience includes content creators, digital marketers, and solopreneurs who need to understand how website performance impacts their business goals.

        Provide clear, actionable recommendations that balance technical accuracy with business practicality.
        Always explain the "why" behind recommendations and their potential impact on user experience, SEO, and conversions.
        """

    def _parse_ai_insights(self, ai_response: str) -> Dict[str, Any]:
        """Parse AI response into structured insights"""

        # Initialize default structure
        insights = {
            "summary": "",
            "priority_actions": [],
            "technical_recommendations": [],
            "business_impact": "",
            "quick_wins": []
        }

        try:
            # Split response into sections
            sections = ai_response.split('\n\n')

            current_section = None
            for section in sections:
                section = section.strip()
                if not section:
                    continue

                # Identify section type
                if 'executive summary' in section.lower() or 'summary' in section.lower():
                    insights["summary"] = self._extract_content(section)
                elif 'priority actions' in section.lower() or 'top 3' in section.lower():
                    insights["priority_actions"] = self._extract_list_items(section)
                elif 'technical recommendations' in section.lower():
                    insights["technical_recommendations"] = self._extract_list_items(section)
                elif 'business impact' in section.lower():
                    insights["business_impact"] = self._extract_content(section)
                elif 'quick wins' in section.lower():
                    insights["quick_wins"] = self._extract_list_items(section)

            # Fallback parsing if sections not clearly identified
            if not any(insights.values()):
                insights["summary"] = ai_response[:300] + "..." if len(ai_response) > 300 else ai_response

        except Exception as e:
            logger.error(f"Error parsing AI insights: {e}")
            insights["summary"] = "AI analysis completed but parsing failed"

        return insights

    def _extract_content(self, section: str) -> str:
        """Extract content from a section, removing headers"""
        lines = section.split('\n')
        content_lines = []

        for line in lines:
            line = line.strip()
            if line and not line.endswith(':') and not line.startswith('#'):
                content_lines.append(line)

        return ' '.join(content_lines)

    def _extract_list_items(self, section: str) -> List[str]:
        """Extract list items from a section"""
        items = []
        lines = section.split('\n')

        for line in lines:
            line = line.strip()
            if line and (line.startswith('-') or line.startswith('*') or
                        line[0].isdigit() and '.' in line[:3]):
                # Remove bullet points and numbering
                clean_line = line.lstrip('-*0123456789. ').strip()
                if clean_line:
                    items.append(clean_line)

        return items[:5]  # Limit to 5 items per section

    def _create_optimization_plan(self, structured_results: Dict[str, Any]) -> Dict[str, Any]:
        """Create a prioritized optimization plan"""

        opportunities = structured_results.get("opportunities", [])
        category_scores = structured_results.get("category_scores", {})

        # Calculate priority score for each opportunity
        prioritized_opportunities = []
        for opp in opportunities:
            priority_score = self._calculate_priority_score(opp)
            prioritized_opportunities.append({
                **opp,
                "priority_score": priority_score,
                "difficulty": self._estimate_difficulty(opp["id"]),
                "impact": self._estimate_impact(opp["savings_ms"])
            })

        # Sort by priority score
        prioritized_opportunities.sort(key=lambda x: x["priority_score"], reverse=True)

        # Create implementation phases
        phases = {
            "immediate": [],  # High impact, low difficulty
            "short_term": [],  # Medium impact or difficulty
            "long_term": []   # High difficulty but important
        }

        for opp in prioritized_opportunities:
            if opp["difficulty"] == "Low" and opp["impact"] in ["High", "Medium"]:
                phases["immediate"].append(opp)
            elif opp["difficulty"] in ["Low", "Medium"]:
                phases["short_term"].append(opp)
            else:
                phases["long_term"].append(opp)

        return {
            "overall_assessment": self._generate_overall_assessment(category_scores),
            "prioritized_opportunities": prioritized_opportunities[:10],
            "implementation_phases": phases,
            "estimated_improvement": self._estimate_total_improvement(prioritized_opportunities[:5])
        }

    def _calculate_priority_score(self, opportunity: Dict[str, Any]) -> int:
        """Calculate priority score for an opportunity"""
        savings_ms = opportunity.get("savings_ms", 0)
        savings_bytes = opportunity.get("savings_bytes", 0)

        # Base score from time savings
        score = min(savings_ms / 100, 50)  # Cap at 50 points

        # Add points for byte savings
        score += min(savings_bytes / 10000, 25)  # Cap at 25 points

        # Bonus points for specific high-impact optimizations
        high_impact_audits = [
            "unused-javascript",
            "render-blocking-resources",
            "largest-contentful-paint-element",
            "cumulative-layout-shift"
        ]

        if opportunity.get("id") in high_impact_audits:
            score += 25

        return min(int(score), 100)

    def _estimate_difficulty(self, audit_id: str) -> str:
        """Estimate implementation difficulty"""

        easy_fixes = [
            "unused-css-rules",
            "unused-javascript",
            "render-blocking-resources",
            "image-size-responsive"
        ]

        medium_fixes = [
            "largest-contentful-paint-element",
            "cumulative-layout-shift",
            "total-blocking-time"
        ]

        if audit_id in easy_fixes:
            return "Low"
        elif audit_id in medium_fixes:
            return "Medium"
        else:
            return "High"

    def _estimate_impact(self, savings_ms: int) -> str:
        """Estimate performance impact"""
        if savings_ms >= 1000:
            return "High"
        elif savings_ms >= 500:
            return "Medium"
        else:
            return "Low"

    def _generate_overall_assessment(self, category_scores: Dict[str, Any]) -> str:
        """Generate overall performance assessment"""

        performance_score = category_scores.get("performance", {}).get("score", 0)

        if performance_score >= 90:
            return "Excellent performance with minor optimization opportunities"
        elif performance_score >= 70:
            return "Good performance with some areas for improvement"
        elif performance_score >= 50:
            return "Average performance requiring attention to key areas"
        else:
            return "Poor performance requiring immediate optimization efforts"

    def _estimate_total_improvement(self, top_opportunities: List[Dict]) -> Dict[str, Any]:
        """Estimate total improvement from top opportunities"""

        total_savings_ms = sum(opp.get("savings_ms", 0) for opp in top_opportunities)
        total_savings_mb = sum(opp.get("savings_bytes", 0) for opp in top_opportunities) / (1024 * 1024)

        # Estimate score improvement (rough calculation)
        estimated_score_gain = min(total_savings_ms / 200, 30)  # Conservative estimate

        return {
            "potential_time_savings": f"{total_savings_ms/1000:.1f} seconds",
            "potential_size_savings": f"{total_savings_mb:.1f} MB",
            "estimated_score_improvement": f"+{estimated_score_gain:.0f} points",
            "confidence": "Medium" if total_savings_ms > 1000 else "Low"
        }

    async def health_check(self) -> Dict[str, Any]:
        """Health check for the PageSpeed service"""
        try:
            # Test with a simple URL
            test_url = "https://example.com"
            result = await self.analyze_pagespeed(test_url, "DESKTOP", "en", ["performance"])

            return {
                "status": "operational",
                "service": self.service_name,
                "api_key_configured": bool(self.api_key),
                "test_passed": bool(result.get("category_scores")),
                "last_check": datetime.utcnow().isoformat()
            }
        except Exception as e:
            return {
                "status": "error",
                "service": self.service_name,
                "error": str(e),
                "last_check": datetime.utcnow().isoformat()
            }