Add AI SEO tools with FastAPI endpoints and comprehensive services

Co-authored-by: ajay.calsoft <ajay.calsoft@gmail.com>
2025-08-24 11:47:42 +00:00
parent 5d8d1cfb73
commit 512f82b7b0
17 changed files with 3637 additions and 0 deletions
--- a/backend/services/seo_tools/pagespeed_service.py
+++ b/backend/services/seo_tools/pagespeed_service.py
@@ -0,0 +1,601 @@
+"""
+Google PageSpeed Insights Service
+
+AI-enhanced PageSpeed analysis service that provides comprehensive
+performance insights with actionable recommendations for optimization.
+"""
+
+import aiohttp
+import asyncio
+from typing import Dict, Any, List, Optional
+from datetime import datetime
+from loguru import logger
+import os
+
+from ..llm_providers.main_text_generation import llm_text_gen
+from ...middleware.logging_middleware import seo_logger
+
+
+class PageSpeedService:
+    """Service for Google PageSpeed Insights analysis with AI enhancement"""
+    
+    def __init__(self):
+        """Initialize the PageSpeed service"""
+        self.service_name = "pagespeed_analyzer"
+        self.api_key = os.getenv("GOOGLE_PAGESPEED_API_KEY")
+        self.base_url = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
+        logger.info(f"Initialized {self.service_name}")
+    
+    async def analyze_pagespeed(
+        self,
+        url: str,
+        strategy: str = "DESKTOP",
+        locale: str = "en",
+        categories: List[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Analyze website performance using Google PageSpeed Insights
+        
+        Args:
+            url: URL to analyze
+            strategy: Analysis strategy (DESKTOP/MOBILE)
+            locale: Locale for analysis
+            categories: Categories to analyze
+            
+        Returns:
+            Dictionary containing performance analysis and AI insights
+        """
+        try:
+            start_time = datetime.utcnow()
+            
+            if categories is None:
+                categories = ["performance", "accessibility", "best-practices", "seo"]
+            
+            # Validate inputs
+            if not url:
+                raise ValueError("URL is required")
+            
+            if strategy not in ["DESKTOP", "MOBILE"]:
+                raise ValueError("Strategy must be DESKTOP or MOBILE")
+            
+            logger.info(f"Analyzing PageSpeed for URL: {url} (Strategy: {strategy})")
+            
+            # Fetch PageSpeed data
+            pagespeed_data = await self._fetch_pagespeed_data(url, strategy, locale, categories)
+            
+            if not pagespeed_data:
+                raise Exception("Failed to fetch PageSpeed data")
+            
+            # Extract and structure the data
+            structured_results = self._structure_pagespeed_results(pagespeed_data)
+            
+            # Generate AI-enhanced insights
+            ai_insights = await self._generate_ai_insights(structured_results, url, strategy)
+            
+            # Calculate optimization priority
+            optimization_plan = self._create_optimization_plan(structured_results)
+            
+            execution_time = (datetime.utcnow() - start_time).total_seconds()
+            
+            result = {
+                "url": url,
+                "strategy": strategy,
+                "analysis_date": datetime.utcnow().isoformat(),
+                "core_web_vitals": structured_results.get("core_web_vitals", {}),
+                "category_scores": structured_results.get("category_scores", {}),
+                "metrics": structured_results.get("metrics", {}),
+                "opportunities": structured_results.get("opportunities", []),
+                "diagnostics": structured_results.get("diagnostics", []),
+                "ai_insights": ai_insights,
+                "optimization_plan": optimization_plan,
+                "raw_data": {
+                    "lighthouse_version": pagespeed_data.get("lighthouseResult", {}).get("lighthouseVersion"),
+                    "fetch_time": pagespeed_data.get("analysisUTCTimestamp"),
+                    "categories_analyzed": categories
+                },
+                "execution_time": execution_time
+            }
+            
+            # Log the operation
+            await seo_logger.log_tool_usage(
+                tool_name=self.service_name,
+                input_data={
+                    "url": url,
+                    "strategy": strategy,
+                    "locale": locale,
+                    "categories": categories
+                },
+                output_data=result,
+                success=True
+            )
+            
+            await seo_logger.log_external_api_call(
+                api_name="Google PageSpeed Insights",
+                endpoint=self.base_url,
+                response_code=200,
+                response_time=execution_time,
+                request_data={"url": url, "strategy": strategy}
+            )
+            
+            logger.info(f"PageSpeed analysis completed for {url}")
+            return result
+            
+        except Exception as e:
+            logger.error(f"Error analyzing PageSpeed for {url}: {e}")
+            
+            # Log the error
+            await seo_logger.log_tool_usage(
+                tool_name=self.service_name,
+                input_data={
+                    "url": url,
+                    "strategy": strategy,
+                    "locale": locale,
+                    "categories": categories
+                },
+                output_data={"error": str(e)},
+                success=False
+            )
+            
+            raise
+    
+    async def _fetch_pagespeed_data(
+        self,
+        url: str,
+        strategy: str,
+        locale: str,
+        categories: List[str]
+    ) -> Dict[str, Any]:
+        """Fetch data from Google PageSpeed Insights API"""
+        
+        # Build API URL
+        api_url = f"{self.base_url}?url={url}&strategy={strategy}&locale={locale}"
+        
+        # Add categories
+        for category in categories:
+            api_url += f"&category={category}"
+        
+        # Add API key if available
+        if self.api_key:
+            api_url += f"&key={self.api_key}"
+        
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(api_url, timeout=aiohttp.ClientTimeout(total=60)) as response:
+                    if response.status == 200:
+                        data = await response.json()
+                        return data
+                    else:
+                        error_text = await response.text()
+                        logger.error(f"PageSpeed API error {response.status}: {error_text}")
+                        
+                        if response.status == 429:
+                            raise Exception("PageSpeed API rate limit exceeded")
+                        elif response.status == 400:
+                            raise Exception(f"Invalid URL or parameters: {error_text}")
+                        else:
+                            raise Exception(f"PageSpeed API error: {response.status}")
+                        
+        except asyncio.TimeoutError:
+            raise Exception("PageSpeed API request timed out")
+        except Exception as e:
+            logger.error(f"Error fetching PageSpeed data: {e}")
+            raise
+    
+    def _structure_pagespeed_results(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Structure PageSpeed results into organized format"""
+        
+        lighthouse_result = data.get("lighthouseResult", {})
+        categories = lighthouse_result.get("categories", {})
+        audits = lighthouse_result.get("audits", {})
+        
+        # Extract category scores
+        category_scores = {}
+        for category_name, category_data in categories.items():
+            category_scores[category_name] = {
+                "score": round(category_data.get("score", 0) * 100),
+                "title": category_data.get("title", ""),
+                "description": category_data.get("description", "")
+            }
+        
+        # Extract Core Web Vitals
+        core_web_vitals = {}
+        cwv_metrics = ["largest-contentful-paint", "first-input-delay", "cumulative-layout-shift"]
+        
+        for metric in cwv_metrics:
+            if metric in audits:
+                audit_data = audits[metric]
+                core_web_vitals[metric] = {
+                    "score": audit_data.get("score"),
+                    "displayValue": audit_data.get("displayValue"),
+                    "numericValue": audit_data.get("numericValue"),
+                    "title": audit_data.get("title"),
+                    "description": audit_data.get("description")
+                }
+        
+        # Extract key metrics
+        key_metrics = {}
+        important_metrics = [
+            "first-contentful-paint",
+            "speed-index", 
+            "largest-contentful-paint",
+            "interactive",
+            "total-blocking-time",
+            "cumulative-layout-shift"
+        ]
+        
+        for metric in important_metrics:
+            if metric in audits:
+                audit_data = audits[metric]
+                key_metrics[metric] = {
+                    "score": audit_data.get("score"),
+                    "displayValue": audit_data.get("displayValue"),
+                    "numericValue": audit_data.get("numericValue"),
+                    "title": audit_data.get("title")
+                }
+        
+        # Extract opportunities (performance improvements)
+        opportunities = []
+        for audit_id, audit_data in audits.items():
+            if (audit_data.get("scoreDisplayMode") == "numeric" and 
+                audit_data.get("score") is not None and 
+                audit_data.get("score") < 1 and
+                audit_data.get("details", {}).get("overallSavingsMs", 0) > 0):
+                
+                opportunities.append({
+                    "id": audit_id,
+                    "title": audit_data.get("title", ""),
+                    "description": audit_data.get("description", ""),
+                    "score": audit_data.get("score", 0),
+                    "savings_ms": audit_data.get("details", {}).get("overallSavingsMs", 0),
+                    "savings_bytes": audit_data.get("details", {}).get("overallSavingsBytes", 0),
+                    "displayValue": audit_data.get("displayValue", "")
+                })
+        
+        # Sort opportunities by potential savings
+        opportunities.sort(key=lambda x: x["savings_ms"], reverse=True)
+        
+        # Extract diagnostics
+        diagnostics = []
+        for audit_id, audit_data in audits.items():
+            if (audit_data.get("scoreDisplayMode") == "informative" or
+                (audit_data.get("score") is not None and audit_data.get("score") < 1)):
+                
+                if audit_id not in [op["id"] for op in opportunities]:
+                    diagnostics.append({
+                        "id": audit_id,
+                        "title": audit_data.get("title", ""),
+                        "description": audit_data.get("description", ""),
+                        "score": audit_data.get("score"),
+                        "displayValue": audit_data.get("displayValue", "")
+                    })
+        
+        return {
+            "category_scores": category_scores,
+            "core_web_vitals": core_web_vitals,
+            "metrics": key_metrics,
+            "opportunities": opportunities[:10],  # Top 10 opportunities
+            "diagnostics": diagnostics[:10]  # Top 10 diagnostics
+        }
+    
+    async def _generate_ai_insights(
+        self,
+        structured_results: Dict[str, Any],
+        url: str,
+        strategy: str
+    ) -> Dict[str, Any]:
+        """Generate AI-powered insights and recommendations"""
+        
+        try:
+            # Prepare data for AI analysis
+            performance_score = structured_results.get("category_scores", {}).get("performance", {}).get("score", 0)
+            opportunities = structured_results.get("opportunities", [])
+            core_web_vitals = structured_results.get("core_web_vitals", {})
+            
+            # Build AI prompt
+            prompt = self._build_ai_analysis_prompt(
+                url, strategy, performance_score, opportunities, core_web_vitals
+            )
+            
+            # Generate AI insights
+            ai_response = llm_text_gen(
+                prompt=prompt,
+                system_prompt=self._get_system_prompt()
+            )
+            
+            # Parse AI response
+            insights = self._parse_ai_insights(ai_response)
+            
+            # Log AI analysis
+            await seo_logger.log_ai_analysis(
+                tool_name=self.service_name,
+                prompt=prompt,
+                response=ai_response,
+                model_used="gemini-2.0-flash-001"
+            )
+            
+            return insights
+            
+        except Exception as e:
+            logger.error(f"Error generating AI insights: {e}")
+            return {
+                "summary": "AI analysis unavailable",
+                "priority_actions": [],
+                "technical_recommendations": [],
+                "business_impact": "Analysis could not be completed"
+            }
+    
+    def _build_ai_analysis_prompt(
+        self,
+        url: str,
+        strategy: str,
+        performance_score: int,
+        opportunities: List[Dict],
+        core_web_vitals: Dict
+    ) -> str:
+        """Build AI prompt for performance analysis"""
+        
+        opportunities_text = "\n".join([
+            f"- {opp['title']}: {opp['displayValue']} (Potential savings: {opp['savings_ms']}ms)"
+            for opp in opportunities[:5]
+        ])
+        
+        cwv_text = "\n".join([
+            f"- {metric.replace('-', ' ').title()}: {data.get('displayValue', 'N/A')}"
+            for metric, data in core_web_vitals.items()
+        ])
+        
+        prompt = f"""
+Analyze this website performance data and provide actionable insights for digital marketers and content creators:
+
+Website: {url}
+Device: {strategy}
+Performance Score: {performance_score}/100
+
+Core Web Vitals:
+{cwv_text}
+
+Top Performance Opportunities:
+{opportunities_text}
+
+Please provide:
+1. Executive Summary (2-3 sentences for non-technical users)
+2. Top 3 Priority Actions (specific, actionable steps)
+3. Technical Recommendations (for developers)
+4. Business Impact Assessment (how performance affects conversions, SEO, user experience)
+5. Quick Wins (easy improvements that can be implemented immediately)
+
+Focus on practical advice that content creators and digital marketers can understand and act upon.
+"""
+        
+        return prompt
+    
+    def _get_system_prompt(self) -> str:
+        """Get system prompt for AI analysis"""
+        return """You are a web performance expert specializing in translating technical PageSpeed data into actionable business insights. 
+        Your audience includes content creators, digital marketers, and solopreneurs who need to understand how website performance impacts their business goals.
+        
+        Provide clear, actionable recommendations that balance technical accuracy with business practicality.
+        Always explain the "why" behind recommendations and their potential impact on user experience, SEO, and conversions.
+        """
+    
+    def _parse_ai_insights(self, ai_response: str) -> Dict[str, Any]:
+        """Parse AI response into structured insights"""
+        
+        # Initialize default structure
+        insights = {
+            "summary": "",
+            "priority_actions": [],
+            "technical_recommendations": [],
+            "business_impact": "",
+            "quick_wins": []
+        }
+        
+        try:
+            # Split response into sections
+            sections = ai_response.split('\n\n')
+            
+            current_section = None
+            for section in sections:
+                section = section.strip()
+                if not section:
+                    continue
+                
+                # Identify section type
+                if 'executive summary' in section.lower() or 'summary' in section.lower():
+                    insights["summary"] = self._extract_content(section)
+                elif 'priority actions' in section.lower() or 'top 3' in section.lower():
+                    insights["priority_actions"] = self._extract_list_items(section)
+                elif 'technical recommendations' in section.lower():
+                    insights["technical_recommendations"] = self._extract_list_items(section)
+                elif 'business impact' in section.lower():
+                    insights["business_impact"] = self._extract_content(section)
+                elif 'quick wins' in section.lower():
+                    insights["quick_wins"] = self._extract_list_items(section)
+            
+            # Fallback parsing if sections not clearly identified
+            if not any(insights.values()):
+                insights["summary"] = ai_response[:300] + "..." if len(ai_response) > 300 else ai_response
+                
+        except Exception as e:
+            logger.error(f"Error parsing AI insights: {e}")
+            insights["summary"] = "AI analysis completed but parsing failed"
+        
+        return insights
+    
+    def _extract_content(self, section: str) -> str:
+        """Extract content from a section, removing headers"""
+        lines = section.split('\n')
+        content_lines = []
+        
+        for line in lines:
+            line = line.strip()
+            if line and not line.endswith(':') and not line.startswith('#'):
+                content_lines.append(line)
+        
+        return ' '.join(content_lines)
+    
+    def _extract_list_items(self, section: str) -> List[str]:
+        """Extract list items from a section"""
+        items = []
+        lines = section.split('\n')
+        
+        for line in lines:
+            line = line.strip()
+            if line and (line.startswith('-') or line.startswith('*') or 
+                        line[0].isdigit() and '.' in line[:3]):
+                # Remove bullet points and numbering
+                clean_line = line.lstrip('-*0123456789. ').strip()
+                if clean_line:
+                    items.append(clean_line)
+        
+        return items[:5]  # Limit to 5 items per section
+    
+    def _create_optimization_plan(self, structured_results: Dict[str, Any]) -> Dict[str, Any]:
+        """Create a prioritized optimization plan"""
+        
+        opportunities = structured_results.get("opportunities", [])
+        category_scores = structured_results.get("category_scores", {})
+        
+        # Calculate priority score for each opportunity
+        prioritized_opportunities = []
+        for opp in opportunities:
+            priority_score = self._calculate_priority_score(opp)
+            prioritized_opportunities.append({
+                **opp,
+                "priority_score": priority_score,
+                "difficulty": self._estimate_difficulty(opp["id"]),
+                "impact": self._estimate_impact(opp["savings_ms"])
+            })
+        
+        # Sort by priority score
+        prioritized_opportunities.sort(key=lambda x: x["priority_score"], reverse=True)
+        
+        # Create implementation phases
+        phases = {
+            "immediate": [],  # High impact, low difficulty
+            "short_term": [],  # Medium impact or difficulty
+            "long_term": []   # High difficulty but important
+        }
+        
+        for opp in prioritized_opportunities:
+            if opp["difficulty"] == "Low" and opp["impact"] in ["High", "Medium"]:
+                phases["immediate"].append(opp)
+            elif opp["difficulty"] in ["Low", "Medium"]:
+                phases["short_term"].append(opp)
+            else:
+                phases["long_term"].append(opp)
+        
+        return {
+            "overall_assessment": self._generate_overall_assessment(category_scores),
+            "prioritized_opportunities": prioritized_opportunities[:10],
+            "implementation_phases": phases,
+            "estimated_improvement": self._estimate_total_improvement(prioritized_opportunities[:5])
+        }
+    
+    def _calculate_priority_score(self, opportunity: Dict[str, Any]) -> int:
+        """Calculate priority score for an opportunity"""
+        savings_ms = opportunity.get("savings_ms", 0)
+        savings_bytes = opportunity.get("savings_bytes", 0)
+        
+        # Base score from time savings
+        score = min(savings_ms / 100, 50)  # Cap at 50 points
+        
+        # Add points for byte savings
+        score += min(savings_bytes / 10000, 25)  # Cap at 25 points
+        
+        # Bonus points for specific high-impact optimizations
+        high_impact_audits = [
+            "unused-javascript",
+            "render-blocking-resources", 
+            "largest-contentful-paint-element",
+            "cumulative-layout-shift"
+        ]
+        
+        if opportunity.get("id") in high_impact_audits:
+            score += 25
+        
+        return min(int(score), 100)
+    
+    def _estimate_difficulty(self, audit_id: str) -> str:
+        """Estimate implementation difficulty"""
+        
+        easy_fixes = [
+            "unused-css-rules",
+            "unused-javascript",
+            "render-blocking-resources",
+            "image-size-responsive"
+        ]
+        
+        medium_fixes = [
+            "largest-contentful-paint-element",
+            "cumulative-layout-shift",
+            "total-blocking-time"
+        ]
+        
+        if audit_id in easy_fixes:
+            return "Low"
+        elif audit_id in medium_fixes:
+            return "Medium"
+        else:
+            return "High"
+    
+    def _estimate_impact(self, savings_ms: int) -> str:
+        """Estimate performance impact"""
+        if savings_ms >= 1000:
+            return "High"
+        elif savings_ms >= 500:
+            return "Medium"
+        else:
+            return "Low"
+    
+    def _generate_overall_assessment(self, category_scores: Dict[str, Any]) -> str:
+        """Generate overall performance assessment"""
+        
+        performance_score = category_scores.get("performance", {}).get("score", 0)
+        
+        if performance_score >= 90:
+            return "Excellent performance with minor optimization opportunities"
+        elif performance_score >= 70:
+            return "Good performance with some areas for improvement"
+        elif performance_score >= 50:
+            return "Average performance requiring attention to key areas"
+        else:
+            return "Poor performance requiring immediate optimization efforts"
+    
+    def _estimate_total_improvement(self, top_opportunities: List[Dict]) -> Dict[str, Any]:
+        """Estimate total improvement from top opportunities"""
+        
+        total_savings_ms = sum(opp.get("savings_ms", 0) for opp in top_opportunities)
+        total_savings_mb = sum(opp.get("savings_bytes", 0) for opp in top_opportunities) / (1024 * 1024)
+        
+        # Estimate score improvement (rough calculation)
+        estimated_score_gain = min(total_savings_ms / 200, 30)  # Conservative estimate
+        
+        return {
+            "potential_time_savings": f"{total_savings_ms/1000:.1f} seconds",
+            "potential_size_savings": f"{total_savings_mb:.1f} MB",
+            "estimated_score_improvement": f"+{estimated_score_gain:.0f} points",
+            "confidence": "Medium" if total_savings_ms > 1000 else "Low"
+        }
+    
+    async def health_check(self) -> Dict[str, Any]:
+        """Health check for the PageSpeed service"""
+        try:
+            # Test with a simple URL
+            test_url = "https://example.com"
+            result = await self.analyze_pagespeed(test_url, "DESKTOP", "en", ["performance"])
+            
+            return {
+                "status": "operational",
+                "service": self.service_name,
+                "api_key_configured": bool(self.api_key),
+                "test_passed": bool(result.get("category_scores")),
+                "last_check": datetime.utcnow().isoformat()
+            }
+        except Exception as e:
+            return {
+                "status": "error",
+                "service": self.service_name,
+                "error": str(e),
+                "last_check": datetime.utcnow().isoformat()
+            }