# ALwrity/backend/services/seo_tools/technical_seo_service.py

"""
Technical SEO Analysis Service

Comprehensive technical SEO crawler and analyzer with AI-enhanced
insights for website optimization and search engine compatibility.
"""

import asyncio
import time
from datetime import datetime, timezone
from typing import Dict, Any, List, Optional
from urllib.parse import urlparse, urljoin

import aiohttp
from bs4 import BeautifulSoup
from loguru import logger

class TechnicalSEOService:
    """Single-page technical SEO analyzer.

    Fetches one URL, parses the returned HTML and reports on meta tags,
    heading structure, image alt text, link counts, response time and
    protocol security.
    """

    def __init__(self):
        """Set the service name and the HTTP headers sent with every request."""
        self.service_name = "technical_seo_analyzer"
        logger.info(f"Initialized {self.service_name}")
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +http://alwrity.com/bot)'
        }

    @staticmethod
    def _classify_links(hrefs: List[Optional[str]], base_url: str) -> Dict[str, List[str]]:
        """Split raw ``href`` values into internal and external absolute URLs.

        Every href is resolved against ``base_url`` and then classified by
        comparing host names, so a foreign URL that merely *contains* the
        site's domain (e.g. in its query string) is not mistaken for an
        internal link. A leading ``www.`` is ignored and subdomains of the
        analysed site count as internal.

        Args:
            hrefs: Raw ``href`` attribute values; ``None`` and empty values
                are skipped.
            base_url: URL of the page the links were found on.

        Returns:
            ``{"internal": [...], "external": [...]}`` with absolute URLs in
            document order. Fragment-only anchors (``#top``) and non-HTTP
            schemes (``mailto:``, ``javascript:``, ``tel:`` ...) are omitted.
        """
        site_host = (urlparse(base_url).hostname or "")
        if site_host.startswith("www."):
            site_host = site_host[4:]

        classified: Dict[str, List[str]] = {"internal": [], "external": []}
        for raw_href in hrefs:
            href = (raw_href or "").strip()
            # Same-page anchors do not point at another page.
            if not href or href.startswith("#"):
                continue
            try:
                absolute = urljoin(base_url, href)
                parsed = urlparse(absolute)
                link_host = parsed.hostname or ""
            except ValueError:
                # Malformed URL (e.g. broken IPv6 literal); ignore the link
                # instead of failing the whole analysis.
                continue
            if parsed.scheme not in ("http", "https"):
                continue
            if link_host.startswith("www."):
                link_host = link_host[4:]
            is_internal = bool(site_host) and (
                link_host == site_host or link_host.endswith("." + site_host)
            )
            classified["internal" if is_internal else "external"].append(absolute)
        return classified

    @staticmethod
    def _detect_issues(
        url: str,
        status_code: int,
        load_time: float,
        title: Optional[str],
        meta_description: Optional[str],
        h1_count: int,
        images_missing_alt: int
    ) -> List[Dict[str, Any]]:
        """Build the list of technical issues from already-extracted page facts.

        Args:
            url: The analysed URL (used for the HTTP/HTTPS check).
            status_code: HTTP status of the response.
            load_time: Seconds elapsed until the response was received.
            title: Text of the ``<title>`` tag, or ``None``/empty if absent.
            meta_description: Meta description content, or ``None``/empty.
            h1_count: Number of ``<h1>`` tags on the page.
            images_missing_alt: Number of ``<img>`` tags without alt text.

        Returns:
            A list of ``{"type", "severity", "pages_affected"}`` dicts.
        """
        issues: List[Dict[str, Any]] = []

        # Status Code Issues
        if status_code != 200:
            issues.append({"type": f"Status Code {status_code}", "severity": "High", "pages_affected": 1})

        # Performance Issues
        if load_time > 2.0:
            issues.append({"type": "Slow Server Response", "severity": "Medium", "pages_affected": 1})

        # Meta Issues
        if not title:
            issues.append({"type": "Missing Title Tag", "severity": "High", "pages_affected": 1})
        elif len(title) > 60:
            issues.append({"type": "Title Tag Too Long", "severity": "Low", "pages_affected": 1})

        if not meta_description:
            issues.append({"type": "Missing Meta Description", "severity": "High", "pages_affected": 1})

        # Content Structure Issues
        if h1_count == 0:
            issues.append({"type": "Missing H1 Tag", "severity": "High", "pages_affected": 1})
        elif h1_count > 1:
            issues.append({"type": "Multiple H1 Tags", "severity": "Medium", "pages_affected": 1})

        # Image Issues ("pages_affected" carries the number of offending images)
        if images_missing_alt:
            issues.append({"type": "Images Missing Alt Text", "severity": "Medium", "pages_affected": images_missing_alt})

        # Security Issues (urlparse lower-cases the scheme, so "HTTP://" is caught too)
        if urlparse(url).scheme == "http":
            issues.append({"type": "Insecure Protocol (HTTP)", "severity": "High", "pages_affected": 1})

        return issues

    async def analyze_technical_seo(
        self,
        url: str,
        crawl_depth: int = 3,
        include_external_links: bool = True,
        analyze_performance: bool = True
    ) -> Dict[str, Any]:
        """Fetch ``url`` and analyze its technical SEO factors.

        Args:
            url: Absolute URL of the page to analyze.
            crawl_depth: Accepted for interface compatibility; currently
                unused because only the single given page is fetched.
            include_external_links: When False, the external link count is
                reported as 0.
            analyze_performance: When False, ``performance_metrics`` is an
                empty dict.

        Returns:
            A report dict with the keys ``url``, ``pages_crawled``,
            ``crawl_depth``, ``technical_issues``, ``site_structure``,
            ``performance_metrics``, ``recommendations`` and
            ``crawl_summary``. If anything fails, a reduced dict with
            ``url``, ``error`` and a single "Crawl Failed" issue is returned
            instead of raising.
        """
        try:
            started = time.perf_counter()
            request_timeout = aiohttp.ClientTimeout(total=30)
            async with aiohttp.ClientSession(headers=self.headers) as session:
                async with session.get(url, timeout=request_timeout) as response:
                    # Measured once the response headers have arrived.
                    load_time = time.perf_counter() - started
                    status_code = response.status
                    # Replace undecodable bytes rather than abort on a bad charset.
                    content = await response.text(errors="replace")

            soup = BeautifulSoup(content, 'html.parser')

            # 1. Meta Tags Analysis
            # get_text() also works when <title> contains nested markup,
            # where .string would be None.
            title_tag = soup.title
            title = title_tag.get_text(strip=True) if title_tag else None
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            meta_desc_content = (meta_desc.get('content') or '').strip() if meta_desc else None

            # 2. Heading Structure
            h1_tags = soup.find_all('h1')
            h2_tags = soup.find_all('h2')
            h3_tags = soup.find_all('h3')

            # 3. Image Analysis (.get avoids a KeyError on <img> without src)
            images_without_alt = [
                img.get('src', '') for img in soup.find_all('img') if not img.get('alt')
            ]

            # 4. Link Analysis
            link_groups = self._classify_links(
                [anchor.get('href') for anchor in soup.find_all('a')], url
            )
            internal_links = link_groups["internal"]
            external_links = link_groups["external"]

            # 5. Technical Issues Detection
            issues = self._detect_issues(
                url=url,
                status_code=status_code,
                load_time=load_time,
                title=title,
                meta_description=meta_desc_content,
                h1_count=len(h1_tags),
                images_missing_alt=len(images_without_alt),
            )

            return {
                "url": url,
                "pages_crawled": 1, # Currently single page
                "crawl_depth": 1,
                "technical_issues": issues,
                "site_structure": {
                    "internal_links": len(internal_links),
                    "external_links": len(external_links) if include_external_links else 0,
                    "h1_count": len(h1_tags),
                    "h2_count": len(h2_tags),
                    "h3_count": len(h3_tags)
                },
                "performance_metrics": {
                    "response_time": round(load_time, 3),
                    # Length of the decoded text in characters, not bytes.
                    "content_size": len(content)
                } if analyze_performance else {},
                "recommendations": [issue['type'] for issue in issues],
                "crawl_summary": {
                    "successful": 1 if status_code == 200 else 0,
                    "errors": 1 if status_code >= 400 else 0,
                    "redirects": 1 if 300 <= status_code < 400 else 0
                }
            }

        except Exception as e:
            # Top-level boundary: report the failure in the result instead of raising.
            logger.error(f"Error in technical SEO analysis: {e}")
            return {
                "url": url,
                "error": str(e),
                "technical_issues": [{"type": "Crawl Failed", "severity": "High", "pages_affected": 1}]
            }

    async def health_check(self) -> Dict[str, Any]:
        """Report the service status.

        Returns:
            A dict with ``status``, ``service`` and ``last_check``, where
            ``last_check`` is the current UTC time as a naive ISO-8601 string.
        """
        # Timezone-aware "now" with tzinfo stripped gives the same naive-UTC
        # string as datetime.utcnow() without using the deprecated call.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        return {
            "status": "operational",
            "service": self.service_name,
            "last_check": now_utc.isoformat()
        }