""" Technical SEO Analysis Service Comprehensive technical SEO crawler and analyzer with AI-enhanced insights for website optimization and search engine compatibility. """ import aiohttp import asyncio from bs4 import BeautifulSoup from urllib.parse import urlparse, urljoin import time from typing import Dict, Any, List, Optional from loguru import logger class TechnicalSEOService: """Service for technical SEO analysis and crawling""" def __init__(self): """Initialize the technical SEO service""" self.service_name = "technical_seo_analyzer" logger.info(f"Initialized {self.service_name}") self.headers = { 'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +http://alwrity.com/bot)' } async def analyze_technical_seo( self, url: str, crawl_depth: int = 3, include_external_links: bool = True, analyze_performance: bool = True ) -> Dict[str, Any]: """Analyze technical SEO factors""" try: start_time = time.time() async with aiohttp.ClientSession(headers=self.headers) as session: async with session.get(url, timeout=30) as response: load_time = time.time() - start_time status_code = response.status content = await response.text() headers = response.headers # Basic parsing soup = BeautifulSoup(content, 'html.parser') # 1. Meta Tags Analysis title = soup.title.string if soup.title else None meta_desc = soup.find('meta', attrs={'name': 'description'}) meta_desc_content = meta_desc['content'] if meta_desc else None # 2. Heading Structure h1_tags = soup.find_all('h1') h2_tags = soup.find_all('h2') h3_tags = soup.find_all('h3') # 3. Image Analysis images = soup.find_all('img') images_without_alt = [img['src'] for img in images if not img.get('alt')] # 4. Link Analysis links = soup.find_all('a') internal_links = [] external_links = [] domain = urlparse(url).netloc for link in links: href = link.get('href') if not href: continue if href.startswith('http'): if domain in href: internal_links.append(href) else: external_links.append(href) elif href.startswith('/'): internal_links.append(urljoin(url, href)) # 5. Technical Issues Detection issues = [] # Status Code Issues if status_code != 200: issues.append({"type": f"Status Code {status_code}", "severity": "High", "pages_affected": 1}) # Performance Issues if load_time > 2.0: issues.append({"type": "Slow Server Response", "severity": "Medium", "pages_affected": 1}) # Meta Issues if not title: issues.append({"type": "Missing Title Tag", "severity": "High", "pages_affected": 1}) elif len(title) > 60: issues.append({"type": "Title Tag Too Long", "severity": "Low", "pages_affected": 1}) if not meta_desc_content: issues.append({"type": "Missing Meta Description", "severity": "High", "pages_affected": 1}) # Content Structure Issues if not h1_tags: issues.append({"type": "Missing H1 Tag", "severity": "High", "pages_affected": 1}) elif len(h1_tags) > 1: issues.append({"type": "Multiple H1 Tags", "severity": "Medium", "pages_affected": 1}) # Image Issues if images_without_alt: issues.append({"type": "Images Missing Alt Text", "severity": "Medium", "pages_affected": len(images_without_alt)}) # Security Issues if url.startswith('http:'): issues.append({"type": "Insecure Protocol (HTTP)", "severity": "High", "pages_affected": 1}) return { "url": url, "pages_crawled": 1, # Currently single page "crawl_depth": 1, "technical_issues": issues, "site_structure": { "internal_links": len(internal_links), "external_links": len(external_links) if include_external_links else 0, "h1_count": len(h1_tags), "h2_count": len(h2_tags), "h3_count": len(h3_tags) }, "performance_metrics": { "response_time": round(load_time, 3), "content_size": len(content) } if analyze_performance else {}, "recommendations": [issue['type'] for issue in issues], "crawl_summary": { "successful": 1 if status_code == 200 else 0, "errors": 1 if status_code >= 400 else 0, "redirects": 1 if 300 <= status_code < 400 else 0 } } except Exception as e: logger.error(f"Error in technical SEO analysis: {e}") return { "url": url, "error": str(e), "technical_issues": [{"type": "Crawl Failed", "severity": "High", "pages_affected": 1}] } async def health_check(self) -> Dict[str, Any]: """Health check for the technical SEO service""" return { "status": "operational", "service": self.service_name, "last_check": datetime.utcnow().isoformat() }