""" On-Page SEO Analysis Service Comprehensive on-page SEO analyzer with AI-enhanced insights for content optimization and technical improvements. """ import aiohttp from bs4 import BeautifulSoup from typing import Dict, Any, List, Optional from datetime import datetime from loguru import logger import re from urllib.parse import urlparse class OnPageSEOService: """Service for comprehensive on-page SEO analysis""" def __init__(self): """Initialize the on-page SEO service""" self.service_name = "on_page_seo_analyzer" logger.info(f"Initialized {self.service_name}") async def _fetch_page(self, url: str) -> tuple[Optional[str], int]: """Fetch page content""" try: headers = { 'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +https://alwrity.com)' } async with aiohttp.ClientSession() as session: async with session.get(url, headers=headers, timeout=10) as response: if response.status == 200: return await response.text(), 200 return None, response.status except Exception as e: logger.error(f"Error fetching {url}: {str(e)}") return None, 500 def _analyze_meta_tags(self, soup: BeautifulSoup) -> Dict[str, Any]: """Analyze meta tags""" title = soup.title.string if soup.title else None meta_desc = soup.find('meta', attrs={'name': 'description'}) viewport = soup.find('meta', attrs={'name': 'viewport'}) robots = soup.find('meta', attrs={'name': 'robots'}) charset = soup.find('meta', attrs={'charset': True}) # Social Tags og_title = soup.find('meta', property='og:title') og_desc = soup.find('meta', property='og:description') og_image = soup.find('meta', property='og:image') twitter_card = soup.find('meta', attrs={'name': 'twitter:card'}) issues = [] score = 100 # Title Analysis if not title: issues.append("Missing title tag") score -= 20 elif len(title) < 30 or len(title) > 60: issues.append(f"Title length ({len(title)} chars) should be 30-60 chars") score -= 10 # Description Analysis desc_content = meta_desc['content'] if meta_desc else None if not desc_content: issues.append("Missing meta description") score -= 20 elif len(desc_content) < 70 or len(desc_content) > 160: issues.append(f"Description length ({len(desc_content)} chars) should be 70-160 chars") score -= 10 # Viewport if not viewport: issues.append("Missing viewport meta tag") score -= 20 og_found = list(filter(None, ['Title' if og_title else '', 'Desc' if og_desc else '', 'Image' if og_image else ''])) return { "title_length": f"{len(title)} chars" if title else "Missing", "meta_description_length": f"{len(desc_content)} chars" if desc_content else "Missing", "has_viewport": bool(viewport), "charset": charset['charset'] if charset else "Missing", "robots_meta": robots['content'] if robots else "Missing (Default: index, follow)", "og_tags": f"Found: {', '.join(og_found)}" if og_found else "None", "twitter_card": twitter_card['content'] if twitter_card else "Missing", "score": max(0, score), "issues": issues } def _analyze_technical(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]: """Analyze technical SEO elements""" canonical = soup.find('link', attrs={'rel': 'canonical'}) schema = soup.find_all('script', type='application/ld+json') issues = [] score = 100 if not canonical: issues.append("Missing canonical tag") score -= 10 # Check H1 h1_tags = soup.find_all('h1') if len(h1_tags) == 0: issues.append("Missing H1 tag") score -= 20 elif len(h1_tags) > 1: issues.append(f"Multiple H1 tags found ({len(h1_tags)})") score -= 10 return { "canonical_tag": canonical['href'] if canonical else "Missing", "schema_markup": f"Found {len(schema)} schema objects", "h1_count": len(h1_tags), "score": max(0, score), "issues": issues } def _analyze_content(self, soup: BeautifulSoup) -> Dict[str, Any]: """Analyze content quality""" # Remove scripts and styles for script in soup(["script", "style"]): script.extract() text = soup.get_text() words = len(re.findall(r'\w+', text)) images = soup.find_all('img') images_without_alt = sum(1 for img in images if not img.get('alt')) issues = [] score = 100 if words < 300: issues.append(f"Low word count ({words} words)") score -= 20 if images_without_alt > 0: issues.append(f"{images_without_alt} images missing alt text") score -= 10 return { "word_count": words, "total_images": len(images), "images_without_alt": images_without_alt, "readability": "Good" if words > 300 else "Needs Improvement", # Placeholder for readability algo "score": max(0, score), "issues": issues } def _analyze_url_structure(self, url: str) -> Dict[str, Any]: parsed = urlparse(url) return { "protocol": parsed.scheme, "domain": parsed.netloc, "path_depth": len(parsed.path.strip('/').split('/')) if parsed.path else 0, "is_https": parsed.scheme == 'https' } def _calculate_overall_score(self, *analyses) -> int: total = sum(a.get('score', 0) for a in analyses) return round(total / len(analyses)) def _generate_summary(self, *analyses) -> Dict[str, Any]: critical_issues = [] for a in analyses: for issue in a.get('issues', []): critical_issues.append({"message": issue, "severity": "critical", "category": "SEO"}) return {"critical_issues": critical_issues} async def analyze_on_page_seo( self, url: str, target_keywords: Optional[List[str]] = None, analyze_images: bool = True, analyze_content_quality: bool = True ) -> Dict[str, Any]: """Analyze on-page SEO factors""" try: # Add protocol if missing if not url.startswith(('http://', 'https://')): url = 'https://' + url html_content, status_code = await self._fetch_page(url) if not html_content: # Return error structure return { "url": url, "overall_score": 0, "summary": {"critical_issues": [{"message": f"Failed to fetch URL (Status: {status_code})", "severity": "critical", "category": "Connectivity"}]}, "meta": {}, "technical": {}, "content_health": {}, "url_structure": {}, "performance": {}, "accessibility": {}, "ux": {} } soup = BeautifulSoup(html_content, 'html.parser') # Run Analyses meta_analysis = self._analyze_meta_tags(soup) technical_analysis = self._analyze_technical(soup, url) content_analysis = self._analyze_content(soup) url_analysis = self._analyze_url_structure(url) result = { "url": url, "overall_score": self._calculate_overall_score(meta_analysis, technical_analysis, content_analysis), "meta": meta_analysis, "technical": technical_analysis, "content_health": content_analysis, "url_structure": url_analysis, "performance": {"load_time": "Real-time check pending"}, "accessibility": {"images_without_alt": content_analysis["images_without_alt"]}, "ux": {"viewport": meta_analysis["has_viewport"], "mobile_friendly": bool(meta_analysis["has_viewport"])}, "summary": self._generate_summary(meta_analysis, technical_analysis, content_analysis) } return result except Exception as e: logger.error(f"Error analyzing {url}: {str(e)}") return { "url": url, "overall_score": 0, "summary": {"critical_issues": [{"message": str(e), "severity": "critical", "category": "System"}]}, "meta": {}, "technical": {}, "content_health": {}, "url_structure": {}, "performance": {}, "accessibility": {}, "ux": {} } async def health_check(self) -> Dict[str, Any]: """Health check for the on-page SEO service""" return { "status": "operational", "service": self.service_name, "last_check": datetime.utcnow().isoformat() }