236 lines
9.2 KiB
Python
236 lines
9.2 KiB
Python
"""
On-Page SEO Analysis Service

Comprehensive on-page SEO analyzer with AI-enhanced insights
for content optimization and technical improvements.
"""
|
|
|
|
import copy
import re
from datetime import datetime, timezone
from typing import Dict, Any, List, Optional
from urllib.parse import urlparse

import aiohttp
from bs4 import BeautifulSoup
from loguru import logger
|
|
|
|
class OnPageSEOService:
    """Service for comprehensive on-page SEO analysis.

    Fetches a page over HTTP, parses it with BeautifulSoup and runs three
    scored analyzers (meta tags, technical markup, content) plus an unscored
    URL-structure breakdown. Every scored analyzer returns a dict that
    contains a ``score`` clamped to a minimum of 0 and a list of
    human-readable ``issues``.
    """

    # Total time budget (seconds) for a single page fetch.
    REQUEST_TIMEOUT_SECONDS = 10
    # Pages with fewer words than this are flagged as thin content.
    MIN_WORD_COUNT = 300

    def __init__(self):
        """Initialize the on-page SEO service"""
        self.service_name = "on_page_seo_analyzer"
        logger.info(f"Initialized {self.service_name}")

    @staticmethod
    def _attr(tag, name: str) -> Optional[str]:
        """Return attribute ``name`` of ``tag`` with whitespace stripped.

        Returns ``None`` when ``tag`` is ``None``, the attribute is absent,
        the value is not a string, or the value is empty after stripping.
        Uses ``.get`` so a tag lacking the attribute never raises ``KeyError``.
        """
        if tag is None:
            return None
        value = tag.get(name)
        if not isinstance(value, str):
            return None
        return value.strip() or None

    @staticmethod
    def _error_result(url: str, message: str, category: str) -> Dict[str, Any]:
        """Build the result returned when a page could not be analyzed.

        The dict has the same top-level keys as a successful result, with an
        ``overall_score`` of 0, one critical issue carrying ``message`` and
        ``category``, and every analysis section empty.
        """
        return {
            "url": url,
            "overall_score": 0,
            "summary": {"critical_issues": [{"message": message, "severity": "critical", "category": category}]},
            "meta": {}, "technical": {}, "content_health": {}, "url_structure": {}, "performance": {}, "accessibility": {}, "ux": {}
        }

    async def _fetch_page(self, url: str) -> tuple[Optional[str], int]:
        """Fetch page content.

        Args:
            url: Absolute URL to request.

        Returns:
            ``(html, 200)`` on an HTTP 200 response, ``(None, status)`` for
            any other status, and ``(None, 500)`` when the request itself
            raised (the exception is logged, not propagated).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +https://alwrity.com)'
        }
        # aiohttp expects a ClientTimeout object; passing a bare number is deprecated.
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT_SECONDS)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        # A wrongly declared charset should not turn a
                        # readable page into a fetch failure.
                        return await response.text(errors="replace"), 200
                    return None, response.status
        except Exception as e:
            logger.error(f"Error fetching {url}: {str(e)}")
            return None, 500

    def _analyze_meta_tags(self, soup: "BeautifulSoup") -> Dict[str, Any]:
        """Analyze meta tags.

        Checks the title (30-60 chars), the meta description (70-160 chars)
        and the viewport tag, and reports charset, robots, Open Graph and
        Twitter card tags. Lengths are measured on whitespace-stripped text.

        Args:
            soup: Parsed document.

        Returns:
            Dict with the findings, a ``score`` starting at 100 and the list
            of ``issues`` that reduced it.
        """
        # get_text also handles a <title> with nested markup, where .string is None.
        title = soup.title.get_text(strip=True) if soup.title else ""
        desc_content = self._attr(soup.find('meta', attrs={'name': 'description'}), 'content')
        viewport = soup.find('meta', attrs={'name': 'viewport'})
        robots = soup.find('meta', attrs={'name': 'robots'})
        charset = soup.find('meta', attrs={'charset': True})

        # Social Tags
        og_title = soup.find('meta', property='og:title')
        og_desc = soup.find('meta', property='og:description')
        og_image = soup.find('meta', property='og:image')
        twitter_card = soup.find('meta', attrs={'name': 'twitter:card'})

        issues = []
        score = 100

        # Title Analysis
        if not title:
            issues.append("Missing title tag")
            score -= 20
        elif len(title) < 30 or len(title) > 60:
            issues.append(f"Title length ({len(title)} chars) should be 30-60 chars")
            score -= 10

        # Description Analysis
        if not desc_content:
            issues.append("Missing meta description")
            score -= 20
        elif len(desc_content) < 70 or len(desc_content) > 160:
            issues.append(f"Description length ({len(desc_content)} chars) should be 70-160 chars")
            score -= 10

        # Viewport
        if not viewport:
            issues.append("Missing viewport meta tag")
            score -= 20

        og_found = [
            label
            for label, tag in (("Title", og_title), ("Desc", og_desc), ("Image", og_image))
            if tag
        ]

        return {
            "title_length": f"{len(title)} chars" if title else "Missing",
            "meta_description_length": f"{len(desc_content)} chars" if desc_content else "Missing",
            "has_viewport": bool(viewport),
            "charset": self._attr(charset, 'charset') or "Missing",
            "robots_meta": self._attr(robots, 'content') or "Missing (Default: index, follow)",
            "og_tags": f"Found: {', '.join(og_found)}" if og_found else "None",
            "twitter_card": self._attr(twitter_card, 'content') or "Missing",
            "score": max(0, score),
            "issues": issues
        }

    def _analyze_technical(self, soup: "BeautifulSoup", url: str) -> Dict[str, Any]:
        """Analyze technical SEO elements.

        Looks for a canonical link, counts JSON-LD ``<script>`` blocks and
        checks that the page has exactly one ``<h1>``.

        Args:
            soup: Parsed document.
            url: Page URL. Currently unused; kept for interface stability.

        Returns:
            Dict with the findings, a ``score`` starting at 100 and the list
            of ``issues`` that reduced it.
        """
        canonical = soup.find('link', attrs={'rel': 'canonical'})
        schema = soup.find_all('script', type='application/ld+json')

        issues = []
        score = 100

        if not canonical:
            issues.append("Missing canonical tag")
            score -= 10

        # Check H1
        h1_tags = soup.find_all('h1')
        if not h1_tags:
            issues.append("Missing H1 tag")
            score -= 20
        elif len(h1_tags) > 1:
            issues.append(f"Multiple H1 tags found ({len(h1_tags)})")
            score -= 10

        return {
            # A canonical <link> without an href is reported as missing
            # instead of raising KeyError.
            "canonical_tag": self._attr(canonical, 'href') or "Missing",
            "schema_markup": f"Found {len(schema)} schema objects",
            "h1_count": len(h1_tags),
            "score": max(0, score),
            "issues": issues
        }

    def _analyze_content(self, soup: "BeautifulSoup") -> Dict[str, Any]:
        """Analyze content quality.

        Counts words in the visible text (scripts and styles excluded) and
        images lacking a non-empty ``alt`` attribute. The caller's ``soup``
        is not modified.

        Args:
            soup: Parsed document.

        Returns:
            Dict with word and image counts, a placeholder ``readability``
            label, a ``score`` starting at 100 and the list of ``issues``.
        """
        # Strip scripts and styles from a copy so other analyzers that run
        # on the same soup still see JSON-LD and inline scripts.
        text_soup = copy.copy(soup)
        for node in text_soup(["script", "style"]):
            node.decompose()

        # A separator keeps words in adjacent elements from being glued
        # together ("<p>foo</p><p>bar</p>" is two words, not one).
        text = text_soup.get_text(separator=' ')
        words = len(re.findall(r'\w+', text))

        images = soup.find_all('img')
        images_without_alt = sum(1 for img in images if not img.get('alt'))

        issues = []
        score = 100

        if words < self.MIN_WORD_COUNT:
            issues.append(f"Low word count ({words} words)")
            score -= 20

        if images_without_alt > 0:
            issues.append(f"{images_without_alt} images missing alt text")
            score -= 10

        return {
            "word_count": words,
            "total_images": len(images),
            "images_without_alt": images_without_alt,
            # Placeholder for readability algo; uses the same threshold as
            # the low-word-count issue so the two never disagree.
            "readability": "Good" if words >= self.MIN_WORD_COUNT else "Needs Improvement",
            "score": max(0, score),
            "issues": issues
        }

    def _analyze_url_structure(self, url: str) -> Dict[str, Any]:
        """Break ``url`` into protocol, domain and path depth.

        ``path_depth`` is the number of non-empty path segments, so both
        ``https://example.com`` and ``https://example.com/`` have depth 0.
        """
        parsed = urlparse(url)
        segments = [segment for segment in parsed.path.split('/') if segment]
        return {
            "protocol": parsed.scheme,
            "domain": parsed.netloc,
            "path_depth": len(segments),
            "is_https": parsed.scheme == 'https'
        }

    def _calculate_overall_score(self, *analyses) -> int:
        """Return the rounded mean of the analyses' ``score`` values.

        An analysis without a ``score`` key counts as 0. Returns 0 when no
        analyses are given.
        """
        if not analyses:
            return 0
        total = sum(a.get('score', 0) for a in analyses)
        return round(total / len(analyses))

    def _generate_summary(self, *analyses) -> Dict[str, Any]:
        """Collect every issue from ``analyses`` into one summary.

        Each issue is reported with severity ``critical`` and category
        ``SEO``, in the order the analyses were passed.
        """
        critical_issues = [
            {"message": issue, "severity": "critical", "category": "SEO"}
            for analysis in analyses
            for issue in analysis.get('issues', [])
        ]
        return {"critical_issues": critical_issues}

    async def analyze_on_page_seo(
        self,
        url: str,
        target_keywords: Optional[List[str]] = None,
        analyze_images: bool = True,
        analyze_content_quality: bool = True
    ) -> Dict[str, Any]:
        """Analyze on-page SEO factors.

        Args:
            url: Page to analyze. Surrounding whitespace is stripped and
                ``https://`` is prepended when no http(s) scheme is present.
            target_keywords: Accepted for interface compatibility; not used
                by the current implementation.
            analyze_images: Accepted for interface compatibility; not used.
            analyze_content_quality: Accepted for interface compatibility;
                not used.

        Returns:
            Result dict with ``url``, ``overall_score``, one section per
            analyzer and a ``summary`` of issues. On any failure the same
            keys are returned with a score of 0 and a single critical issue;
            this method does not raise.
        """
        try:
            url = url.strip()
            # Add protocol if missing. URL schemes are case-insensitive, so
            # "HTTP://..." must not get a second scheme prepended.
            if not url.lower().startswith(('http://', 'https://')):
                url = 'https://' + url

            html_content, status_code = await self._fetch_page(url)

            if not html_content:
                return self._error_result(
                    url, f"Failed to fetch URL (Status: {status_code})", "Connectivity"
                )

            soup = BeautifulSoup(html_content, 'html.parser')

            # Run Analyses
            meta_analysis = self._analyze_meta_tags(soup)
            technical_analysis = self._analyze_technical(soup, url)
            content_analysis = self._analyze_content(soup)
            url_analysis = self._analyze_url_structure(url)

            return {
                "url": url,
                "overall_score": self._calculate_overall_score(meta_analysis, technical_analysis, content_analysis),
                "meta": meta_analysis,
                "technical": technical_analysis,
                "content_health": content_analysis,
                "url_structure": url_analysis,
                "performance": {"load_time": "Real-time check pending"},
                "accessibility": {"images_without_alt": content_analysis["images_without_alt"]},
                "ux": {"viewport": meta_analysis["has_viewport"], "mobile_friendly": bool(meta_analysis["has_viewport"])},
                "summary": self._generate_summary(meta_analysis, technical_analysis, content_analysis)
            }

        except Exception as e:
            # Top-level boundary: report the failure in the result rather
            # than propagating it to the caller.
            logger.error(f"Error analyzing {url}: {str(e)}")
            return self._error_result(url, str(e), "System")

    async def health_check(self) -> Dict[str, Any]:
        """Health check for the on-page SEO service.

        Returns:
            Dict with ``status``, the service name and the current UTC time
            as a naive ISO-8601 string (no offset suffix).
        """
        # datetime.utcnow() is deprecated; dropping tzinfo keeps the emitted
        # string format identical to what utcnow().isoformat() produced.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        return {
            "status": "operational",
            "service": self.service_name,
            "last_check": now_utc.isoformat()
        }
|