""" SEO Analyzers Module Contains all individual SEO analysis components. """ import re import time import requests from urllib.parse import urlparse, urljoin from typing import Dict, List, Any, Optional from bs4 import BeautifulSoup from loguru import logger class BaseAnalyzer: """Base class for all SEO analyzers""" def __init__(self): self.session = requests.Session() self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' }) class URLStructureAnalyzer(BaseAnalyzer): """Analyzes URL structure and security""" def analyze(self, url: str) -> Dict[str, Any]: """Enhanced URL structure analysis with specific fixes""" parsed = urlparse(url) issues = [] warnings = [] recommendations = [] # Check URL length if len(url) > 2000: issues.append({ 'type': 'critical', 'message': f'URL is too long ({len(url)} characters)', 'location': 'URL', 'current_value': url, 'fix': 'Shorten URL to under 2000 characters', 'code_example': f'Link', 'action': 'shorten_url' }) # Check for hyphens if '_' in parsed.path and '-' not in parsed.path: issues.append({ 'type': 'critical', 'message': 'URL uses underscores instead of hyphens', 'location': 'URL', 'current_value': parsed.path, 'fix': 'Replace underscores with hyphens', 'code_example': f'Link', 'action': 'replace_underscores' }) # Check for special characters special_chars = re.findall(r'[^a-zA-Z0-9\-_/]', parsed.path) if special_chars: warnings.append({ 'type': 'warning', 'message': f'URL contains special characters: {", ".join(set(special_chars))}', 'location': 'URL', 'current_value': parsed.path, 'fix': 'Remove special characters from URL', 'code_example': f'Link', 'action': 'remove_special_chars' }) # Check for HTTPS if parsed.scheme != 'https': issues.append({ 'type': 'critical', 'message': 'URL is not using HTTPS', 'location': 'URL', 'current_value': parsed.scheme, 'fix': 'Redirect to HTTPS', 'code_example': 'RewriteEngine On\nRewriteCond %{HTTPS} off\nRewriteRule ^(.*)$ https://%{HTTP_HOST}%{REQUEST_URI} [L,R=301]', 'action': 'enable_https' }) score = max(0, 100 - len(issues) * 25 - len(warnings) * 10) return { 'score': score, 'issues': issues, 'warnings': warnings, 'recommendations': recommendations, 'url_length': len(url), 'has_https': parsed.scheme == 'https', 'has_hyphens': '-' in parsed.path, 'special_chars_count': len(special_chars) } class MetaDataAnalyzer(BaseAnalyzer): """Analyzes meta data and technical SEO elements""" def analyze(self, html_content: str, url: str) -> Dict[str, Any]: """Enhanced meta data analysis with specific element locations""" soup = BeautifulSoup(html_content, 'html.parser') issues = [] warnings = [] recommendations = [] # Title analysis title_tag = soup.find('title') if not title_tag: issues.append({ 'type': 'critical', 'message': 'Missing title tag', 'location': '
                'fix': 'Add title tag to head section',
                'code_example': '<title>Your Page Title</title>',
                'action': 'add_title'
            })
        # Content extraction for the content-quality checks below
        text = soup.get_text()
        words = text.split()
        word_count = len(words)
        h1_tags = soup.find_all('h1')
        # Check for images without alt text
        images = soup.find_all('img')
        images_without_alt = [img for img in images if not img.get('alt')]
        if images_without_alt:
            issues.append({
                'type': 'critical',
                'message': f'{len(images_without_alt)} images missing alt text',
                'location': 'Page content',
                'current_value': f'{len(images_without_alt)} of {len(images)} images',
                'fix': 'Add descriptive alt text to all images',
                'code_example': '<img src="image.jpg" alt="Descriptive image text">',
                'action': 'add_alt_text'
            })
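        # Note on the heuristic below: a link counts as internal when its href
        # does not start with http(s)://, so protocol-relative ("//host"),
        # "mailto:", and fragment ("#section") hrefs are also counted. A
        # stricter check would resolve each href and compare hostnames
        # against `url`.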
        # Check for internal links
        internal_links = soup.find_all('a', href=re.compile(r'^(?!https?://)'))
        if len(internal_links) < 3:
            warnings.append({
                'type': 'warning',
                'message': f'Few internal links ({len(internal_links)} found)',
                'location': 'Page content',
                'current_value': f'{len(internal_links)} internal links',
                'fix': 'Add more internal links to improve site structure',
                'code_example': '<a href="/related-page">Related content</a>',
                'action': 'add_internal_links'
            })
        # Check for spelling errors (basic check)
        common_words = ['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']
        potential_errors = []
        for word in words[:100]:  # Check first 100 words
            if len(word) > 3 and word.lower() not in common_words:
                # Basic spell check (simplified; in production you'd use a proper spell checker)
                if re.search(r'[a-z]{15,}', word.lower()):  # Very long words might be misspelled
                    potential_errors.append(word)
        if potential_errors:
            issues.append({
                'type': 'critical',
                'message': f'Potential spelling errors found: {", ".join(potential_errors[:5])}',
                'location': 'Page content',
                'current_value': f'{len(potential_errors)} potential errors',
                'fix': 'Review and correct spelling errors',
                'code_example': 'Use spell checker or proofread content',
                'action': 'fix_spelling'
            })
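        # Scoring convention shared by the analyzers in this module:
        # start from 100, subtract 25 per critical issue and 10 per warning,
        # and floor at 0 so a very broken page still returns a valid score.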
        score = max(0, 100 - len(issues) * 25 - len(warnings) * 10)
        return {
            'score': score,
            'issues': issues,
            'warnings': warnings,
            'recommendations': recommendations,
            'word_count': word_count,
            'h1_count': len(h1_tags),
            'images_count': len(images),
            'images_without_alt': len(images_without_alt),
            'internal_links_count': len(internal_links),
            'potential_spelling_errors': len(potential_errors)
        }
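

# A minimal usage sketch for the analyzer above (assuming `html` already
# holds fetched page markup; fetching and error handling are omitted):
#   analyzer = MetaDataAnalyzer()
#   result = analyzer.analyze(html, 'https://example.com/')
#   print(result['score'], len(result['issues']), len(result['warnings']))
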
class TechnicalSEOAnalyzer(BaseAnalyzer):
    """Analyzes technical SEO elements"""

    def analyze(self, html_content: str, url: str) -> Dict[str, Any]:
        """Enhanced technical SEO analysis with specific fixes"""
        soup = BeautifulSoup(html_content, 'html.parser')
        issues = []
        warnings = []
        recommendations = []
        # Check for robots.txt
        robots_url = urljoin(url, '/robots.txt')
        try:
            robots_response = self.session.get(robots_url, timeout=5)
            if robots_response.status_code != 200:
                warnings.append({
                    'type': 'warning',
                    'message': 'Robots.txt not accessible',
                    'location': 'Server',
                    'fix': 'Create robots.txt file',
                    'code_example': 'User-agent: *\nAllow: /',
                    'action': 'create_robots_txt'
                })
        except requests.RequestException:
            warnings.append({
                'type': 'warning',
                'message': 'Robots.txt not found',
                'location': 'Server',
                'fix': 'Create robots.txt file',
                'code_example': 'User-agent: *\nAllow: /',
                'action': 'create_robots_txt'
            })
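        # Note: fetch failures above are recorded as warnings, not critical
        # issues; a missing robots.txt limits crawl guidance but does not by
        # itself block indexing.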
        # Check for sitemap
        sitemap_url = urljoin(url, '/sitemap.xml')
        try:
            sitemap_response = self.session.get(sitemap_url, timeout=5)
            if sitemap_response.status_code != 200:
                warnings.append({
                    'type': 'warning',
                    'message': 'Sitemap not accessible',
                    'location': 'Server',
                    'fix': 'Create XML sitemap',
                    'code_example': '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n  <url><loc>https://example.com/</loc></url>\n</urlset>',
                    'action': 'create_sitemap'
                })
        except requests.RequestException:
            warnings.append({
                'type': 'warning',
                'message': 'Sitemap not found',
                'location': 'Server',
                'fix': 'Create XML sitemap',
                'action': 'create_sitemap'
            })
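        # Note: the robots.txt and sitemap checks above issue live HTTP
        # requests with short timeouts, so this analyzer requires network
        # access and adds a little latency per analyzed page.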
        # Check for form labels
        forms = soup.find_all('form')
        for form in forms:
            inputs = form.find_all(['input', 'textarea', 'select'])
            for input_elem in inputs:
                if input_elem.get('type') not in ['hidden', 'submit', 'button']:
                    input_id = input_elem.get('id')
                    if input_id:
                        label = soup.find('label', attrs={'for': input_id})
                        if not label:
                            warnings.append({
                                'type': 'warning',
                                'message': f'Input without label (ID: {input_id})',
                                'location': 'Form',
                                'current_value': f'Input ID: {input_id}',
                                'fix': 'Add label for input field',
                                'code_example': f'<label for="{input_id}">Field label</label>',
                                'action': 'add_form_label'
                            })
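        # Note: only inputs that carry an id are checked above; inputs without
        # ids, inputs using aria-label, and labels that wrap their control are
        # not flagged, so this check can miss some unlabeled fields.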
        # Check for heading hierarchy
        headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        if headings:
            h1_count = len([h for h in headings if h.name == 'h1'])
            if h1_count == 0:
                issues.append({
                    'type': 'critical',
                    'message': 'No H1 heading found',
                    'location': 'Page structure',
                    'fix': 'Add H1 heading for main content',
                    'code_example': '<h1>Main Page Heading</h1>',
                    'action': 'add_h1'
                })
        # Check for viewport meta tag
        viewport = soup.find('meta', attrs={'name': 'viewport'})
        if not viewport:
            issues.append({
                'type': 'critical',
                'message': 'Missing viewport meta tag',
                'location': '<head>',
                'fix': 'Add viewport meta tag for mobile rendering',
                'code_example': '<meta name="viewport" content="width=device-width, initial-scale=1">',
                'action': 'add_viewport'
            })
        # Check for navigation elements
        nav_elements = soup.find_all('nav')
        # Check for contact information
        page_text = soup.get_text().lower()
        contact_patterns = ['contact', 'email', 'phone']
        has_contact = any(pattern in page_text for pattern in contact_patterns)
        if not has_contact:
            recommendations.append({
                'type': 'recommendation',
                'message': 'No contact information found',
                'location': 'Page content',
                'fix': 'Add contact information to the page',
                'code_example': '<a href="mailto:info@example.com">Contact us: info@example.com</a>',
                'action': 'add_contact_info'
            })
        # Check for social media links
        social_patterns = ['facebook', 'twitter', 'linkedin', 'instagram']
        has_social = any(pattern in page_text for pattern in social_patterns)
        if not has_social:
            recommendations.append({
                'type': 'recommendation',
                'message': 'No social media links found',
                'location': 'Page content',
                'fix': 'Add social media links',
                'code_example': '<a href="https://facebook.com/yourpage">Facebook</a>',
                'action': 'add_social_links',
                'priority': 'low'
            })
        score = max(0, 100 - len(issues) * 25 - len(warnings) * 10)
        return {
            'score': score,
            'issues': issues,
            'warnings': warnings,
            'recommendations': recommendations,
            'has_viewport': bool(viewport),
            'has_navigation': bool(nav_elements),
            'has_contact': has_contact,
            'has_social': has_social
        }


class SecurityHeadersAnalyzer(BaseAnalyzer):
    """Analyzes security headers"""

    def analyze(self, url: str) -> Dict[str, Any]:
        """Enhanced security headers analysis with specific fixes"""
        try:
            response = self.session.get(url, timeout=15, allow_redirects=True)
            security_headers = {
                'X-Frame-Options': response.headers.get('X-Frame-Options'),
                'X-Content-Type-Options': response.headers.get('X-Content-Type-Options'),
                'X-XSS-Protection': response.headers.get('X-XSS-Protection'),
                'Strict-Transport-Security': response.headers.get('Strict-Transport-Security'),
                'Content-Security-Policy': response.headers.get('Content-Security-Policy'),
                'Referrer-Policy': response.headers.get('Referrer-Policy')
            }
            issues = []
            warnings = []
            recommendations = []
            present_headers = []
            missing_headers = []
            # Recommended example values for the non-critical headers below
            header_examples = {
                'X-XSS-Protection': '1; mode=block',
                'Strict-Transport-Security': 'max-age=31536000; includeSubDomains',
                'Content-Security-Policy': "default-src 'self'",
                'Referrer-Policy': 'strict-origin-when-cross-origin'
            }
            for header_name, header_value in security_headers.items():
                if header_value:
                    present_headers.append(header_name)
                else:
                    missing_headers.append(header_name)
                    if header_name in ['X-Frame-Options', 'X-Content-Type-Options']:
                        issues.append({
                            'type': 'critical',
                            'message': f'Missing {header_name} header',
                            'location': 'Server configuration',
                            'fix': f'Add {header_name} header',
                            'code_example': f'{header_name}: DENY' if header_name == 'X-Frame-Options' else f'{header_name}: nosniff',
                            'action': f'add_{header_name.lower().replace("-", "_")}_header'
                        })
                    else:
                        warnings.append({
                            'type': 'warning',
                            'message': f'Missing {header_name} header',
                            'location': 'Server configuration',
                            'fix': f'Add {header_name} header for better security',
                            'code_example': f'{header_name}: {header_examples[header_name]}',
                            'action': f'add_{header_name.lower().replace("-", "_")}_header'
                        })
            score = min(100, len(present_headers) * 16)
            return {
                'score': score,
                'present_headers': present_headers,
                'missing_headers': missing_headers,
                'total_headers': len(present_headers),
                'issues': issues,
                'warnings': warnings,
                'recommendations': recommendations
            }
        except Exception as e:
            logger.warning(f"Security headers analysis failed for {url}: {e}")
            return {
                'score': 0,
                'error': f'Error analyzing headers: {str(e)}',
                'present_headers': [],
                'missing_headers': ['All security headers'],
                'total_headers': 0,
                'issues': [{'type': 'critical', 'message': 'Could not analyze security headers', 'location': 'Server', 'fix': 'Check security headers manually', 'action': 'manual_check'}],
                'warnings': [{'type': 'warning', 'message': 'Security headers analysis failed', 'location': 'Server', 'fix': 'Verify security headers manually', 'action': 'manual_check'}],
                'recommendations': [{'type': 'recommendation', 'message': 'Check security headers manually', 'priority': 'medium', 'action': 'manual_check'}]
            }


class KeywordAnalyzer(BaseAnalyzer):
    """Analyzes keyword usage and optimization"""

    def analyze(self, html_content: str, target_keywords: Optional[List[str]] = None) -> Dict[str, Any]:
        """Enhanced keyword analysis with specific locations"""
        if not target_keywords:
            return {'score': 0, 'issues': [], 'warnings': [], 'recommendations': []}
        soup = BeautifulSoup(html_content, 'html.parser')
        issues = []
        warnings = []
        recommendations = []
        page_text = soup.get_text().lower()
        title_tag = soup.find('title')
        title_text = title_tag.get_text().lower() if title_tag else ""
        for keyword in target_keywords:
            keyword_lower = keyword.lower()
            # Check if keyword is in title
            if keyword_lower not in title_text:
                issues.append({
                    'type': 'critical',
                    'message': f'Target keyword "{keyword}" not in title',
                    'location': '<title>',