Files
moreminimore-marketing/backend/services/seo_analyzer/utils.py
Kunthawat Greethong c35fa52117 Base code
2026-01-08 22:39:53 +07:00

135 lines
5.3 KiB
Python

"""
SEO Analyzer Utilities
Contains utility classes for HTML fetching and AI insight generation.
"""
import requests
from typing import Optional, Dict, List, Any
from loguru import logger
class HTMLFetcher:
    """Fetch raw HTML for a URL through a shared ``requests`` session.

    The session is created once per instance and carries a desktop-browser
    ``User-Agent`` header. Fetch failures are logged and reported to the
    caller as ``None`` instead of being raised.
    """

    # Timeouts (seconds) for the attempts against the URL exactly as given:
    # one full-length try followed by a single shorter retry.
    _ATTEMPT_TIMEOUTS = (30, 15)
    # Timeout (seconds) for the final plain-HTTP fallback attempt.
    _FALLBACK_TIMEOUT = 15
    _HTTPS_PREFIX = "https://"

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def _try_fetch(self, target_url: str, timeout_s: int = 30) -> Optional[str]:
        """Perform one GET request and return the body text, or ``None`` on any error.

        Args:
            target_url: URL to request; redirects are followed.
            timeout_s: Timeout in seconds passed to the session.

        Returns:
            The response text, or ``None`` when the request raised or the
            response status was rejected by ``raise_for_status()``.
        """
        try:
            response = self.session.get(
                target_url,
                timeout=timeout_s,
                allow_redirects=True,
            )
            response.raise_for_status()
            return response.text
        except Exception as inner_e:
            # Deliberately broad: this is the boundary that converts every
            # fetch failure into a logged ``None`` for the caller.
            logger.error(f"Error fetching HTML from {target_url}: {inner_e}")
            return None

    def fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML content with one retry and an HTTPS-to-HTTP fallback.

        The URL is tried with each timeout in ``_ATTEMPT_TIMEOUTS``. If every
        attempt fails and the URL uses the ``https`` scheme (compared
        case-insensitively), the same address is tried once more over plain
        ``http``.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            The response body as text, or ``None`` if every attempt failed or
            ``url`` is not a non-empty string.
        """
        # Reject unusable input up front instead of letting it fail inside the
        # HTTP layer twice and then relying on a swallowed AttributeError.
        if not isinstance(url, str) or not url.strip():
            logger.error(f"Error fetching HTML: invalid URL {url!r}")
            return None

        for timeout_s in self._ATTEMPT_TIMEOUTS:
            html = self._try_fetch(url, timeout_s=timeout_s)
            if html is not None:
                return html

        # Some hosts reset TLS connections; retry once over plain HTTP.
        # URL schemes are case-insensitive, so "HTTPS://" qualifies as well.
        prefix = self._HTTPS_PREFIX
        if url[:len(prefix)].lower() == prefix:
            http_url = "http://" + url[len(prefix):]
            logger.info(f"SEO Analyzer: Falling back to HTTP for {http_url}")
            return self._try_fetch(http_url, timeout_s=self._FALLBACK_TIMEOUT)

        return None
class AIInsightGenerator:
    """Derive prioritized SEO insights from per-category analysis results.

    Insights are produced by fixed threshold rules applied to the analysis
    dictionary; the class itself performs no network or model calls.
    """

    @staticmethod
    def _section(analysis_data: Dict[str, Any], key: str) -> Dict[str, Any]:
        """Return the section stored under ``key``, or ``{}`` if it is absent or not a dict."""
        section = analysis_data.get(key)
        return section if isinstance(section, dict) else {}

    @staticmethod
    def _number(value: Any, default: float = 0) -> float:
        """Return ``value`` when it is an int/float, otherwise ``default``."""
        return value if isinstance(value, (int, float)) else default

    def generate_insights(self, analysis_data: Dict[str, Any], url: str) -> List[Dict[str, Any]]:
        """Generate insights based on analysis data.

        Args:
            analysis_data: Mapping of category name to that category's result
                dict. The categories read here are ``content_analysis``,
                ``technical_seo``, ``security_headers``, ``performance`` and
                ``url_structure``; any dict-valued entry may also contribute an
                ``issues`` collection to the overall issue count. Sections that
                are missing or not dicts are treated as empty, so the
                corresponding rule sees its default value.
            url: URL that was analyzed. Currently unused; kept for interface
                compatibility.

        Returns:
            A list of insight dicts with the keys ``type``, ``message``,
            ``priority``, ``action`` and ``description``, in rule order
            (overall issues, content, technical, security, performance, URL).
        """
        if not isinstance(analysis_data, dict):
            analysis_data = {}

        insights: List[Dict[str, Any]] = []

        # Analyze overall performance. ``or ()`` tolerates 'issues': None.
        total_issues = sum(
            len(section.get('issues') or ())
            for section in analysis_data.values()
            if isinstance(section, dict)
        )
        if total_issues > 5:
            insights.append({
                'type': 'critical',
                'message': f'High number of critical issues ({total_issues}) detected',
                'priority': 'high',
                'action': 'fix_critical_issues',
                'description': 'Multiple critical SEO issues need immediate attention to improve search rankings.'
            })

        # Content quality insights
        content_data = self._section(analysis_data, 'content_analysis')
        if self._number(content_data.get('word_count')) < 300:
            insights.append({
                'type': 'warning',
                'message': 'Content is too thin for good SEO',
                'priority': 'medium',
                'action': 'expand_content',
                'description': 'Add more valuable, relevant content to improve search rankings and user engagement.'
            })

        # Technical SEO insights
        technical_data = self._section(analysis_data, 'technical_seo')
        if not technical_data.get('has_canonical', False):
            insights.append({
                'type': 'critical',
                'message': 'Missing canonical URL can cause duplicate content issues',
                'priority': 'high',
                'action': 'add_canonical',
                'description': 'Canonical URLs help prevent duplicate content penalties.'
            })

        # Security insights
        security_data = self._section(analysis_data, 'security_headers')
        if self._number(security_data.get('total_headers')) < 3:
            insights.append({
                'type': 'warning',
                'message': 'Insufficient security headers',
                'priority': 'medium',
                'action': 'improve_security',
                'description': 'Security headers protect against common web vulnerabilities.'
            })

        # Performance insights
        performance_data = self._section(analysis_data, 'performance')
        if self._number(performance_data.get('load_time')) > 3:
            insights.append({
                'type': 'critical',
                'message': 'Page load time is too slow',
                'priority': 'high',
                'action': 'optimize_performance',
                'description': 'Slow loading pages negatively impact user experience and search rankings.'
            })

        # URL structure insights
        url_data = self._section(analysis_data, 'url_structure')
        if not url_data.get('has_https', False):
            insights.append({
                'type': 'critical',
                'message': 'Website is not using HTTPS',
                'priority': 'high',
                'action': 'enable_https',
                'description': 'HTTPS is required for security and is a ranking factor for search engines.'
            })

        return insights