Base code

This commit is contained in:
Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions

View File

@@ -0,0 +1,135 @@
"""
SEO Analyzer Utilities
Contains utility classes for HTML fetching and AI insight generation.
"""
import requests
from typing import Optional, Dict, List, Any
from loguru import logger
class HTMLFetcher:
"""Utility class for fetching HTML content from URLs"""
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})
def fetch_html(self, url: str) -> Optional[str]:
"""Fetch HTML content with retries and protocol fallback."""
def _try_fetch(target_url: str, timeout_s: int = 30) -> Optional[str]:
try:
response = self.session.get(
target_url,
timeout=timeout_s,
allow_redirects=True,
)
response.raise_for_status()
return response.text
except Exception as inner_e:
logger.error(f"Error fetching HTML from {target_url}: {inner_e}")
return None
# First attempt
html = _try_fetch(url, timeout_s=30)
if html is not None:
return html
# Retry once (shorter timeout)
html = _try_fetch(url, timeout_s=15)
if html is not None:
return html
# If https fails due to resets, try http fallback once
try:
if url.startswith("https://"):
http_url = "http://" + url[len("https://"):]
logger.info(f"SEO Analyzer: Falling back to HTTP for {http_url}")
html = _try_fetch(http_url, timeout_s=15)
if html is not None:
return html
except Exception:
# Best-effort fallback; errors already logged in _try_fetch
pass
return None
class AIInsightGenerator:
"""Utility class for generating AI-powered insights from analysis data"""
def generate_insights(self, analysis_data: Dict[str, Any], url: str) -> List[Dict[str, Any]]:
"""Generate AI-powered insights based on analysis data"""
insights = []
# Analyze overall performance
total_issues = sum(len(data.get('issues', [])) for data in analysis_data.values() if isinstance(data, dict))
total_warnings = sum(len(data.get('warnings', [])) for data in analysis_data.values() if isinstance(data, dict))
if total_issues > 5:
insights.append({
'type': 'critical',
'message': f'High number of critical issues ({total_issues}) detected',
'priority': 'high',
'action': 'fix_critical_issues',
'description': 'Multiple critical SEO issues need immediate attention to improve search rankings.'
})
# Content quality insights
content_data = analysis_data.get('content_analysis', {})
if content_data.get('word_count', 0) < 300:
insights.append({
'type': 'warning',
'message': 'Content is too thin for good SEO',
'priority': 'medium',
'action': 'expand_content',
'description': 'Add more valuable, relevant content to improve search rankings and user engagement.'
})
# Technical SEO insights
technical_data = analysis_data.get('technical_seo', {})
if not technical_data.get('has_canonical', False):
insights.append({
'type': 'critical',
'message': 'Missing canonical URL can cause duplicate content issues',
'priority': 'high',
'action': 'add_canonical',
'description': 'Canonical URLs help prevent duplicate content penalties.'
})
# Security insights
security_data = analysis_data.get('security_headers', {})
if security_data.get('total_headers', 0) < 3:
insights.append({
'type': 'warning',
'message': 'Insufficient security headers',
'priority': 'medium',
'action': 'improve_security',
'description': 'Security headers protect against common web vulnerabilities.'
})
# Performance insights
performance_data = analysis_data.get('performance', {})
if performance_data.get('load_time', 0) > 3:
insights.append({
'type': 'critical',
'message': 'Page load time is too slow',
'priority': 'high',
'action': 'optimize_performance',
'description': 'Slow loading pages negatively impact user experience and search rankings.'
})
# URL structure insights
url_data = analysis_data.get('url_structure', {})
if not url_data.get('has_https', False):
insights.append({
'type': 'critical',
'message': 'Website is not using HTTPS',
'priority': 'high',
'action': 'enable_https',
'description': 'HTTPS is required for security and is a ranking factor for search engines.'
})
return insights