Files
ALwrity/ToBeMigrated/ai_seo_tools/technical_seo_crawler/crawler.py
2025-08-06 16:29:49 +05:30

709 lines
31 KiB
Python

"""
Comprehensive Technical SEO Crawler using Advertools Integration.
This module provides advanced site-wide technical SEO analysis using:
- adv.crawl: Complete website crawling and analysis
- adv.crawl_headers: HTTP headers and server analysis
- adv.crawl_images: Image optimization analysis
- adv.url_to_df: URL structure optimization
- AI-powered technical recommendations
"""
import streamlit as st
import pandas as pd
import advertools as adv
from typing import Dict, Any, List, Optional, Tuple
from urllib.parse import urlparse, urljoin
import tempfile
import os
from datetime import datetime
import json
from collections import Counter, defaultdict
from loguru import logger
import numpy as np
# Import existing modules
from lib.gpt_providers.text_generation.main_text_generation import llm_text_gen
from lib.utils.website_analyzer.analyzer import WebsiteAnalyzer
class TechnicalSEOCrawler:
"""Comprehensive technical SEO crawler with advertools integration."""
def __init__(self):
    """Prepare per-instance scratch storage for crawl output files.

    A new temporary directory is created with ``tempfile.mkdtemp`` and kept
    on ``self.temp_dir``; the crawl methods write their ``.jl`` output files
    into it. This method does not arrange for the directory to be removed.
    """
    scratch_dir = tempfile.mkdtemp()
    self.temp_dir = scratch_dir
    logger.info("TechnicalSEOCrawler initialized")
def analyze_website_technical_seo(self, website_url: str, crawl_depth: int = 3,
                                  max_pages: int = 500) -> Dict[str, Any]:
    """
    Perform comprehensive technical SEO analysis.

    The site is crawled once, then eight follow-up phases (technical issues,
    performance, content structure, URL structure, images, security headers,
    mobile SEO and AI recommendations) run in that order, each inside its
    own Streamlit expander. Every phase stores its result under a dedicated
    key of the returned dictionary.

    Args:
        website_url: Website URL to analyze
        crawl_depth: How deep to crawl (1-5); the range is not validated here
        max_pages: Maximum pages to crawl (50-1000); the range is not
            validated here

    Returns:
        Comprehensive technical SEO analysis results. If an exception escapes
        any phase, ``{'error': <message>}`` is returned instead.
    """
    try:
        st.info("🚀 Starting Comprehensive Technical SEO Crawl...")

        # Initialize results structure
        results = {
            'analysis_timestamp': datetime.utcnow().isoformat(),
            'website_url': website_url,
            'crawl_settings': {
                'depth': crawl_depth,
                'max_pages': max_pages
            },
            'crawl_overview': {},
            'technical_issues': {},
            'performance_analysis': {},
            'content_analysis': {},
            'url_structure': {},
            'image_optimization': {},
            'security_headers': {},
            'mobile_seo': {},
            'structured_data': {},
            'ai_recommendations': {}
        }

        # Phase 1: Core Website Crawl (its success message depends on the result)
        with st.expander("🕷️ Website Crawling Progress", expanded=True):
            crawl_data = self._perform_comprehensive_crawl(website_url, crawl_depth, max_pages)
            results['crawl_overview'] = crawl_data
            st.success(f"✅ Crawled {crawl_data.get('pages_crawled', 0)} pages")

        # Phases 2-9: (expander title, results key, analysis callable, success message).
        # The callables are evaluated lazily inside the loop, so the final
        # phase sees every earlier phase's output in ``results``.
        phases = [
            ("🔍 Technical Issues Analysis", 'technical_issues',
             lambda: self._analyze_technical_issues(crawl_data),
             "✅ Identified technical SEO issues"),
            ("⚡ Performance Analysis", 'performance_analysis',
             lambda: self._analyze_performance_metrics(crawl_data),
             "✅ Analyzed website performance metrics"),
            ("📊 Content Structure Analysis", 'content_analysis',
             lambda: self._analyze_content_structure(crawl_data),
             "✅ Analyzed content structure and optimization"),
            ("🔗 URL Structure Analysis", 'url_structure',
             lambda: self._analyze_url_structure(crawl_data),
             "✅ Analyzed URL structure and patterns"),
            ("🖼️ Image SEO Analysis", 'image_optimization',
             lambda: self._analyze_image_seo(website_url),
             "✅ Analyzed image optimization"),
            ("🛡️ Security Headers Analysis", 'security_headers',
             lambda: self._analyze_security_headers(website_url),
             "✅ Analyzed security headers"),
            ("📱 Mobile SEO Analysis", 'mobile_seo',
             lambda: self._analyze_mobile_seo(crawl_data),
             "✅ Analyzed mobile SEO factors"),
            ("🤖 AI Technical Recommendations", 'ai_recommendations',
             lambda: self._generate_technical_recommendations(results),
             "✅ Generated AI-powered technical recommendations"),
        ]
        for title, result_key, run_phase, success_message in phases:
            with st.expander(title, expanded=True):
                results[result_key] = run_phase()
                st.success(success_message)

        return results
    except Exception as e:
        error_msg = f"Error in technical SEO analysis: {str(e)}"
        # loguru formats the message with any extra arguments, so the text is
        # passed as a format argument (braces in the exception text stay
        # literal) and the traceback is attached through opt(exception=True)
        # rather than the stdlib-only ``exc_info`` keyword.
        logger.opt(exception=True).error("{}", error_msg)
        st.error(error_msg)
        return {'error': error_msg}
def _perform_comprehensive_crawl(self, website_url: str, depth: int, max_pages: int) -> Dict[str, Any]:
    """Perform comprehensive website crawl using adv.crawl.

    Args:
        website_url: Start URL handed to ``adv.crawl``.
        depth: Value used for the ``DEPTH_LIMIT`` crawl setting.
        max_pages: Value used for the ``CLOSESPIDER_PAGECOUNT`` crawl setting.

    Returns:
        A dictionary with ``pages_crawled``, ``status_codes``,
        ``crawl_file_path``, ``crawl_dataframe``, ``domains_found``,
        ``avg_response_time`` and ``total_content_size``. An empty dictionary
        is returned when no crawl file is produced or any step raises.
    """
    try:
        st.info("🕷️ Crawling website for comprehensive analysis...")

        # Create crawl output file
        crawl_file = os.path.join(self.temp_dir, "technical_crawl.jl")
        # The file name is fixed per instance; drop output left by an earlier
        # run so this crawl's statistics are not mixed with older rows.
        if os.path.exists(crawl_file):
            os.remove(crawl_file)

        # Configure crawl settings for technical SEO
        custom_settings = {
            'DEPTH_LIMIT': depth,
            'CLOSESPIDER_PAGECOUNT': max_pages,
            'DOWNLOAD_DELAY': 0.5,  # Be respectful
            'CONCURRENT_REQUESTS': 8,
            'ROBOTSTXT_OBEY': True,
            'USER_AGENT': 'ALwrity-TechnicalSEO-Crawler/1.0',
            'COOKIES_ENABLED': False,
            'TELNETCONSOLE_ENABLED': False,
            'LOG_LEVEL': 'WARNING'
        }

        # Start crawl
        adv.crawl(
            url_list=[website_url],
            output_file=crawl_file,
            follow_links=True,
            custom_settings=custom_settings
        )

        if not os.path.exists(crawl_file):
            st.error("Crawl file not created")
            return {}

        # Read and process crawl results
        crawl_df = pd.read_json(crawl_file, lines=True)

        # Typed empty fallback for optional columns: avoids the object-dtype
        # default of a bare pd.Series() and still yields NaN mean / zero sum.
        no_values = pd.Series(dtype=float)

        # Basic crawl statistics
        return {
            'pages_crawled': len(crawl_df),
            'status_codes': crawl_df['status'].value_counts().to_dict(),
            'crawl_file_path': crawl_file,
            'crawl_dataframe': crawl_df,
            'domains_found': crawl_df['url'].apply(lambda x: urlparse(x).netloc).nunique(),
            'avg_response_time': crawl_df.get('download_latency', no_values).mean(),
            'total_content_size': crawl_df.get('size', no_values).sum()
        }
    except Exception as e:
        st.error(f"Error in website crawl: {str(e)}")
        return {}
def _analyze_technical_issues(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
    """Build the technical-issue report from the crawl DataFrame.

    Args:
        crawl_data: Crawl overview; the DataFrame is read from its
            ``'crawl_dataframe'`` entry.

    Returns:
        A dictionary with ``http_errors``, ``redirect_issues``,
        ``duplicate_content``, ``missing_elements``, ``page_speed_issues``
        and ``crawlability_issues`` (the last one is never filled here).
        An empty dictionary is returned when the DataFrame entry is missing
        or any step raises.
    """
    try:
        st.info("🔍 Detecting technical SEO issues...")
        if 'crawl_dataframe' not in crawl_data:
            return {}
        pages = crawl_data['crawl_dataframe']
        available = pages.columns

        def blank_count(column: str) -> int:
            # Rows whose value is null or an empty string; 0 if the column is absent.
            if column not in available:
                return 0
            values = pages[column]
            return len(pages[(values.isna()) | (values == '')])

        # Responses with status >= 400, and the redirect status codes.
        failed = pages[pages['status'] >= 400]
        redirected = pages[pages['status'].isin([301, 302, 303, 307, 308])]

        report = {
            'http_errors': {
                'total_errors': len(failed),
                'error_breakdown': failed['status'].value_counts().to_dict(),
                'error_pages': failed[['url', 'status']].head(50).to_dict('records')
            },
            'redirect_issues': {
                'total_redirects': len(redirected),
                'redirect_chains': self._find_redirect_chains(redirected),
                'redirect_types': redirected['status'].value_counts().to_dict()
            },
            'duplicate_content': {},
            'missing_elements': {},
            'page_speed_issues': {},
            'crawlability_issues': {}
        }

        # Titles that occur on more than one page.
        if 'title' in available:
            title_counts = pages['title'].value_counts()
            repeated = title_counts[title_counts > 1]
            sharing_title = pages[pages['title'].isin(repeated.index)]
            report['duplicate_content'] = {
                'duplicate_titles': len(repeated),
                'duplicate_title_groups': repeated.to_dict(),
                'pages_with_duplicate_titles': sharing_title[['url', 'title']].head(20).to_dict('records')
            }

        report['missing_elements'] = {
            'missing_titles': blank_count('title'),
            'missing_meta_desc': blank_count('meta_desc'),
            'missing_h1': blank_count('h1')
        }

        # Pages whose download latency exceeds three seconds.
        if 'download_latency' in available:
            latency = pages['download_latency']
            sluggish = pages[latency > 3.0]
            report['page_speed_issues'] = {
                'slow_pages_count': len(sluggish),
                'avg_load_time': latency.mean(),
                'slowest_pages': sluggish.nlargest(10, 'download_latency')[['url', 'download_latency']].to_dict('records')
            }

        return report
    except Exception as e:
        st.error(f"Error analyzing technical issues: {str(e)}")
        return {}
def _analyze_performance_metrics(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze website performance metrics.

    Args:
        crawl_data: Crawl overview; the DataFrame is read from its
            ``'crawl_dataframe'`` entry.

    Returns:
        A dictionary with ``load_time_analysis`` (from ``download_latency``),
        ``content_size_analysis`` (from ``size``), ``server_performance``
        (status-code rates in percent) and ``optimization_opportunities``
        (never filled here). An empty dictionary is returned when the
        DataFrame entry is missing or any step raises.
    """
    try:
        st.info("⚡ Analyzing performance metrics...")
        if 'crawl_dataframe' not in crawl_data:
            return {}
        df = crawl_data['crawl_dataframe']

        performance = {
            'load_time_analysis': {},
            'content_size_analysis': {},
            'server_performance': {},
            'optimization_opportunities': []
        }

        # Load Time Analysis
        if 'download_latency' in df.columns:
            load_times = df['download_latency'].dropna()
            pages_over_3s = len(load_times[load_times > 3])
            performance['load_time_analysis'] = {
                'avg_load_time': load_times.mean(),
                'median_load_time': load_times.median(),
                'p95_load_time': load_times.quantile(0.95),
                'fastest_page': load_times.min(),
                'slowest_page': load_times.max(),
                'pages_over_3s': pages_over_3s,
                'performance_distribution': {
                    'fast_pages': len(load_times[load_times <= 1]),
                    'moderate_pages': len(load_times[(load_times > 1) & (load_times <= 3)]),
                    'slow_pages': pages_over_3s
                }
            }

        # Content Size Analysis
        if 'size' in df.columns:
            sizes = df['size'].dropna()
            performance['content_size_analysis'] = {
                'avg_page_size': sizes.mean(),
                'median_page_size': sizes.median(),
                'largest_page': sizes.max(),
                'smallest_page': sizes.min(),
                'pages_over_1mb': len(sizes[sizes > 1048576]),  # 1MB
                'total_content_size': sizes.sum()
            }

        # Server Performance
        total_pages = len(df)
        if total_pages:
            status_codes = df['status'].value_counts()

            def share(codes) -> float:
                # Percentage of crawled pages whose status is one of ``codes``.
                return sum(status_codes.get(code, 0) for code in codes) / total_pages * 100

            performance['server_performance'] = {
                'success_rate': share([200]),
                'error_rate': share(range(400, 600)),
                'redirect_rate': share([301, 302, 303, 307, 308])
            }
        else:
            # No pages: report zero rates instead of dividing by zero, which
            # previously discarded the whole report through the except path.
            performance['server_performance'] = {
                'success_rate': 0.0,
                'error_rate': 0.0,
                'redirect_rate': 0.0
            }

        return performance
    except Exception as e:
        st.error(f"Error analyzing performance: {str(e)}")
        return {}
def _analyze_content_structure(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze content structure and SEO elements.

    Args:
        crawl_data: Crawl overview; the DataFrame is read from its
            ``'crawl_dataframe'`` entry.

    Returns:
        A dictionary with ``title_analysis``, ``meta_description_analysis``,
        ``heading_structure``, ``internal_linking`` and
        ``content_optimization`` (never filled here). Sections whose source
        column is absent stay empty. An empty dictionary is returned when
        the DataFrame entry is missing or any step raises.
    """
    try:
        st.info("📊 Analyzing content structure...")
        if 'crawl_dataframe' not in crawl_data:
            return {}
        df = crawl_data['crawl_dataframe']
        total_pages = len(df)

        content_analysis = {
            'title_analysis': {},
            'meta_description_analysis': {},
            'heading_structure': {},
            'internal_linking': {},
            'content_optimization': {}
        }

        def text_values(column: str) -> pd.Series:
            # Non-null values as strings. The cast keeps the ``.str``
            # accessor usable when pandas inferred a non-string dtype
            # (for example an all-null column read as float64).
            return df[column].dropna().astype(str)

        # Title Analysis
        if 'title' in df.columns:
            titles = text_values('title')
            title_lengths = titles.str.len()
            title_counts = titles.value_counts()
            content_analysis['title_analysis'] = {
                'avg_title_length': title_lengths.mean(),
                'title_length_distribution': {
                    'too_short': len(title_lengths[title_lengths < 30]),
                    'optimal': len(title_lengths[(title_lengths >= 30) & (title_lengths <= 60)]),
                    'too_long': len(title_lengths[title_lengths > 60])
                },
                'duplicate_titles': len(title_counts[title_counts > 1]),
                'missing_titles': total_pages - len(titles)
            }

        # Meta Description Analysis
        if 'meta_desc' in df.columns:
            meta_descs = text_values('meta_desc')
            meta_lengths = meta_descs.str.len()
            content_analysis['meta_description_analysis'] = {
                'avg_meta_length': meta_lengths.mean(),
                'meta_length_distribution': {
                    'too_short': len(meta_lengths[meta_lengths < 120]),
                    'optimal': len(meta_lengths[(meta_lengths >= 120) & (meta_lengths <= 160)]),
                    'too_long': len(meta_lengths[meta_lengths > 160])
                },
                'missing_meta_descriptions': total_pages - len(meta_descs)
            }

        # Heading Structure Analysis (columns named h1, h2, ...)
        heading_cols = [
            col for col in df.columns
            if isinstance(col, str) and col.startswith('h') and col[1:].isdigit()
        ]
        if heading_cols:
            heading_analysis = {}
            for col in heading_cols:
                headings = text_values(col)
                heading_analysis[f'{col}_usage'] = {
                    'pages_with_heading': len(headings),
                    'usage_rate': len(headings) / total_pages * 100 if total_pages else 0,
                    'avg_length': headings.str.len().mean() if len(headings) > 0 else 0
                }
            content_analysis['heading_structure'] = heading_analysis

        def count_internal_links(page_url, links) -> int:
            # Number of links that resolve to the same host as the page.
            # ``links`` may be a list or an '@@'-delimited string.
            # NOTE(review): '@@' is assumed to be the crawl output's
            # multi-value delimiter - confirm against the advertools docs.
            if isinstance(links, str):
                links = [link for link in links.split('@@') if link]
            elif not isinstance(links, list):
                return 0
            page_url = str(page_url)
            try:
                page_host = urlparse(page_url).netloc.lower()
            except ValueError:
                return 0
            same_host = 0
            for link in links:
                try:
                    link_host = urlparse(urljoin(page_url, str(link))).netloc.lower()
                except ValueError:
                    continue  # malformed link: skip it rather than fail the report
                if link_host == page_host:
                    same_host += 1
            return same_host

        # Internal Linking Analysis
        internal_links = None
        if 'links_internal' in df.columns:
            internal_links = df['links_internal'].apply(lambda x: len(x) if isinstance(x, list) else 0)
        elif 'links_url' in df.columns and 'url' in df.columns:
            # Fallback when only the raw link column is available: derive
            # the internal-link count by comparing hosts.
            internal_links = pd.Series(
                [count_internal_links(url, links) for url, links in zip(df['url'], df['links_url'])],
                index=df.index,
                dtype='int64'
            )
        if internal_links is not None:
            content_analysis['internal_linking'] = {
                'avg_internal_links': internal_links.mean(),
                'pages_with_no_internal_links': len(internal_links[internal_links == 0]),
                'max_internal_links': internal_links.max(),
                'internal_link_distribution': internal_links.describe().to_dict()
            }

        return content_analysis
    except Exception as e:
        st.error(f"Error analyzing content structure: {str(e)}")
        return {}
def _analyze_url_structure(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze URL structure and optimization using adv.url_to_df.

    Args:
        crawl_data: Crawl overview; the DataFrame is read from its
            ``'crawl_dataframe'`` entry and its ``url`` column is split
            with ``adv.url_to_df``.

    Returns:
        A dictionary with ``url_length_analysis``, ``url_structure_patterns``,
        ``url_optimization`` and ``path_analysis`` (left empty when the
        split URLs have no ``dir_N`` columns). An empty dictionary is
        returned when the DataFrame entry is missing or any step raises.
    """
    try:
        st.info("🔗 Analyzing URL structure...")
        if 'crawl_dataframe' not in crawl_data:
            return {}
        df = crawl_data['crawl_dataframe']
        urls = df['url'].tolist()

        # Use advertools to analyze URL structure
        url_df = adv.url_to_df(urls)
        total_urls = len(url_df)

        url_analysis = {
            'url_length_analysis': {},
            'url_structure_patterns': {},
            'url_optimization': {},
            'path_analysis': {}
        }

        # URL Length Analysis
        url_lengths = url_df['url'].str.len()
        long_urls = len(url_lengths[url_lengths > 100])
        url_analysis['url_length_analysis'] = {
            'avg_url_length': url_lengths.mean(),
            'max_url_length': url_lengths.max(),
            'long_urls_count': long_urls,
            'url_length_distribution': url_lengths.describe().to_dict()
        }

        # Path Depth Analysis: count the populated dir_N columns per URL.
        # Every dir_N column is used, so very deep paths are not capped.
        dir_cols = [
            col for col in url_df.columns
            if str(col).startswith('dir_') and str(col)[4:].isdigit()
        ]
        if dir_cols:
            path_depths = url_df[dir_cols].notna().sum(axis=1)
            url_analysis['path_analysis'] = {
                'avg_path_depth': path_depths.mean(),
                'max_path_depth': path_depths.max(),
                'deep_paths_count': len(path_depths[path_depths > 4]),
                'path_depth_distribution': path_depths.value_counts().to_dict()
            }

        # URL Structure Patterns
        domains = url_df['netloc'].value_counts()
        schemes = url_df['scheme'].value_counts()

        # Subdomain heuristic: a host counts as a subdomain when it ends with
        # "." + another crawled host (port and leading "www." ignored). A
        # plain "contains a dot" test would match every ordinary domain.
        hosts = (
            url_df['netloc'].fillna('').astype(str).str.lower()
            .str.replace(r':\d+$', '', regex=True)
            .str.replace(r'^www\.', '', regex=True)
        )
        known_hosts = set(hosts) - {''}
        subdomain_hosts = {
            host for host in known_hosts
            if any(host != base and host.endswith('.' + base) for base in known_hosts)
        }

        url_analysis['url_structure_patterns'] = {
            'domains_found': domains.to_dict(),
            'schemes_used': schemes.to_dict(),
            'subdomain_usage': int(hosts.isin(subdomain_hosts).sum()),
            'https_usage': schemes.get('https', 0) / total_urls * 100 if total_urls else 0.0
        }

        # URL Optimization Issues
        optimization_issues = []

        # Check for non-HTTPS URLs
        if schemes.get('http', 0) > 0:
            optimization_issues.append(f"{schemes.get('http', 0)} pages not using HTTPS")

        # Check for long URLs
        if long_urls > 0:
            optimization_issues.append(f"{long_urls} URLs are too long (>100 characters)")

        # Check for deep paths. 'path_analysis' is always present but may be
        # empty, so read the count with a default instead of indexing.
        deep_paths = url_analysis['path_analysis'].get('deep_paths_count', 0)
        if deep_paths > 0:
            optimization_issues.append(f"{deep_paths} URLs have deep path structures (>4 levels)")

        url_analysis['url_optimization'] = {
            'issues_found': len(optimization_issues),
            'optimization_recommendations': optimization_issues
        }

        return url_analysis
    except Exception as e:
        st.error(f"Error analyzing URL structure: {str(e)}")
        return {}
def _analyze_image_seo(self, website_url: str) -> Dict[str, Any]:
    """Analyze image SEO using adv.crawl_images.

    Args:
        website_url: Start URL handed to ``adv.crawl_images``.

    Returns:
        A dictionary with ``image_count``, ``alt_text_analysis`` (from the
        ``img_alt`` column), ``image_format_analysis`` (from the ``img_src``
        column) plus ``image_size_analysis`` and
        ``optimization_opportunities``, which are never filled here. The
        defaults are returned unchanged when no output file exists; an empty
        dictionary is returned when any step raises.
    """
    try:
        st.info("🖼️ Analyzing image SEO...")

        # Create image crawl output file
        image_file = os.path.join(self.temp_dir, "image_crawl.jl")
        # The file name is fixed per instance; remove output from an earlier
        # run so stale rows are never analysed as if they were new.
        if os.path.exists(image_file):
            os.remove(image_file)

        # Crawl images
        # NOTE(review): confirm that the installed advertools version accepts
        # these keyword names for crawl_images; a mismatch raises TypeError,
        # which the except below turns into an empty result.
        adv.crawl_images(
            url_list=[website_url],
            output_file=image_file,
            custom_settings={
                'DEPTH_LIMIT': 2,
                'CLOSESPIDER_PAGECOUNT': 100,
                'DOWNLOAD_DELAY': 1
            }
        )

        image_analysis = {
            'image_count': 0,
            'alt_text_analysis': {},
            'image_format_analysis': {},
            'image_size_analysis': {},
            'optimization_opportunities': []
        }

        if not os.path.exists(image_file):
            return image_analysis

        image_df = pd.read_json(image_file, lines=True)
        total_images = len(image_df)
        image_analysis['image_count'] = total_images

        # Alt text analysis
        if 'img_alt' in image_df.columns:
            # Cast so ``.str`` works even if pandas inferred a non-string dtype.
            alt_texts = image_df['img_alt'].dropna().astype(str)
            image_analysis['alt_text_analysis'] = {
                'images_with_alt': len(alt_texts),
                'images_missing_alt': total_images - len(alt_texts),
                'alt_text_coverage': len(alt_texts) / total_images * 100 if total_images else 0.0,
                'avg_alt_length': alt_texts.str.len().mean() if len(alt_texts) > 0 else 0
            }

        # Image format analysis
        if 'img_src' in image_df.columns:
            # Extract file extensions (2-4 letters before a query string or
            # the end of the value) and fold case so ".JPG" and ".jpg" are
            # counted as one format and ".WEBP" is recognised as modern.
            sources = image_df['img_src'].dropna().astype(str)
            extensions = sources.str.extract(r'\.([a-zA-Z]{2,4})(?:\?|$)')
            format_counts = extensions[0].str.lower().value_counts()
            image_analysis['image_format_analysis'] = {
                'format_distribution': format_counts.to_dict(),
                'modern_format_usage': int(format_counts.get('webp', 0) + format_counts.get('avif', 0))
            }

        return image_analysis
    except Exception as e:
        st.error(f"Error analyzing images: {str(e)}")
        return {}
def _analyze_security_headers(self, website_url: str) -> Dict[str, Any]:
    """Analyze security headers using adv.crawl_headers.

    Args:
        website_url: URL whose response headers are fetched.

    Returns:
        A dictionary with ``security_headers_present`` (header name ->
        bool), ``security_score`` (percentage of the six checked headers
        that are present) and ``security_recommendations`` (one message per
        missing header). The defaults are returned unchanged when no output
        file exists; an empty dictionary is returned when any step raises.
    """
    try:
        st.info("🛡️ Analyzing security headers...")

        # Create headers output file
        headers_file = os.path.join(self.temp_dir, "security_headers.jl")
        # The file name is fixed per instance; remove output from an earlier
        # run so only this request's headers are evaluated.
        if os.path.exists(headers_file):
            os.remove(headers_file)

        # Crawl headers
        adv.crawl_headers([website_url], output_file=headers_file)

        security_analysis = {
            'security_headers_present': {},
            'security_score': 0,
            'security_recommendations': []
        }

        if not os.path.exists(headers_file):
            return security_analysis

        headers_df = pd.read_json(headers_file, lines=True)

        # Important security headers to look for
        security_headers = (
            'X-Frame-Options',
            'X-Content-Type-Options',
            'X-XSS-Protection',
            'Strict-Transport-Security',
            'Content-Security-Policy',
            'Referrer-Policy'
        )

        # HTTP header names are case-insensitive and the crawler may change
        # their capitalisation (e.g. "X-Xss-Protection"), so response-header
        # columns are matched without regard to case.
        columns_by_lower = {str(col).lower(): col for col in headers_df.columns}

        headers_present = {}
        for header_name in security_headers:
            column = columns_by_lower.get(f'resp_headers_{header_name}'.lower())
            headers_present[header_name] = (
                column is not None and bool(headers_df[column].notna().any())
            )
        security_analysis['security_headers_present'] = headers_present

        # Calculate security score
        present_count = sum(headers_present.values())
        security_analysis['security_score'] = (present_count / len(security_headers)) * 100

        # Generate recommendations
        security_analysis['security_recommendations'] = [
            f"Add {header_name} header for improved security"
            for header_name, is_present in headers_present.items()
            if not is_present
        ]

        return security_analysis
    except Exception as e:
        st.error(f"Error analyzing security headers: {str(e)}")
        return {}
def _analyze_mobile_seo(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
    """Report mobile-related signals found in the crawl DataFrame.

    Args:
        crawl_data: Crawl overview; the DataFrame is read from its
            ``'crawl_dataframe'`` entry.

    Returns:
        A dictionary with ``viewport_analysis`` (filled only when a
        ``viewport`` column exists), ``mobile_optimization`` (indicators
        detected from column names) and ``responsive_design_indicators``
        (never filled here). An empty dictionary is returned when the
        DataFrame entry is missing or any step raises.
    """
    try:
        st.info("📱 Analyzing mobile SEO factors...")
        if 'crawl_dataframe' not in crawl_data:
            return {}
        pages = crawl_data['crawl_dataframe']

        # Viewport meta tag coverage across crawled pages.
        viewport_report = {}
        if 'viewport' in pages.columns:
            page_total = len(pages)
            with_viewport = pages['viewport'].notna().sum()
            viewport_report = {
                'pages_with_viewport': with_viewport,
                'viewport_coverage': with_viewport / page_total * 100,
                'pages_missing_viewport': page_total - with_viewport
            }

        # Simplified check: a column name containing 'touch-icon' is taken as
        # evidence of configured touch icons. CSS and page structure are not
        # inspected.
        indicators = []
        has_touch_icon_column = any('touch-icon' in col for col in pages.columns)
        if has_touch_icon_column:
            indicators.append("Touch icons configured")

        return {
            'viewport_analysis': viewport_report,
            'mobile_optimization': {
                'mobile_indicators_found': len(indicators),
                'mobile_indicators': indicators
            },
            'responsive_design_indicators': {}
        }
    except Exception as e:
        st.error(f"Error analyzing mobile SEO: {str(e)}")
        return {}
def _generate_technical_recommendations(self, results: Dict[str, Any]) -> Dict[str, Any]:
    """Ask the LLM for prioritized recommendations based on the audit.

    A handful of headline figures are pulled from ``results`` (missing
    entries fall back to ``''`` for the URL and ``0`` for numbers), placed
    into a prompt and sent through ``llm_text_gen``.

    Args:
        results: The analysis results collected so far.

    Returns:
        Whatever ``llm_text_gen`` returns when it is truthy; otherwise a
        placeholder dictionary. An empty dictionary is returned when any
        step raises.
    """
    try:
        st.info("🤖 Generating technical recommendations...")

        def metric(section: str, group: str, field: str):
            # Three-level lookup that yields 0 when any level is missing.
            return results.get(section, {}).get(group, {}).get(field, 0)

        # Headline figures for the prompt
        site = results.get('website_url', '')
        page_total = results.get('crawl_overview', {}).get('pages_crawled', 0)
        http_errors = metric('technical_issues', 'http_errors', 'total_errors')
        mean_load = metric('performance_analysis', 'load_time_analysis', 'avg_load_time')
        header_score = results.get('security_headers', {}).get('security_score', 0)
        titles_missing = metric('content_analysis', 'title_analysis', 'missing_titles')
        descriptions_missing = metric('content_analysis', 'meta_description_analysis', 'missing_meta_descriptions')

        # Prompt text is kept flush left because every character inside the
        # triple quotes is part of what is sent to the model.
        prompt = f"""
As a technical SEO expert, analyze this comprehensive website audit and provide prioritized recommendations:
WEBSITE: {site}
PAGES ANALYZED: {page_total}
TECHNICAL ISSUES:
- HTTP Errors: {http_errors}
- Average Load Time: {mean_load:.2f}s
- Security Score: {header_score:.1f}%
- Missing Titles: {titles_missing}
- Missing Meta Descriptions: {descriptions_missing}
PROVIDE:
1. Critical Issues (Fix Immediately)
2. High Priority Optimizations
3. Medium Priority Improvements
4. Long-term Technical Strategy
5. Specific Implementation Steps
6. Expected Impact Assessment
Format as JSON with clear priorities and actionable recommendations.
"""

        ai_response = llm_text_gen(
            prompt=prompt,
            system_prompt="You are a senior technical SEO specialist with expertise in website optimization, Core Web Vitals, and search engine best practices.",
            response_format="json_object"
        )

        # Fall back to a placeholder when the model returned nothing usable.
        return ai_response or {'recommendations': ['AI recommendations temporarily unavailable']}
    except Exception as e:
        st.error(f"Error generating recommendations: {str(e)}")
        return {}
def _find_redirect_chains(self, redirects_df: pd.DataFrame) -> List[Dict[str, Any]]:
"""Find redirect chains in the crawled data."""
# Simplified redirect chain detection
# In a full implementation, you'd trace the redirect paths
redirect_chains = []
if len(redirects_df) > 0:
# Group redirects by status code
for status_code in redirects_df['status'].unique():
status_redirects = redirects_df[redirects_df['status'] == status_code]
redirect_chains.append({
'status_code': int(status_code),
'count': len(status_redirects),
'examples': status_redirects['url'].head(5).tolist()
})
return redirect_chains