ALwrity/ToBeMigrated/ai_seo_tools/technical_seo_crawler/crawler.py

"""
Comprehensive Technical SEO Crawler using Advertools Integration.

This module provides advanced site-wide technical SEO analysis using:
- adv.crawl: Complete website crawling and analysis
- adv.crawl_headers: HTTP headers and server analysis
- adv.crawl_images: Image optimization analysis
- adv.url_to_df: URL structure optimization
- AI-powered technical recommendations
"""

import streamlit as st
import pandas as pd
import advertools as adv
from typing import Dict, Any, List, Optional, Tuple
from urllib.parse import urlparse, urljoin
import tempfile
import os
from datetime import datetime
import json
from collections import Counter, defaultdict
from loguru import logger
import numpy as np

# Import existing modules
from lib.gpt_providers.text_generation.main_text_generation import llm_text_gen
from lib.utils.website_analyzer.analyzer import WebsiteAnalyzer

class TechnicalSEOCrawler:
    """Comprehensive technical SEO crawler with advertools integration."""

    def __init__(self):
        """Initialize the technical SEO crawler."""
        self.temp_dir = tempfile.mkdtemp()
        logger.info("TechnicalSEOCrawler initialized")

    def analyze_website_technical_seo(self, website_url: str, crawl_depth: int = 3,
                                    max_pages: int = 500) -> Dict[str, Any]:
        """
        Perform comprehensive technical SEO analysis.

        Args:
            website_url: Website URL to analyze
            crawl_depth: How deep to crawl (1-5)
            max_pages: Maximum pages to crawl (50-1000)

        Returns:
            Comprehensive technical SEO analysis results
        """
        try:
            st.info("🚀 Starting Comprehensive Technical SEO Crawl...")

            # Initialize results structure
            results = {
                'analysis_timestamp': datetime.utcnow().isoformat(),
                'website_url': website_url,
                'crawl_settings': {
                    'depth': crawl_depth,
                    'max_pages': max_pages
                },
                'crawl_overview': {},
                'technical_issues': {},
                'performance_analysis': {},
                'content_analysis': {},
                'url_structure': {},
                'image_optimization': {},
                'security_headers': {},
                'mobile_seo': {},
                'structured_data': {},
                'ai_recommendations': {}
            }

            # Phase 1: Core Website Crawl
            with st.expander("🕷️ Website Crawling Progress", expanded=True):
                crawl_data = self._perform_comprehensive_crawl(website_url, crawl_depth, max_pages)
                results['crawl_overview'] = crawl_data
                st.success(f"✅ Crawled {crawl_data.get('pages_crawled', 0)} pages")

            # Phase 2: Technical Issues Detection
            with st.expander("🔍 Technical Issues Analysis", expanded=True):
                technical_issues = self._analyze_technical_issues(crawl_data)
                results['technical_issues'] = technical_issues
                st.success("✅ Identified technical SEO issues")

            # Phase 3: Performance Analysis
            with st.expander("⚡ Performance Analysis", expanded=True):
                performance = self._analyze_performance_metrics(crawl_data)
                results['performance_analysis'] = performance
                st.success("✅ Analyzed website performance metrics")

            # Phase 4: Content & Structure Analysis
            with st.expander("📊 Content Structure Analysis", expanded=True):
                content_analysis = self._analyze_content_structure(crawl_data)
                results['content_analysis'] = content_analysis
                st.success("✅ Analyzed content structure and optimization")

            # Phase 5: URL Structure Optimization
            with st.expander("🔗 URL Structure Analysis", expanded=True):
                url_analysis = self._analyze_url_structure(crawl_data)
                results['url_structure'] = url_analysis
                st.success("✅ Analyzed URL structure and patterns")

            # Phase 6: Image SEO Analysis
            with st.expander("🖼️ Image SEO Analysis", expanded=True):
                image_analysis = self._analyze_image_seo(website_url)
                results['image_optimization'] = image_analysis
                st.success("✅ Analyzed image optimization")

            # Phase 7: Security & Headers Analysis
            with st.expander("🛡️ Security Headers Analysis", expanded=True):
                security_analysis = self._analyze_security_headers(website_url)
                results['security_headers'] = security_analysis
                st.success("✅ Analyzed security headers")

            # Phase 8: Mobile SEO Analysis
            with st.expander("📱 Mobile SEO Analysis", expanded=True):
                mobile_analysis = self._analyze_mobile_seo(crawl_data)
                results['mobile_seo'] = mobile_analysis
                st.success("✅ Analyzed mobile SEO factors")

            # Phase 9: AI-Powered Recommendations
            with st.expander("🤖 AI Technical Recommendations", expanded=True):
                ai_recommendations = self._generate_technical_recommendations(results)
                results['ai_recommendations'] = ai_recommendations
                st.success("✅ Generated AI-powered technical recommendations")

            return results

        except Exception as e:
            error_msg = f"Error in technical SEO analysis: {str(e)}"
            logger.error(error_msg, exc_info=True)
            st.error(error_msg)
            return {'error': error_msg}

    def _perform_comprehensive_crawl(self, website_url: str, depth: int, max_pages: int) -> Dict[str, Any]:
        """Perform comprehensive website crawl using adv.crawl."""
        try:
            st.info("🕷️ Crawling website for comprehensive analysis...")

            # Create crawl output file
            crawl_file = os.path.join(self.temp_dir, "technical_crawl.jl")

            # Configure crawl settings for technical SEO
            custom_settings = {
                'DEPTH_LIMIT': depth,
                'CLOSESPIDER_PAGECOUNT': max_pages,
                'DOWNLOAD_DELAY': 0.5,  # Be respectful
                'CONCURRENT_REQUESTS': 8,
                'ROBOTSTXT_OBEY': True,
                'USER_AGENT': 'ALwrity-TechnicalSEO-Crawler/1.0',
                'COOKIES_ENABLED': False,
                'TELNETCONSOLE_ENABLED': False,
                'LOG_LEVEL': 'WARNING'
            }

            # Start crawl
            adv.crawl(
                url_list=[website_url],
                output_file=crawl_file,
                follow_links=True,
                custom_settings=custom_settings
            )

            # Read and process crawl results
            if os.path.exists(crawl_file):
                crawl_df = pd.read_json(crawl_file, lines=True)

                # Basic crawl statistics
                crawl_overview = {
                    'pages_crawled': len(crawl_df),
                    'status_codes': crawl_df['status'].value_counts().to_dict(),
                    'crawl_file_path': crawl_file,
                    'crawl_dataframe': crawl_df,
                    'domains_found': crawl_df['url'].apply(lambda x: urlparse(x).netloc).nunique(),
                    'avg_response_time': crawl_df.get('download_latency', pd.Series()).mean(),
                    'total_content_size': crawl_df.get('size', pd.Series()).sum()
                }

                return crawl_overview
            else:
                st.error("Crawl file not created")
                return {}

        except Exception as e:
            st.error(f"Error in website crawl: {str(e)}")
            return {}

    def _analyze_technical_issues(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze technical SEO issues from crawl data."""
        try:
            st.info("🔍 Detecting technical SEO issues...")

            if 'crawl_dataframe' not in crawl_data:
                return {}

            df = crawl_data['crawl_dataframe']

            technical_issues = {
                'http_errors': {},
                'redirect_issues': {},
                'duplicate_content': {},
                'missing_elements': {},
                'page_speed_issues': {},
                'crawlability_issues': {}
            }

            # HTTP Status Code Issues
            error_codes = df[df['status'] >= 400]['status'].value_counts().to_dict()
            technical_issues['http_errors'] = {
                'total_errors': len(df[df['status'] >= 400]),
                'error_breakdown': error_codes,
                'error_pages': df[df['status'] >= 400][['url', 'status']].to_dict('records')[:50]
            }

            # Redirect Analysis
            redirects = df[df['status'].isin([301, 302, 303, 307, 308])]
            technical_issues['redirect_issues'] = {
                'total_redirects': len(redirects),
                'redirect_chains': self._find_redirect_chains(redirects),
                'redirect_types': redirects['status'].value_counts().to_dict()
            }

            # Duplicate Content Detection
            if 'title' in df.columns:
                duplicate_titles = df['title'].value_counts()
                duplicate_titles = duplicate_titles[duplicate_titles > 1]

                technical_issues['duplicate_content'] = {
                    'duplicate_titles': len(duplicate_titles),
                    'duplicate_title_groups': duplicate_titles.to_dict(),
                    'pages_with_duplicate_titles': df[df['title'].isin(duplicate_titles.index)][['url', 'title']].to_dict('records')[:20]
                }

            # Missing Elements Analysis
            missing_elements = {
                'missing_titles': len(df[(df['title'].isna()) | (df['title'] == '')]) if 'title' in df.columns else 0,
                'missing_meta_desc': len(df[(df['meta_desc'].isna()) | (df['meta_desc'] == '')]) if 'meta_desc' in df.columns else 0,
                'missing_h1': len(df[(df['h1'].isna()) | (df['h1'] == '')]) if 'h1' in df.columns else 0
            }
            technical_issues['missing_elements'] = missing_elements

            # Page Speed Issues
            if 'download_latency' in df.columns:
                slow_pages = df[df['download_latency'] > 3.0]  # Pages taking >3s
                technical_issues['page_speed_issues'] = {
                    'slow_pages_count': len(slow_pages),
                    'avg_load_time': df['download_latency'].mean(),
                    'slowest_pages': slow_pages.nlargest(10, 'download_latency')[['url', 'download_latency']].to_dict('records')
                }

            return technical_issues

        except Exception as e:
            st.error(f"Error analyzing technical issues: {str(e)}")
            return {}

    def _analyze_performance_metrics(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze website performance metrics."""
        try:
            st.info("⚡ Analyzing performance metrics...")

            if 'crawl_dataframe' not in crawl_data:
                return {}

            df = crawl_data['crawl_dataframe']

            performance = {
                'load_time_analysis': {},
                'content_size_analysis': {},
                'server_performance': {},
                'optimization_opportunities': []
            }

            # Load Time Analysis
            if 'download_latency' in df.columns:
                load_times = df['download_latency'].dropna()
                performance['load_time_analysis'] = {
                    'avg_load_time': load_times.mean(),
                    'median_load_time': load_times.median(),
                    'p95_load_time': load_times.quantile(0.95),
                    'fastest_page': load_times.min(),
                    'slowest_page': load_times.max(),
                    'pages_over_3s': len(load_times[load_times > 3]),
                    'performance_distribution': {
                        'fast_pages': len(load_times[load_times <= 1]),
                        'moderate_pages': len(load_times[(load_times > 1) & (load_times <= 3)]),
                        'slow_pages': len(load_times[load_times > 3])
                    }
                }

            # Content Size Analysis
            if 'size' in df.columns:
                sizes = df['size'].dropna()
                performance['content_size_analysis'] = {
                    'avg_page_size': sizes.mean(),
                    'median_page_size': sizes.median(),
                    'largest_page': sizes.max(),
                    'smallest_page': sizes.min(),
                    'pages_over_1mb': len(sizes[sizes > 1048576]),  # 1MB
                    'total_content_size': sizes.sum()
                }

            # Server Performance
            status_codes = df['status'].value_counts()
            total_pages = len(df)
            performance['server_performance'] = {
                'success_rate': status_codes.get(200, 0) / total_pages * 100,
                'error_rate': sum(status_codes.get(code, 0) for code in range(400, 600)) / total_pages * 100,
                'redirect_rate': sum(status_codes.get(code, 0) for code in [301, 302, 303, 307, 308]) / total_pages * 100
            }

            return performance

        except Exception as e:
            st.error(f"Error analyzing performance: {str(e)}")
            return {}

    def _analyze_content_structure(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze content structure and SEO elements."""
        try:
            st.info("📊 Analyzing content structure...")

            if 'crawl_dataframe' not in crawl_data:
                return {}

            df = crawl_data['crawl_dataframe']

            content_analysis = {
                'title_analysis': {},
                'meta_description_analysis': {},
                'heading_structure': {},
                'internal_linking': {},
                'content_optimization': {}
            }

            # Title Analysis
            if 'title' in df.columns:
                titles = df['title'].dropna()
                title_lengths = titles.str.len()

                content_analysis['title_analysis'] = {
                    'avg_title_length': title_lengths.mean(),
                    'title_length_distribution': {
                        'too_short': len(title_lengths[title_lengths < 30]),
                        'optimal': len(title_lengths[(title_lengths >= 30) & (title_lengths <= 60)]),
                        'too_long': len(title_lengths[title_lengths > 60])
                    },
                    'duplicate_titles': len(titles.value_counts()[titles.value_counts() > 1]),
                    'missing_titles': len(df) - len(titles)
                }

            # Meta Description Analysis
            if 'meta_desc' in df.columns:
                meta_descs = df['meta_desc'].dropna()
                meta_lengths = meta_descs.str.len()

                content_analysis['meta_description_analysis'] = {
                    'avg_meta_length': meta_lengths.mean(),
                    'meta_length_distribution': {
                        'too_short': len(meta_lengths[meta_lengths < 120]),
                        'optimal': len(meta_lengths[(meta_lengths >= 120) & (meta_lengths <= 160)]),
                        'too_long': len(meta_lengths[meta_lengths > 160])
                    },
                    'missing_meta_descriptions': len(df) - len(meta_descs)
                }

            # Heading Structure Analysis
            heading_cols = [col for col in df.columns if col.startswith('h') and col[1:].isdigit()]
            if heading_cols:
                heading_analysis = {}
                for col in heading_cols:
                    headings = df[col].dropna()
                    heading_analysis[f'{col}_usage'] = {
                        'pages_with_heading': len(headings),
                        'usage_rate': len(headings) / len(df) * 100,
                        'avg_length': headings.str.len().mean() if len(headings) > 0 else 0
                    }
                content_analysis['heading_structure'] = heading_analysis

            # Internal Linking Analysis
            if 'links_internal' in df.columns:
                internal_links = df['links_internal'].apply(lambda x: len(x) if isinstance(x, list) else 0)
                content_analysis['internal_linking'] = {
                    'avg_internal_links': internal_links.mean(),
                    'pages_with_no_internal_links': len(internal_links[internal_links == 0]),
                    'max_internal_links': internal_links.max(),
                    'internal_link_distribution': internal_links.describe().to_dict()
                }

            return content_analysis

        except Exception as e:
            st.error(f"Error analyzing content structure: {str(e)}")
            return {}

    def _analyze_url_structure(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze URL structure and optimization using adv.url_to_df."""
        try:
            st.info("🔗 Analyzing URL structure...")

            if 'crawl_dataframe' not in crawl_data:
                return {}

            df = crawl_data['crawl_dataframe']
            urls = df['url'].tolist()

            # Use advertools to analyze URL structure
            url_df = adv.url_to_df(urls)

            url_analysis = {
                'url_length_analysis': {},
                'url_structure_patterns': {},
                'url_optimization': {},
                'path_analysis': {}
            }

            # URL Length Analysis
            url_lengths = url_df['url'].str.len()
            url_analysis['url_length_analysis'] = {
                'avg_url_length': url_lengths.mean(),
                'max_url_length': url_lengths.max(),
                'long_urls_count': len(url_lengths[url_lengths > 100]),
                'url_length_distribution': url_lengths.describe().to_dict()
            }

            # Path Depth Analysis
            if 'dir_1' in url_df.columns:
                path_depths = url_df.apply(lambda row: sum(1 for i in range(1, 10) if f'dir_{i}' in row and pd.notna(row[f'dir_{i}'])), axis=1)
                url_analysis['path_analysis'] = {
                    'avg_path_depth': path_depths.mean(),
                    'max_path_depth': path_depths.max(),
                    'deep_paths_count': len(path_depths[path_depths > 4]),
                    'path_depth_distribution': path_depths.value_counts().to_dict()
                }

            # URL Structure Patterns
            domains = url_df['netloc'].value_counts()
            schemes = url_df['scheme'].value_counts()

            url_analysis['url_structure_patterns'] = {
                'domains_found': domains.to_dict(),
                'schemes_used': schemes.to_dict(),
                'subdomain_usage': len(url_df[url_df['netloc'].str.contains('\.', regex=True)]),
                'https_usage': schemes.get('https', 0) / len(url_df) * 100
            }

            # URL Optimization Issues
            optimization_issues = []

            # Check for non-HTTPS URLs
            if schemes.get('http', 0) > 0:
                optimization_issues.append(f"{schemes.get('http', 0)} pages not using HTTPS")

            # Check for long URLs
            long_urls = len(url_lengths[url_lengths > 100])
            if long_urls > 0:
                optimization_issues.append(f"{long_urls} URLs are too long (>100 characters)")

            # Check for deep paths
            if 'path_analysis' in url_analysis:
                deep_paths = url_analysis['path_analysis']['deep_paths_count']
                if deep_paths > 0:
                    optimization_issues.append(f"{deep_paths} URLs have deep path structures (>4 levels)")

            url_analysis['url_optimization'] = {
                'issues_found': len(optimization_issues),
                'optimization_recommendations': optimization_issues
            }

            return url_analysis

        except Exception as e:
            st.error(f"Error analyzing URL structure: {str(e)}")
            return {}

    def _analyze_image_seo(self, website_url: str) -> Dict[str, Any]:
        """Analyze image SEO using adv.crawl_images."""
        try:
            st.info("🖼️ Analyzing image SEO...")

            # Create image crawl output file
            image_file = os.path.join(self.temp_dir, "image_crawl.jl")

            # Crawl images
            adv.crawl_images(
                url_list=[website_url],
                output_file=image_file,
                custom_settings={
                    'DEPTH_LIMIT': 2,
                    'CLOSESPIDER_PAGECOUNT': 100,
                    'DOWNLOAD_DELAY': 1
                }
            )

            image_analysis = {
                'image_count': 0,
                'alt_text_analysis': {},
                'image_format_analysis': {},
                'image_size_analysis': {},
                'optimization_opportunities': []
            }

            if os.path.exists(image_file):
                image_df = pd.read_json(image_file, lines=True)

                image_analysis['image_count'] = len(image_df)

                # Alt text analysis
                if 'img_alt' in image_df.columns:
                    alt_texts = image_df['img_alt'].dropna()
                    missing_alt = len(image_df) - len(alt_texts)

                    image_analysis['alt_text_analysis'] = {
                        'images_with_alt': len(alt_texts),
                        'images_missing_alt': missing_alt,
                        'alt_text_coverage': len(alt_texts) / len(image_df) * 100,
                        'avg_alt_length': alt_texts.str.len().mean() if len(alt_texts) > 0 else 0
                    }

                # Image format analysis
                if 'img_src' in image_df.columns:
                    # Extract file extensions
                    extensions = image_df['img_src'].str.extract(r'\.([a-zA-Z]{2,4})(?:\?|$)')
                    format_counts = extensions[0].value_counts()

                    image_analysis['image_format_analysis'] = {
                        'format_distribution': format_counts.to_dict(),
                        'modern_format_usage': format_counts.get('webp', 0) + format_counts.get('avif', 0)
                    }

            return image_analysis

        except Exception as e:
            st.error(f"Error analyzing images: {str(e)}")
            return {}

    def _analyze_security_headers(self, website_url: str) -> Dict[str, Any]:
        """Analyze security headers using adv.crawl_headers."""
        try:
            st.info("🛡️ Analyzing security headers...")

            # Create headers output file
            headers_file = os.path.join(self.temp_dir, "security_headers.jl")

            # Crawl headers
            adv.crawl_headers([website_url], output_file=headers_file)

            security_analysis = {
                'security_headers_present': {},
                'security_score': 0,
                'security_recommendations': []
            }

            if os.path.exists(headers_file):
                headers_df = pd.read_json(headers_file, lines=True)

                # Check for important security headers
                security_headers = {
                    'X-Frame-Options': 'resp_headers_X-Frame-Options',
                    'X-Content-Type-Options': 'resp_headers_X-Content-Type-Options',
                    'X-XSS-Protection': 'resp_headers_X-XSS-Protection',
                    'Strict-Transport-Security': 'resp_headers_Strict-Transport-Security',
                    'Content-Security-Policy': 'resp_headers_Content-Security-Policy',
                    'Referrer-Policy': 'resp_headers_Referrer-Policy'
                }

                headers_present = {}
                for header_name, column_name in security_headers.items():
                    is_present = column_name in headers_df.columns and headers_df[column_name].notna().any()
                    headers_present[header_name] = is_present

                security_analysis['security_headers_present'] = headers_present

                # Calculate security score
                present_count = sum(headers_present.values())
                security_analysis['security_score'] = (present_count / len(security_headers)) * 100

                # Generate recommendations
                recommendations = []
                for header_name, is_present in headers_present.items():
                    if not is_present:
                        recommendations.append(f"Add {header_name} header for improved security")

                security_analysis['security_recommendations'] = recommendations

            return security_analysis

        except Exception as e:
            st.error(f"Error analyzing security headers: {str(e)}")
            return {}

    def _analyze_mobile_seo(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze mobile SEO factors."""
        try:
            st.info("📱 Analyzing mobile SEO factors...")

            if 'crawl_dataframe' not in crawl_data:
                return {}

            df = crawl_data['crawl_dataframe']

            mobile_analysis = {
                'viewport_analysis': {},
                'mobile_optimization': {},
                'responsive_design_indicators': {}
            }

            # Viewport meta tag analysis
            if 'viewport' in df.columns:
                viewport_present = df['viewport'].notna().sum()
                mobile_analysis['viewport_analysis'] = {
                    'pages_with_viewport': viewport_present,
                    'viewport_coverage': viewport_present / len(df) * 100,
                    'pages_missing_viewport': len(df) - viewport_present
                }

            # Check for mobile-specific meta tags and indicators
            mobile_indicators = []

            # Check for touch icons
            if any('touch-icon' in col for col in df.columns):
                mobile_indicators.append("Touch icons configured")

            # Check for responsive design indicators in content
            # This is a simplified check - in practice, you'd analyze CSS and page structure
            mobile_analysis['mobile_optimization'] = {
                'mobile_indicators_found': len(mobile_indicators),
                'mobile_indicators': mobile_indicators
            }

            return mobile_analysis

        except Exception as e:
            st.error(f"Error analyzing mobile SEO: {str(e)}")
            return {}

    def _generate_technical_recommendations(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """Generate AI-powered technical SEO recommendations."""
        try:
            st.info("🤖 Generating technical recommendations...")

            # Prepare technical analysis summary for AI
            technical_summary = {
                'website_url': results.get('website_url', ''),
                'pages_crawled': results.get('crawl_overview', {}).get('pages_crawled', 0),
                'error_count': results.get('technical_issues', {}).get('http_errors', {}).get('total_errors', 0),
                'avg_load_time': results.get('performance_analysis', {}).get('load_time_analysis', {}).get('avg_load_time', 0),
                'security_score': results.get('security_headers', {}).get('security_score', 0),
                'missing_titles': results.get('content_analysis', {}).get('title_analysis', {}).get('missing_titles', 0),
                'missing_meta_desc': results.get('content_analysis', {}).get('meta_description_analysis', {}).get('missing_meta_descriptions', 0)
            }

            # Generate AI recommendations
            prompt = f"""
            As a technical SEO expert, analyze this comprehensive website audit and provide prioritized recommendations:

            WEBSITE: {technical_summary['website_url']}
            PAGES ANALYZED: {technical_summary['pages_crawled']}

            TECHNICAL ISSUES:
            - HTTP Errors: {technical_summary['error_count']}
            - Average Load Time: {technical_summary['avg_load_time']:.2f}s
            - Security Score: {technical_summary['security_score']:.1f}%
            - Missing Titles: {technical_summary['missing_titles']}
            - Missing Meta Descriptions: {technical_summary['missing_meta_desc']}

            PROVIDE:
            1. Critical Issues (Fix Immediately)
            2. High Priority Optimizations
            3. Medium Priority Improvements
            4. Long-term Technical Strategy
            5. Specific Implementation Steps
            6. Expected Impact Assessment

            Format as JSON with clear priorities and actionable recommendations.
            """

            ai_response = llm_text_gen(
                prompt=prompt,
                system_prompt="You are a senior technical SEO specialist with expertise in website optimization, Core Web Vitals, and search engine best practices.",
                response_format="json_object"
            )

            if ai_response:
                return ai_response
            else:
                return {'recommendations': ['AI recommendations temporarily unavailable']}

        except Exception as e:
            st.error(f"Error generating recommendations: {str(e)}")
            return {}

    def _find_redirect_chains(self, redirects_df: pd.DataFrame) -> List[Dict[str, Any]]:
        """Find redirect chains in the crawled data."""
        # Simplified redirect chain detection
        # In a full implementation, you'd trace the redirect paths
        redirect_chains = []

        if len(redirects_df) > 0:
            # Group redirects by status code
            for status_code in redirects_df['status'].unique():
                status_redirects = redirects_df[redirects_df['status'] == status_code]
                redirect_chains.append({
                    'status_code': int(status_code),
                    'count': len(status_redirects),
                    'examples': status_redirects['url'].head(5).tolist()
                })

        return redirect_chains