# ALwrity/ToBeMigrated/content_scheduler/core/health_checker.py

"""
Schedule health monitoring system.
"""

import logging
import asyncio
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from dataclasses import dataclass
from enum import Enum

from ..utils.error_handling import SchedulingError

# Module-level logger; ScheduleHealthChecker stores a reference to it as
# ``self.logger`` and uses it for all of its log output.
logger = logging.getLogger(__name__)
# NOTE(review): the level is forced to INFO at import time, which applies to
# this module's logger regardless of the level configured on parent loggers.
logger.setLevel(logging.INFO)

class HealthStatus(Enum):
    """Severity levels a health check (or the overall summary) can carry.

    The string values are what ``get_health_summary`` emits via ``.value``.
    """
    # Check ran and found nothing wrong.
    HEALTHY = "healthy"
    # Degraded but not failing (e.g. elevated failure rate or resource usage).
    WARNING = "warning"
    # Failing; critical results are passed to ``_handle_critical_issues``.
    CRITICAL = "critical"
    # Status could not be determined (used as a fallback by the summary).
    UNKNOWN = "unknown"

@dataclass
class HealthCheck:
    """Result of a single health check run against one component."""
    # Name of the checked component, e.g. "scheduler" or "job_execution".
    component: str
    # Severity reported by the check.
    status: HealthStatus
    # Human-readable explanation of the status.
    message: str
    # Check-specific data (counts, rates, error text, ...).
    details: Dict[str, Any]
    # When the result was produced; this module fills it with
    # ``datetime.utcnow()``, i.e. a naive UTC datetime.
    timestamp: datetime

class ScheduleHealthChecker:
    """Schedule health monitoring system."""

    def __init__(
        self,
        scheduler,
        check_interval: int = 300,  # 5 minutes
        warning_threshold: int = 3,
        critical_threshold: int = 5
    ):
        """Set up a health checker bound to ``scheduler``.

        Nothing is monitored until ``start_monitoring`` is called.

        Args:
            scheduler: ContentScheduler instance whose state is inspected
            check_interval: Seconds to wait between health check cycles
            warning_threshold: Number of failures before warning
            critical_threshold: Number of failures before critical
        """
        # Collaborators.
        self.scheduler = scheduler
        self.logger = logger

        # Tunables. The two thresholds are stored as given; no method of this
        # class reads them.
        self.check_interval, self.warning_threshold, self.critical_threshold = (
            check_interval,
            warning_threshold,
            critical_threshold,
        )

        # Handle of the background monitor task; None while not started.
        self.monitoring_task = None

        # Per-category failure tallies, all starting at zero.
        self.failure_counts = dict.fromkeys(
            (
                'job_execution',
                'platform_publish',
                'schedule_conflicts',
                'resource_usage',
            ),
            0,
        )

        # Chronological record of HealthCheck results.
        self.health_history = []

    async def start_monitoring(self):
        """Start the health monitoring system."""
        try:
            if not self.monitoring_task:
                self.monitoring_task = asyncio.create_task(self._monitor_health())
                self.logger.info("Health monitoring started")
        except Exception as e:
            self.logger.error(f"Failed to start health monitoring: {str(e)}")
            raise SchedulingError(f"Health monitoring start failed: {str(e)}")

    async def stop_monitoring(self):
        """Stop the health monitoring system."""
        try:
            if self.monitoring_task:
                self.monitoring_task.cancel()
                self.monitoring_task = None
                self.logger.info("Health monitoring stopped")
        except Exception as e:
            self.logger.error(f"Failed to stop health monitoring: {str(e)}")
            raise SchedulingError(f"Health monitoring stop failed: {str(e)}")

    async def _monitor_health(self):
        """Monitor system health periodically."""
        while True:
            try:
                # Perform health checks
                health_checks = await self._perform_health_checks()

                # Update health history
                self.health_history.extend(health_checks)

                # Trim history if too long
                if len(self.health_history) > 1000:
                    self.health_history = self.health_history[-1000:]

                # Check for critical issues
                critical_checks = [
                    check for check in health_checks
                    if check.status == HealthStatus.CRITICAL
                ]

                if critical_checks:
                    await self._handle_critical_issues(critical_checks)

                # Wait for next check
                await asyncio.sleep(self.check_interval)

            except asyncio.CancelledError:
                break
            except Exception as e:
                self.logger.error(f"Health monitoring error: {str(e)}")
                await asyncio.sleep(self.check_interval)

    async def _perform_health_checks(self) -> List[HealthCheck]:
        """Perform all health checks.

        Returns:
            List of health check results
        """
        checks = []

        try:
            # Check scheduler status
            checks.append(await self._check_scheduler_status())

            # Check job execution
            checks.append(await self._check_job_execution())

            # Check platform connectivity
            checks.append(await self._check_platform_connectivity())

            # Check resource usage
            checks.append(await self._check_resource_usage())

            # Check schedule conflicts
            checks.append(await self._check_schedule_conflicts())

            # Check database connection
            checks.append(await self._check_database_connection())

            # Check job store
            checks.append(await self._check_job_store())

            return checks

        except Exception as e:
            self.logger.error(f"Health check failed: {str(e)}")
            return [
                HealthCheck(
                    component="health_checker",
                    status=HealthStatus.CRITICAL,
                    message=f"Health check system error: {str(e)}",
                    details={'error': str(e)},
                    timestamp=datetime.utcnow()
                )
            ]

    async def _check_scheduler_status(self) -> HealthCheck:
        """Report whether the underlying scheduler is running.

        Returns:
            A "scheduler" result: HEALTHY when running, CRITICAL when not
            running or when inspecting the scheduler raised. On the normal
            paths ``details`` carries the current ``job_count``.
        """
        try:
            backend = self.scheduler.scheduler
            running = backend.running
            details = {'job_count': len(backend.get_jobs())}

            if running:
                status, message = HealthStatus.HEALTHY, "Scheduler is running"
            else:
                status, message = HealthStatus.CRITICAL, "Scheduler is not running"

            return HealthCheck(
                component="scheduler",
                status=status,
                message=message,
                details=details,
                timestamp=datetime.utcnow()
            )

        except Exception as exc:
            reason = str(exc)
            return HealthCheck(
                component="scheduler",
                status=HealthStatus.CRITICAL,
                message=f"Scheduler check failed: {reason}",
                details={'error': reason},
                timestamp=datetime.utcnow()
            )

    async def _check_job_execution(self) -> HealthCheck:
        """Grade job execution by the failure rate of recently created jobs.

        Looks at entries of ``scheduler.job_status`` whose ``created_at`` lies
        within the last 24 hours and computes the share with status
        ``'FAILED'``. The number of failed jobs is also stored in
        ``failure_counts['job_execution']``.

        Returns:
            A "job_execution" result: CRITICAL at a failure rate of 20% or
            more, WARNING at 10% or more, HEALTHY otherwise (including when
            there are no recent jobs). CRITICAL if the check itself raised.
        """
        try:
            # Jobs created within the look-back window.
            window = timedelta(hours=24)
            recent = [
                entry for entry in self.scheduler.job_status.values()
                if datetime.utcnow() - entry['created_at'] < window
            ]

            total_jobs = len(recent)
            failed_jobs = sum(1 for entry in recent if entry['status'] == 'FAILED')
            failure_rate = failed_jobs / total_jobs if total_jobs > 0 else 0

            # Counter reflects the current window, not a running total.
            self.failure_counts['job_execution'] = failed_jobs

            if failure_rate >= 0.2:  # 20% failure rate
                status = HealthStatus.CRITICAL
                message = "High job failure rate detected"
            elif failure_rate >= 0.1:  # 10% failure rate
                status = HealthStatus.WARNING
                message = "Elevated job failure rate"
            else:
                status = HealthStatus.HEALTHY
                message = "Job execution is healthy"

            return HealthCheck(
                component="job_execution",
                status=status,
                message=message,
                details={
                    'total_jobs': total_jobs,
                    'failed_jobs': failed_jobs,
                    'failure_rate': failure_rate
                },
                timestamp=datetime.utcnow()
            )

        except Exception as exc:
            reason = str(exc)
            return HealthCheck(
                component="job_execution",
                status=HealthStatus.CRITICAL,
                message=f"Job execution check failed: {reason}",
                details={'error': reason},
                timestamp=datetime.utcnow()
            )

    async def _check_platform_connectivity(self) -> HealthCheck:
        """Probe every platform referenced by the tracked jobs.

        Collects the platforms of all ``scheduler.job_status`` entries that
        carry a ``'schedule'``, asks each platform's adapter for its status
        and records the reported ``status['status']`` value. A platform whose
        probe raises is recorded as ``'error'``, logged, and counted in
        ``failure_counts['platform_publish']``.

        Returns:
            A "platform_connectivity" result: CRITICAL if any platform is
            recorded as ``'error'`` or the check itself raised, HEALTHY
            otherwise. ``details['platform_status']`` maps platform to its
            recorded status.
        """
        try:
            # Get unique platforms from tracked jobs
            platforms = set()
            for job in self.scheduler.job_status.values():
                if 'schedule' in job:
                    platforms.update(job['schedule'].platforms)

            # Check each platform
            platform_status = {}
            for platform in platforms:
                try:
                    adapter = self.scheduler._get_platform_adapter(platform)
                    # Try to get platform status
                    status = await adapter.get_platform_status()
                    platform_status[platform] = status['status']
                except Exception as e:
                    # Keep probing the remaining platforms, but do not lose
                    # the reason this one failed.
                    self.logger.warning(
                        f"Platform status check failed for {platform}: {str(e)}"
                    )
                    platform_status[platform] = 'error'
                    self.failure_counts['platform_publish'] += 1

            # Check overall status
            if any(status == 'error' for status in platform_status.values()):
                return HealthCheck(
                    component="platform_connectivity",
                    status=HealthStatus.CRITICAL,
                    message="Platform connectivity issues detected",
                    details={'platform_status': platform_status},
                    timestamp=datetime.utcnow()
                )

            return HealthCheck(
                component="platform_connectivity",
                status=HealthStatus.HEALTHY,
                message="Platform connectivity is healthy",
                details={'platform_status': platform_status},
                timestamp=datetime.utcnow()
            )

        except Exception as e:
            return HealthCheck(
                component="platform_connectivity",
                status=HealthStatus.CRITICAL,
                message=f"Platform connectivity check failed: {str(e)}",
                details={'error': str(e)},
                timestamp=datetime.utcnow()
            )

    async def _check_resource_usage(self) -> HealthCheck:
        """Grade CPU, memory and root-disk utilisation as reported by psutil.

        ``psutil`` is imported lazily inside the check, so a missing package
        surfaces as a failed check rather than an import error at module load.

        Returns:
            A "resource_usage" result: CRITICAL if any metric exceeds 90%
            (also counted in ``failure_counts['resource_usage']``), WARNING
            if any exceeds 70%, HEALTHY otherwise. CRITICAL if the check
            itself raised.
        """
        try:
            import psutil

            # NOTE(review): cpu_percent() is called without an interval; per
            # the psutil docs that measures since the previous call - confirm
            # the first reading after start-up is acceptable.
            usage = {
                'cpu_percent': psutil.cpu_percent(),
                'memory_percent': psutil.virtual_memory().percent,
                'disk_percent': psutil.disk_usage('/').percent
            }

            if any(value > 90 for value in usage.values()):
                self.failure_counts['resource_usage'] += 1
                status = HealthStatus.CRITICAL
                message = "High resource usage detected"
            elif any(value > 70 for value in usage.values()):
                status = HealthStatus.WARNING
                message = "Elevated resource usage"
            else:
                status = HealthStatus.HEALTHY
                message = "Resource usage is healthy"

            return HealthCheck(
                component="resource_usage",
                status=status,
                message=message,
                details=usage,
                timestamp=datetime.utcnow()
            )

        except Exception as exc:
            reason = str(exc)
            return HealthCheck(
                component="resource_usage",
                status=HealthStatus.CRITICAL,
                message=f"Resource usage check failed: {reason}",
                details={'error': reason},
                timestamp=datetime.utcnow()
            )

    async def _check_schedule_conflicts(self) -> HealthCheck:
        """Detect conflicts among the schedules of pending jobs.

        Passes the schedules of all ``'PENDING'`` entries in
        ``scheduler.job_status`` to ``scheduler.conflict_resolver``. Entries
        without a ``'schedule'`` key are skipped, matching how the platform
        connectivity check treats them. Detected conflicts are added to
        ``failure_counts['schedule_conflicts']``.

        Returns:
            A "schedule_conflicts" result: WARNING with the conflict list when
            conflicts exist, HEALTHY when none, CRITICAL if the check itself
            raised.
        """
        try:
            # Get all pending schedules; a pending job that has no schedule
            # attached cannot conflict and must not fail the whole check.
            pending_schedules = [
                job['schedule'] for job in self.scheduler.job_status.values()
                if job['status'] == 'PENDING' and 'schedule' in job
            ]

            # Check for conflicts
            conflicts = await self.scheduler.conflict_resolver.detect_conflicts(
                pending_schedules
            )

            if conflicts:
                conflict_count = len(conflicts)
                # NOTE(review): this accumulates on every cycle, so a conflict
                # that persists is counted again each time - confirm intended.
                self.failure_counts['schedule_conflicts'] += conflict_count
                return HealthCheck(
                    component="schedule_conflicts",
                    status=HealthStatus.WARNING,
                    message="Schedule conflicts detected",
                    details={
                        'conflict_count': conflict_count,
                        'conflicts': [c.dict() for c in conflicts]
                    },
                    timestamp=datetime.utcnow()
                )

            return HealthCheck(
                component="schedule_conflicts",
                status=HealthStatus.HEALTHY,
                message="No schedule conflicts detected",
                details={'conflict_count': 0},
                timestamp=datetime.utcnow()
            )

        except Exception as e:
            return HealthCheck(
                component="schedule_conflicts",
                status=HealthStatus.CRITICAL,
                message=f"Schedule conflict check failed: {str(e)}",
                details={'error': str(e)},
                timestamp=datetime.utcnow()
            )

    async def _check_database_connection(self) -> HealthCheck:
        """Verify the database is reachable by running ``SELECT 1``.

        Opens a session from ``scheduler.Session``, executes the probe query
        and always closes the session again, even when the query fails.

        Returns:
            A "database" result: HEALTHY if the query succeeded, CRITICAL
            (with the error text) otherwise.
        """
        try:
            session = self.scheduler.Session()
            try:
                # NOTE(review): this is a synchronous call inside a coroutine.
                # If Session is a SQLAlchemy 2.x session, a plain SQL string
                # must be wrapped in sqlalchemy.text() - confirm the version.
                session.execute("SELECT 1")
            finally:
                # Release the session whether or not the probe succeeded so a
                # failing database does not leak one session per check cycle.
                session.close()

            return HealthCheck(
                component="database",
                status=HealthStatus.HEALTHY,
                message="Database connection is healthy",
                details={},
                timestamp=datetime.utcnow()
            )

        except Exception as e:
            return HealthCheck(
                component="database",
                status=HealthStatus.CRITICAL,
                message=f"Database connection failed: {str(e)}",
                details={'error': str(e)},
                timestamp=datetime.utcnow()
            )

    async def _check_job_store(self) -> HealthCheck:
        """Compare the scheduler's job list with the tracked job statuses.

        Returns:
            A "job_store" result: HEALTHY when the number of scheduled jobs
            equals the number of ``scheduler.job_status`` entries, WARNING
            when the two differ, CRITICAL if the check itself raised. Both
            counts are reported in ``details``.
        """
        try:
            details = {
                'job_count': len(self.scheduler.scheduler.get_jobs()),
                'store_size': len(self.scheduler.job_status)
            }

            if details['job_count'] == details['store_size']:
                status = HealthStatus.HEALTHY
                message = "Job store is healthy"
            else:
                status = HealthStatus.WARNING
                message = "Job store inconsistency detected"

            return HealthCheck(
                component="job_store",
                status=status,
                message=message,
                details=details,
                timestamp=datetime.utcnow()
            )

        except Exception as exc:
            reason = str(exc)
            return HealthCheck(
                component="job_store",
                status=HealthStatus.CRITICAL,
                message=f"Job store check failed: {reason}",
                details={'error': reason},
                timestamp=datetime.utcnow()
            )

    async def _handle_critical_issues(self, critical_checks: List[HealthCheck]):
        """Handle critical health issues.

        Args:
            critical_checks: List of critical health checks
        """
        try:
            # Log critical issues
            for check in critical_checks:
                self.logger.error(
                    f"Critical health issue in {check.component}: {check.message}"
                )

            # Attempt recovery actions
            for check in critical_checks:
                if check.component == "scheduler" and not self.scheduler.scheduler.running:
                    await self.scheduler.start()

                elif check.component == "database":
                    # Attempt to reconnect
                    self.scheduler.engine.dispose()
                    self.scheduler.engine = create_engine(self.scheduler.db_url)
                    self.scheduler.Session = sessionmaker(bind=self.scheduler.engine)

                elif check.component == "job_store":
                    # Attempt to recover job store
                    await self.scheduler._recover_jobs()

            # Reset failure counters if recovery successful
            self.failure_counts = {k: 0 for k in self.failure_counts}

        except Exception as e:
            self.logger.error(f"Failed to handle critical issues: {str(e)}")

    def get_health_summary(self) -> Dict[str, Any]:
        """Summarise the most recent result of every component.

        Returns:
            Dictionary with:
                * ``status``: overall status value - "critical" if any
                  component's latest result is critical, else "warning" if
                  any is a warning, else "healthy"; "unknown" when no check
                  has been recorded yet.
                * ``components``: per-component status, message, details and
                  ISO timestamp of its latest result.
                * ``failure_counts``: a copy of the failure counters.
                * ``last_check``: ISO timestamp of the newest recorded result
                  (the current time when nothing has been recorded).
            If building the summary fails, a dictionary with status
            "unknown", the error text and ``last_check`` is returned instead.
        """
        try:
            # History is chronological, so later entries overwrite earlier
            # ones and each component ends up mapped to its latest result.
            latest_checks = {
                check.component: check
                for check in self.health_history
            }

            # Calculate overall status
            if not latest_checks:
                # No check has run yet: do not claim the system is healthy.
                overall_status = HealthStatus.UNKNOWN
            elif any(check.status == HealthStatus.CRITICAL for check in latest_checks.values()):
                overall_status = HealthStatus.CRITICAL
            elif any(check.status == HealthStatus.WARNING for check in latest_checks.values()):
                overall_status = HealthStatus.WARNING
            else:
                overall_status = HealthStatus.HEALTHY

            # Report when a check last actually ran, so stalled monitoring is
            # visible to consumers instead of always looking fresh.
            if latest_checks:
                last_check = max(check.timestamp for check in latest_checks.values())
            else:
                last_check = datetime.utcnow()

            return {
                'status': overall_status.value,
                'components': {
                    component: {
                        'status': check.status.value,
                        'message': check.message,
                        'details': check.details,
                        'timestamp': check.timestamp.isoformat()
                    }
                    for component, check in latest_checks.items()
                },
                # Copy so callers cannot mutate the checker's own counters.
                'failure_counts': dict(self.failure_counts),
                'last_check': last_check.isoformat()
            }

        except Exception as e:
            self.logger.error(f"Failed to get health summary: {str(e)}")
            return {
                'status': HealthStatus.UNKNOWN.value,
                'error': str(e),
                'last_check': datetime.utcnow().isoformat()
            }