# File: ALwrity/ToBeMigrated/content_scheduler/core/health_checker.py
# Listing metadata: dated 2025-08-06 16:29:49 +05:30, 584 lines, 21 KiB, Python

"""
Schedule health monitoring system.
"""
import logging
import asyncio
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from dataclasses import dataclass
from enum import Enum
from ..utils.error_handling import SchedulingError
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Pin this logger's own level to INFO, so its effective level no longer
# follows the level configured on its parent/root logger.
logger.setLevel(logging.INFO)
class HealthStatus(Enum):
    """Health check status.

    The string values are what ``get_health_summary`` exposes under the
    ``'status'`` keys of its result.
    """

    # Component check passed.
    HEALTHY = "healthy"
    # Degraded but not failing (e.g. elevated failure rate or resource usage).
    WARNING = "warning"
    # Failing; the monitoring loop passes these checks to its recovery handler.
    CRITICAL = "critical"
    # Status could not be determined (used when building the summary fails).
    UNKNOWN = "unknown"
@dataclass
class HealthCheck:
    """Health check result.

    One record is produced per component each time the checks run.
    """

    # Name of the checked component, e.g. "scheduler" or "database".
    component: str
    # Outcome of the check.
    status: HealthStatus
    # Human-readable description of the outcome.
    message: str
    # Check-specific data (counts, percentages, or ``{'error': ...}``).
    details: Dict[str, Any]
    # When the result was created.
    timestamp: datetime
class ScheduleHealthChecker:
    """Schedule health monitoring system.

    Runs a fixed set of component checks against a scheduler object on a
    timer, appends the results to ``health_history`` and attempts recovery
    for components that report ``HealthStatus.CRITICAL``.
    """

    # Maximum number of HealthCheck results retained in ``health_history``.
    MAX_HISTORY = 1000

    def __init__(
        self,
        scheduler,
        check_interval: int = 300,  # 5 minutes
        warning_threshold: int = 3,
        critical_threshold: int = 5
    ):
        """Initialize the health checker.

        Args:
            scheduler: ContentScheduler instance
            check_interval: Health check interval in seconds
            warning_threshold: Number of failures before warning
            critical_threshold: Number of failures before critical
        """
        self.logger = logger
        self.scheduler = scheduler
        self.check_interval = check_interval
        # NOTE(review): the two thresholds are stored but no method in this
        # class reads them.
        self.warning_threshold = warning_threshold
        self.critical_threshold = critical_threshold

        # Chronological list of HealthCheck results (oldest first).
        self.health_history = []

        # Per-category failure counters, reset after a successful recovery.
        self.failure_counts = {
            'job_execution': 0,
            'platform_publish': 0,
            'schedule_conflicts': 0,
            'resource_usage': 0
        }

        # asyncio.Task running _monitor_health, or None when not monitoring.
        self.monitoring_task = None

    @staticmethod
    def _build_check(
        component: str,
        status: HealthStatus,
        message: str,
        details: Dict[str, Any]
    ) -> HealthCheck:
        """Create a HealthCheck stamped with the current naive UTC time."""
        return HealthCheck(
            component=component,
            status=status,
            message=message,
            details=details,
            timestamp=datetime.utcnow()
        )

    async def start_monitoring(self):
        """Start the health monitoring system.

        Does nothing if a monitoring task already exists.

        Raises:
            SchedulingError: If the monitoring task cannot be created.
        """
        try:
            if not self.monitoring_task:
                self.monitoring_task = asyncio.create_task(self._monitor_health())
                self.logger.info("Health monitoring started")
        except Exception as e:
            self.logger.error("Failed to start health monitoring: %s", e)
            raise SchedulingError(f"Health monitoring start failed: {str(e)}") from e

    async def stop_monitoring(self):
        """Stop the health monitoring system.

        Requests cancellation of the monitoring task; it does not wait for
        the task to finish.

        Raises:
            SchedulingError: If cancelling the task fails.
        """
        try:
            if self.monitoring_task:
                self.monitoring_task.cancel()
                self.monitoring_task = None
                self.logger.info("Health monitoring stopped")
        except Exception as e:
            self.logger.error("Failed to stop health monitoring: %s", e)
            raise SchedulingError(f"Health monitoring stop failed: {str(e)}") from e

    async def _monitor_health(self):
        """Monitor system health periodically.

        Loops until cancelled: run all checks, record them, trigger recovery
        for critical results, then sleep for ``check_interval`` seconds.
        """
        while True:
            try:
                health_checks = await self._perform_health_checks()

                # Record results and keep only the newest MAX_HISTORY entries.
                self.health_history.extend(health_checks)
                if len(self.health_history) > self.MAX_HISTORY:
                    self.health_history = self.health_history[-self.MAX_HISTORY:]

                critical_checks = [
                    check for check in health_checks
                    if check.status == HealthStatus.CRITICAL
                ]
                if critical_checks:
                    await self._handle_critical_issues(critical_checks)

                await asyncio.sleep(self.check_interval)
            except asyncio.CancelledError:
                # Cancellation is the normal shutdown path.
                break
            except Exception as e:
                # Keep the loop alive; retry after the usual interval.
                self.logger.error("Health monitoring error: %s", e)
                await asyncio.sleep(self.check_interval)

    async def _perform_health_checks(self) -> List[HealthCheck]:
        """Perform all health checks.

        Returns:
            List of health check results, one per component, in a fixed
            order. If running the checks raises, a single CRITICAL result
            for the "health_checker" component is returned instead.
        """
        check_methods = (
            self._check_scheduler_status,
            self._check_job_execution,
            self._check_platform_connectivity,
            self._check_resource_usage,
            self._check_schedule_conflicts,
            self._check_database_connection,
            self._check_job_store,
        )
        checks = []
        try:
            # Checks run sequentially, in the order listed above.
            for run_check in check_methods:
                checks.append(await run_check())
            return checks
        except Exception as e:
            self.logger.error("Health check failed: %s", e)
            return [
                self._build_check(
                    "health_checker",
                    HealthStatus.CRITICAL,
                    f"Health check system error: {str(e)}",
                    {'error': str(e)}
                )
            ]

    async def _check_scheduler_status(self) -> HealthCheck:
        """Check scheduler status.

        Returns:
            Health check result: CRITICAL when the underlying scheduler is
            not running or cannot be queried, HEALTHY otherwise.
        """
        try:
            is_running = self.scheduler.scheduler.running
            job_count = len(self.scheduler.scheduler.get_jobs())
            if not is_running:
                return self._build_check(
                    "scheduler",
                    HealthStatus.CRITICAL,
                    "Scheduler is not running",
                    {'job_count': job_count}
                )
            return self._build_check(
                "scheduler",
                HealthStatus.HEALTHY,
                "Scheduler is running",
                {'job_count': job_count}
            )
        except Exception as e:
            return self._build_check(
                "scheduler",
                HealthStatus.CRITICAL,
                f"Scheduler check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_job_execution(self) -> HealthCheck:
        """Check job execution health.

        Looks at jobs created within the last 24 hours and derives the
        fraction whose status is 'FAILED'.

        Returns:
            Health check result: CRITICAL at a failure rate of 20% or more,
            WARNING at 10% or more, HEALTHY otherwise.
        """
        try:
            # created_at is compared against naive UTC "now".
            cutoff = datetime.utcnow() - timedelta(hours=24)
            recent_jobs = [
                job for job in self.scheduler.job_status.values()
                if job['created_at'] > cutoff
            ]

            total_jobs = len(recent_jobs)
            failed_jobs = sum(1 for job in recent_jobs if job['status'] == 'FAILED')
            failure_rate = failed_jobs / total_jobs if total_jobs > 0 else 0

            # The counter mirrors the current 24h failure count (not cumulative).
            self.failure_counts['job_execution'] = failed_jobs

            details = {
                'total_jobs': total_jobs,
                'failed_jobs': failed_jobs,
                'failure_rate': failure_rate
            }
            if failure_rate >= 0.2:  # 20% failure rate
                return self._build_check(
                    "job_execution",
                    HealthStatus.CRITICAL,
                    "High job failure rate detected",
                    details
                )
            if failure_rate >= 0.1:  # 10% failure rate
                return self._build_check(
                    "job_execution",
                    HealthStatus.WARNING,
                    "Elevated job failure rate",
                    details
                )
            return self._build_check(
                "job_execution",
                HealthStatus.HEALTHY,
                "Job execution is healthy",
                details
            )
        except Exception as e:
            return self._build_check(
                "job_execution",
                HealthStatus.CRITICAL,
                f"Job execution check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_platform_connectivity(self) -> HealthCheck:
        """Check platform connectivity.

        Queries the status of every platform referenced by a job that
        carries a schedule.

        Returns:
            Health check result: CRITICAL if any platform status is 'error'
            (including a failed status query), HEALTHY otherwise.
        """
        try:
            # Collect the distinct platforms referenced by scheduled jobs.
            platforms = set()
            for job in self.scheduler.job_status.values():
                if 'schedule' in job:
                    platforms.update(job['schedule'].platforms)

            platform_status = {}
            for platform in platforms:
                try:
                    adapter = self.scheduler._get_platform_adapter(platform)
                    status = await adapter.get_platform_status()
                    platform_status[platform] = status['status']
                except Exception as e:
                    # A failed query is recorded as 'error' rather than
                    # aborting the whole check; log it so it is not lost.
                    self.logger.warning(
                        "Platform status check failed for %s: %s", platform, e
                    )
                    platform_status[platform] = 'error'
                    self.failure_counts['platform_publish'] += 1

            if any(status == 'error' for status in platform_status.values()):
                return self._build_check(
                    "platform_connectivity",
                    HealthStatus.CRITICAL,
                    "Platform connectivity issues detected",
                    {'platform_status': platform_status}
                )
            return self._build_check(
                "platform_connectivity",
                HealthStatus.HEALTHY,
                "Platform connectivity is healthy",
                {'platform_status': platform_status}
            )
        except Exception as e:
            return self._build_check(
                "platform_connectivity",
                HealthStatus.CRITICAL,
                f"Platform connectivity check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_resource_usage(self) -> HealthCheck:
        """Check system resource usage.

        Reads CPU, memory and root-disk usage percentages via psutil.

        Returns:
            Health check result: CRITICAL when any metric exceeds 90%,
            WARNING when any exceeds 70%, HEALTHY otherwise. CRITICAL is
            also returned when psutil is missing or a metric cannot be read.
        """
        try:
            # Imported lazily so the module loads even without psutil.
            import psutil

            cpu_percent = psutil.cpu_percent()
            memory_percent = psutil.virtual_memory().percent
            disk_percent = psutil.disk_usage('/').percent
            details = {
                'cpu_percent': cpu_percent,
                'memory_percent': memory_percent,
                'disk_percent': disk_percent
            }

            peak = max(cpu_percent, memory_percent, disk_percent)
            if peak > 90:
                self.failure_counts['resource_usage'] += 1
                return self._build_check(
                    "resource_usage",
                    HealthStatus.CRITICAL,
                    "High resource usage detected",
                    details
                )
            if peak > 70:
                return self._build_check(
                    "resource_usage",
                    HealthStatus.WARNING,
                    "Elevated resource usage",
                    details
                )
            return self._build_check(
                "resource_usage",
                HealthStatus.HEALTHY,
                "Resource usage is healthy",
                details
            )
        except Exception as e:
            return self._build_check(
                "resource_usage",
                HealthStatus.CRITICAL,
                f"Resource usage check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_schedule_conflicts(self) -> HealthCheck:
        """Check for schedule conflicts.

        Passes the schedules of all 'PENDING' jobs to the scheduler's
        conflict resolver.

        Returns:
            Health check result: WARNING when conflicts are reported,
            HEALTHY when none are, CRITICAL when the check itself fails.
        """
        try:
            # Jobs without a 'schedule' entry are skipped, matching the
            # guard used by the platform connectivity check.
            pending_schedules = [
                job['schedule'] for job in self.scheduler.job_status.values()
                if job['status'] == 'PENDING' and 'schedule' in job
            ]

            conflicts = await self.scheduler.conflict_resolver.detect_conflicts(
                pending_schedules
            )
            if conflicts:
                # Cumulative: grows on every check that still sees conflicts.
                self.failure_counts['schedule_conflicts'] += len(conflicts)
                return self._build_check(
                    "schedule_conflicts",
                    HealthStatus.WARNING,
                    "Schedule conflicts detected",
                    {
                        'conflict_count': len(conflicts),
                        'conflicts': [c.dict() for c in conflicts]
                    }
                )
            return self._build_check(
                "schedule_conflicts",
                HealthStatus.HEALTHY,
                "No schedule conflicts detected",
                {'conflict_count': 0}
            )
        except Exception as e:
            return self._build_check(
                "schedule_conflicts",
                HealthStatus.CRITICAL,
                f"Schedule conflict check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_database_connection(self) -> HealthCheck:
        """Check database connection health.

        Opens a session from the scheduler's session factory and runs a
        trivial query.

        Returns:
            Health check result: HEALTHY when the query succeeds, CRITICAL
            when opening the session, the query, or closing it raises.
        """
        try:
            session = self.scheduler.Session()
            try:
                # NOTE(review): if this is a SQLAlchemy 2.x session, raw SQL
                # strings must be wrapped in sqlalchemy.text(); confirm the
                # installed version.
                session.execute("SELECT 1")
            finally:
                # Always release the session, even when the query fails.
                session.close()
            return self._build_check(
                "database",
                HealthStatus.HEALTHY,
                "Database connection is healthy",
                {}
            )
        except Exception as e:
            return self._build_check(
                "database",
                HealthStatus.CRITICAL,
                f"Database connection failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_job_store(self) -> HealthCheck:
        """Check job store health.

        Compares the number of jobs known to the underlying scheduler with
        the number of entries in the scheduler's ``job_status`` mapping.

        Returns:
            Health check result: WARNING when the two counts differ,
            HEALTHY when they match, CRITICAL when the check itself fails.
        """
        try:
            job_count = len(self.scheduler.scheduler.get_jobs())
            store_size = len(self.scheduler.job_status)
            details = {
                'job_count': job_count,
                'store_size': store_size
            }
            if job_count != store_size:
                return self._build_check(
                    "job_store",
                    HealthStatus.WARNING,
                    "Job store inconsistency detected",
                    details
                )
            return self._build_check(
                "job_store",
                HealthStatus.HEALTHY,
                "Job store is healthy",
                details
            )
        except Exception as e:
            return self._build_check(
                "job_store",
                HealthStatus.CRITICAL,
                f"Job store check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _handle_critical_issues(self, critical_checks: List[HealthCheck]):
        """Handle critical health issues.

        Logs every critical check, then attempts a component-specific
        recovery. Each recovery is isolated so that one failure does not
        prevent the others from running. Failure counters are reset only
        when no recovery attempt raised.

        Args:
            critical_checks: List of critical health checks
        """
        for check in critical_checks:
            self.logger.error(
                "Critical health issue in %s: %s", check.component, check.message
            )

        all_recovered = True
        for check in critical_checks:
            try:
                if check.component == "scheduler" and not self.scheduler.scheduler.running:
                    await self.scheduler.start()
                elif check.component == "database":
                    # Discard the pooled connections. This assumes a
                    # SQLAlchemy-style engine that opens fresh connections
                    # on next use, so the existing Session factory stays
                    # valid and no engine re-creation is needed.
                    self.scheduler.engine.dispose()
                elif check.component == "job_store":
                    await self.scheduler._recover_jobs()
            except Exception as e:
                all_recovered = False
                self.logger.error("Failed to handle critical issues: %s", e)

        if all_recovered:
            self.failure_counts = {k: 0 for k in self.failure_counts}

    def get_health_summary(self) -> Dict[str, Any]:
        """Get health check summary.

        Returns:
            Dictionary containing the overall status, the most recent result
            per component, a copy of the failure counters and the time the
            summary was generated (key ``'last_check'``). The overall status
            is "unknown" when no check has been recorded yet, otherwise the
            worst status among the latest per-component results. If building
            the summary fails, a dictionary with status "unknown" and an
            ``'error'`` message is returned.
        """
        try:
            # History is chronological, so later entries overwrite earlier
            # ones and each component ends up mapped to its newest result.
            latest_checks = {
                check.component: check
                for check in self.health_history
            }

            statuses = [check.status for check in latest_checks.values()]
            if not statuses:
                # Nothing has been checked yet; do not claim "healthy".
                overall_status = HealthStatus.UNKNOWN
            elif HealthStatus.CRITICAL in statuses:
                overall_status = HealthStatus.CRITICAL
            elif HealthStatus.WARNING in statuses:
                overall_status = HealthStatus.WARNING
            else:
                overall_status = HealthStatus.HEALTHY

            return {
                'status': overall_status.value,
                'components': {
                    component: {
                        'status': check.status.value,
                        'message': check.message,
                        'details': check.details,
                        'timestamp': check.timestamp.isoformat()
                    }
                    for component, check in latest_checks.items()
                },
                # Copy so callers cannot mutate the internal counters.
                'failure_counts': dict(self.failure_counts),
                'last_check': datetime.utcnow().isoformat()
            }
        except Exception as e:
            self.logger.error("Failed to get health summary: %s", e)
            return {
                'status': HealthStatus.UNKNOWN.value,
                'error': str(e),
                'last_check': datetime.utcnow().isoformat()
            }