"""Schedule health monitoring system."""
|
|
|
|
import logging
|
|
import asyncio
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime, timedelta
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
|
|
from ..utils.error_handling import SchedulingError
|
|
|
|
# Module-level logger; ScheduleHealthChecker instances reuse it as self.logger.
logger = logging.getLogger(__name__)
# NOTE(review): this pins the logger to INFO at import time instead of letting
# it inherit the level from the application's logging configuration — confirm
# that is intended before relying on external log-level settings.
logger.setLevel(logging.INFO)
|
|
|
|
class HealthStatus(Enum):
    """Severity reported by a single health check or by the overall summary."""

    # Component responded and is within its thresholds.
    HEALTHY = "healthy"
    # Component works but crossed a soft threshold.
    WARNING = "warning"
    # Component failed or crossed a hard threshold.
    CRITICAL = "critical"
    # Health could not be determined.
    UNKNOWN = "unknown"
|
|
|
|
@dataclass
class HealthCheck:
    """Result of one health probe against one component."""

    # Name of the probed subsystem, e.g. "scheduler" or "database".
    component: str
    # Severity of the finding (forward reference to the enum defined above).
    status: "HealthStatus"
    # Human-readable summary of the finding.
    message: str
    # Probe-specific payload (counts, percentages, error text, ...).
    details: Dict[str, Any]
    # Moment the result was produced.
    timestamp: datetime
|
|
|
|
class ScheduleHealthChecker:
    """Run periodic health checks against a content scheduler.

    A background asyncio task (created by ``start_monitoring``) runs every
    ``_check_*`` probe once per ``check_interval`` seconds, appends the results
    to ``health_history`` and attempts recovery for CRITICAL results.
    ``get_health_summary`` reports the most recent result per component.
    """

    # Upper bound on the number of HealthCheck records kept in health_history.
    MAX_HISTORY = 1000

    def __init__(
        self,
        scheduler,
        check_interval: int = 300,  # 5 minutes
        warning_threshold: int = 3,
        critical_threshold: int = 5
    ):
        """Initialize the health checker.

        Args:
            scheduler: ContentScheduler instance
            check_interval: Health check interval in seconds
            warning_threshold: Number of failures before warning
            critical_threshold: Number of failures before critical

        Note:
            ``warning_threshold`` and ``critical_threshold`` are stored but no
            method of this class currently consults them.
        """
        self.logger = logger
        self.scheduler = scheduler
        self.check_interval = check_interval
        self.warning_threshold = warning_threshold
        self.critical_threshold = critical_threshold

        # Chronological list of HealthCheck results, capped at MAX_HISTORY.
        self.health_history = []

        # Per-category failure counters, exposed through get_health_summary().
        self.failure_counts = {
            'job_execution': 0,
            'platform_publish': 0,
            'schedule_conflicts': 0,
            'resource_usage': 0
        }

        # Handle of the background monitoring task; None while stopped.
        self.monitoring_task = None

    @staticmethod
    def _build_check(component, status, message, details=None) -> "HealthCheck":
        """Create a HealthCheck stamped with the current (naive) UTC time."""
        # NOTE(review): datetime.utcnow() is kept on purpose; _check_job_execution
        # subtracts job['created_at'] from it, so switching to an aware datetime
        # would break if those values are naive — confirm before modernising.
        return HealthCheck(
            component=component,
            status=status,
            message=message,
            details={} if details is None else details,
            timestamp=datetime.utcnow()
        )

    async def start_monitoring(self):
        """Start the health monitoring system.

        Creates the background task if none exists or if the previous one has
        already finished; an active task is left untouched.

        Raises:
            SchedulingError: If the task could not be created.
        """
        try:
            task = self.monitoring_task
            # A finished task can never run again, so replace it as well.
            if task is None or task.done():
                self.monitoring_task = asyncio.create_task(self._monitor_health())
            self.logger.info("Health monitoring started")
        except Exception as e:
            self.logger.error(f"Failed to start health monitoring: {str(e)}")
            raise SchedulingError(f"Health monitoring start failed: {str(e)}") from e

    async def stop_monitoring(self):
        """Stop the health monitoring system.

        Cancels the background task and waits until it has actually finished.

        Raises:
            SchedulingError: If stopping the task failed.
        """
        try:
            task = self.monitoring_task
            if task:
                self.monitoring_task = None
                task.cancel()
                # A task cannot await itself; skip the wait in that case.
                if task is not asyncio.current_task():
                    try:
                        await task
                    except asyncio.CancelledError:
                        # Expected outcome of the cancel() above.
                        pass
            self.logger.info("Health monitoring stopped")
        except Exception as e:
            self.logger.error(f"Failed to stop health monitoring: {str(e)}")
            raise SchedulingError(f"Health monitoring stop failed: {str(e)}") from e

    async def _monitor_health(self):
        """Monitor system health periodically until the task is cancelled."""
        while True:
            try:
                # Perform health checks
                health_checks = await self._perform_health_checks()

                # Update health history, keeping only the newest entries
                self.health_history.extend(health_checks)
                if len(self.health_history) > self.MAX_HISTORY:
                    self.health_history = self.health_history[-self.MAX_HISTORY:]

                # Check for critical issues
                critical_checks = [
                    check for check in health_checks
                    if check.status == HealthStatus.CRITICAL
                ]
                if critical_checks:
                    await self._handle_critical_issues(critical_checks)
            except asyncio.CancelledError:
                # Propagate cancellation so the task ends up in the cancelled state.
                raise
            except Exception as e:
                self.logger.error(f"Health monitoring error: {str(e)}")

            # Wait for next check (cancellation during the sleep propagates too)
            await asyncio.sleep(self.check_interval)

    async def _perform_health_checks(self) -> List["HealthCheck"]:
        """Perform all health checks.

        Each probe is isolated: if one raises unexpectedly, a CRITICAL
        "health_checker" entry is recorded for it and the remaining probes
        still run, so earlier results are not thrown away.

        Returns:
            List of health check results
        """
        probes = (
            self._check_scheduler_status,
            self._check_job_execution,
            self._check_platform_connectivity,
            self._check_resource_usage,
            self._check_schedule_conflicts,
            self._check_database_connection,
            self._check_job_store,
        )

        checks = []
        for probe in probes:
            try:
                checks.append(await probe())
            except Exception as e:
                self.logger.error(f"Health check failed: {str(e)}")
                checks.append(self._build_check(
                    "health_checker",
                    HealthStatus.CRITICAL,
                    f"Health check system error: {str(e)}",
                    {
                        'error': str(e),
                        'check': getattr(probe, '__name__', repr(probe))
                    }
                ))
        return checks

    async def _check_scheduler_status(self) -> "HealthCheck":
        """Check scheduler status.

        Returns:
            Health check result: CRITICAL when the wrapped scheduler is not
            running or cannot be queried, HEALTHY otherwise.
        """
        try:
            inner = self.scheduler.scheduler
            is_running = inner.running
            job_count = len(inner.get_jobs())

            if not is_running:
                return self._build_check(
                    "scheduler",
                    HealthStatus.CRITICAL,
                    "Scheduler is not running",
                    {'job_count': job_count}
                )
            return self._build_check(
                "scheduler",
                HealthStatus.HEALTHY,
                "Scheduler is running",
                {'job_count': job_count}
            )
        except Exception as e:
            return self._build_check(
                "scheduler",
                HealthStatus.CRITICAL,
                f"Scheduler check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_job_execution(self) -> "HealthCheck":
        """Check job execution health.

        Looks at jobs created within the last 24 hours and grades their
        failure rate: >= 20% is CRITICAL, >= 10% is WARNING.

        Returns:
            Health check result
        """
        try:
            # Evaluate "now" once so every job is compared to the same instant.
            now = datetime.utcnow()
            window = timedelta(hours=24)
            recent_jobs = [
                job for job in self.scheduler.job_status.values()
                if now - job['created_at'] < window
            ]

            # Calculate failure rate
            total_jobs = len(recent_jobs)
            failed_jobs = sum(1 for job in recent_jobs if job['status'] == 'FAILED')
            failure_rate = failed_jobs / total_jobs if total_jobs > 0 else 0

            # Update failure counter
            self.failure_counts['job_execution'] = failed_jobs

            details = {
                'total_jobs': total_jobs,
                'failed_jobs': failed_jobs,
                'failure_rate': failure_rate
            }

            if failure_rate >= 0.2:  # 20% failure rate
                status = HealthStatus.CRITICAL
                message = "High job failure rate detected"
            elif failure_rate >= 0.1:  # 10% failure rate
                status = HealthStatus.WARNING
                message = "Elevated job failure rate"
            else:
                status = HealthStatus.HEALTHY
                message = "Job execution is healthy"

            return self._build_check("job_execution", status, message, details)
        except Exception as e:
            return self._build_check(
                "job_execution",
                HealthStatus.CRITICAL,
                f"Job execution check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_platform_connectivity(self) -> "HealthCheck":
        """Check platform connectivity.

        Queries the adapter of every platform referenced by a tracked job.
        A platform whose adapter raises is recorded as 'error', which makes
        the overall result CRITICAL.

        Returns:
            Health check result
        """
        try:
            # Collect the platforms of every tracked job that has a schedule
            platforms = set()
            for job in self.scheduler.job_status.values():
                if 'schedule' in job:
                    platforms.update(job['schedule'].platforms)

            # Check each platform
            platform_status = {}
            for platform in platforms:
                try:
                    adapter = self.scheduler._get_platform_adapter(platform)
                    status = await adapter.get_platform_status()
                    platform_status[platform] = status['status']
                except Exception as e:
                    # Keep the cause visible instead of dropping it silently.
                    self.logger.warning(
                        f"Platform status check failed for {platform}: {str(e)}"
                    )
                    platform_status[platform] = 'error'
                    self.failure_counts['platform_publish'] += 1

            # Check overall status
            if any(status == 'error' for status in platform_status.values()):
                return self._build_check(
                    "platform_connectivity",
                    HealthStatus.CRITICAL,
                    "Platform connectivity issues detected",
                    {'platform_status': platform_status}
                )
            return self._build_check(
                "platform_connectivity",
                HealthStatus.HEALTHY,
                "Platform connectivity is healthy",
                {'platform_status': platform_status}
            )
        except Exception as e:
            return self._build_check(
                "platform_connectivity",
                HealthStatus.CRITICAL,
                f"Platform connectivity check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_resource_usage(self) -> "HealthCheck":
        """Check system resource usage.

        CPU, memory and root-disk usage above 90% is CRITICAL, above 70% is
        WARNING. A missing ``psutil`` installation is reported as CRITICAL
        through the generic failure branch.

        Returns:
            Health check result
        """
        try:
            import psutil

            # Get system metrics.
            # NOTE: per the psutil docs, cpu_percent() without an interval
            # measures since the previous call, so the first reading is 0.0.
            cpu_percent = psutil.cpu_percent()
            memory_percent = psutil.virtual_memory().percent
            disk_percent = psutil.disk_usage('/').percent

            details = {
                'cpu_percent': cpu_percent,
                'memory_percent': memory_percent,
                'disk_percent': disk_percent
            }

            # The worst metric decides the status.
            peak = max(cpu_percent, memory_percent, disk_percent)
            if peak > 90:
                self.failure_counts['resource_usage'] += 1
                return self._build_check(
                    "resource_usage",
                    HealthStatus.CRITICAL,
                    "High resource usage detected",
                    details
                )
            if peak > 70:
                return self._build_check(
                    "resource_usage",
                    HealthStatus.WARNING,
                    "Elevated resource usage",
                    details
                )
            return self._build_check(
                "resource_usage",
                HealthStatus.HEALTHY,
                "Resource usage is healthy",
                details
            )
        except Exception as e:
            return self._build_check(
                "resource_usage",
                HealthStatus.CRITICAL,
                f"Resource usage check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_schedule_conflicts(self) -> "HealthCheck":
        """Check for schedule conflicts.

        Passes the schedules of all PENDING jobs to the scheduler's conflict
        resolver; any reported conflict yields a WARNING.

        Returns:
            Health check result
        """
        try:
            # Get all pending schedules. Jobs without a 'schedule' entry are
            # skipped, matching the guard in _check_platform_connectivity.
            pending_schedules = [
                job['schedule'] for job in self.scheduler.job_status.values()
                if job.get('status') == 'PENDING' and 'schedule' in job
            ]

            # Check for conflicts
            conflicts = await self.scheduler.conflict_resolver.detect_conflicts(
                pending_schedules
            )

            if conflicts:
                self.failure_counts['schedule_conflicts'] += len(conflicts)
                return self._build_check(
                    "schedule_conflicts",
                    HealthStatus.WARNING,
                    "Schedule conflicts detected",
                    {
                        'conflict_count': len(conflicts),
                        'conflicts': [c.dict() for c in conflicts]
                    }
                )
            return self._build_check(
                "schedule_conflicts",
                HealthStatus.HEALTHY,
                "No schedule conflicts detected",
                {'conflict_count': 0}
            )
        except Exception as e:
            return self._build_check(
                "schedule_conflicts",
                HealthStatus.CRITICAL,
                f"Schedule conflict check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_database_connection(self) -> "HealthCheck":
        """Check database connection health.

        Opens a session from ``scheduler.Session`` and runs ``SELECT 1``.

        Returns:
            Health check result
        """
        try:
            # Imported locally, like psutil above, so importing this module
            # does not require SQLAlchemy.
            from sqlalchemy import text

            session = self.scheduler.Session()
            try:
                # text() is accepted by SQLAlchemy 1.x and 2.x; 2.x rejects
                # plain strings.
                session.execute(text("SELECT 1"))
            finally:
                # Always release the connection, even when the query fails.
                session.close()

            return self._build_check(
                "database",
                HealthStatus.HEALTHY,
                "Database connection is healthy",
                {}
            )
        except Exception as e:
            return self._build_check(
                "database",
                HealthStatus.CRITICAL,
                f"Database connection failed: {str(e)}",
                {'error': str(e)}
            )

    async def _check_job_store(self) -> "HealthCheck":
        """Check job store health.

        Compares the number of jobs known to the wrapped scheduler with the
        number of entries in ``scheduler.job_status``; a mismatch is a WARNING.

        Returns:
            Health check result
        """
        try:
            # Get job store statistics
            job_count = len(self.scheduler.scheduler.get_jobs())
            store_size = len(self.scheduler.job_status)
            details = {
                'job_count': job_count,
                'store_size': store_size
            }

            if job_count != store_size:
                return self._build_check(
                    "job_store",
                    HealthStatus.WARNING,
                    "Job store inconsistency detected",
                    details
                )
            return self._build_check(
                "job_store",
                HealthStatus.HEALTHY,
                "Job store is healthy",
                details
            )
        except Exception as e:
            return self._build_check(
                "job_store",
                HealthStatus.CRITICAL,
                f"Job store check failed: {str(e)}",
                {'error': str(e)}
            )

    async def _attempt_recovery(self, check) -> None:
        """Run the recovery action, if any, for one critical check."""
        if check.component == "scheduler":
            if not self.scheduler.scheduler.running:
                await self.scheduler.start()
        elif check.component == "database":
            # These names were never imported at module level, so the original
            # reconnect always failed with a NameError.
            from sqlalchemy import create_engine
            from sqlalchemy.orm import sessionmaker

            # Attempt to reconnect
            self.scheduler.engine.dispose()
            self.scheduler.engine = create_engine(self.scheduler.db_url)
            self.scheduler.Session = sessionmaker(bind=self.scheduler.engine)
        elif check.component == "job_store":
            # Attempt to recover job store
            await self.scheduler._recover_jobs()

    async def _handle_critical_issues(self, critical_checks: List["HealthCheck"]):
        """Handle critical health issues.

        Logs every critical check, then attempts the matching recovery action
        for each one. A failing recovery is logged and does not prevent the
        remaining ones from running. Failure counters are reset only when no
        recovery raised. This method never raises.

        Args:
            critical_checks: List of critical health checks
        """
        try:
            # Log critical issues
            for check in critical_checks:
                self.logger.error(
                    f"Critical health issue in {check.component}: {check.message}"
                )

            # Attempt recovery actions
            all_recovered = True
            for check in critical_checks:
                try:
                    await self._attempt_recovery(check)
                except Exception as e:
                    all_recovered = False
                    self.logger.error(f"Failed to handle critical issues: {str(e)}")

            # Reset failure counters if recovery successful
            if all_recovered:
                self.failure_counts = {k: 0 for k in self.failure_counts}
        except Exception as e:
            self.logger.error(f"Failed to handle critical issues: {str(e)}")

    def get_health_summary(self) -> Dict[str, Any]:
        """Get health check summary.

        Returns:
            Dictionary containing health summary: the overall ``status``
            (worst status among the latest check of each component, or
            "unknown" when no check has been recorded yet), the per-component
            results, a copy of the failure counters and ``last_check``.
        """
        try:
            # History is chronological, so later entries overwrite earlier
            # ones and each component ends up with its latest check.
            latest_checks = {
                check.component: check
                for check in self.health_history
            }

            # Calculate overall status
            statuses = {check.status for check in latest_checks.values()}
            if not latest_checks:
                # Nothing has been checked yet, so health is not known.
                overall_status = HealthStatus.UNKNOWN
            elif HealthStatus.CRITICAL in statuses:
                overall_status = HealthStatus.CRITICAL
            elif HealthStatus.WARNING in statuses:
                overall_status = HealthStatus.WARNING
            else:
                overall_status = HealthStatus.HEALTHY

            return {
                'status': overall_status.value,
                'components': {
                    component: {
                        'status': check.status.value,
                        'message': check.message,
                        'details': check.details,
                        'timestamp': check.timestamp.isoformat()
                    }
                    for component, check in latest_checks.items()
                },
                # Copy so callers cannot mutate the internal counters.
                'failure_counts': dict(self.failure_counts),
                # NOTE(review): this is the time the summary was built, not the
                # time of the newest recorded check — confirm which is wanted.
                'last_check': datetime.utcnow().isoformat()
            }
        except Exception as e:
            self.logger.error(f"Failed to get health summary: {str(e)}")
            return {
                'status': HealthStatus.UNKNOWN.value,
                'error': str(e),
                'last_check': datetime.utcnow().isoformat()
            }