alwrity chatbot assistant, content scheduler, and content repurposing
This commit is contained in:
584
lib/content_scheduler/core/health_checker.py
Normal file
584
lib/content_scheduler/core/health_checker.py
Normal file
@@ -0,0 +1,584 @@
|
||||
"""
|
||||
Schedule health monitoring system.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
from typing import Dict, Any, List, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
from ..utils.error_handling import SchedulingError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
class HealthStatus(Enum):
    """Severity levels a health probe can report.

    The string values are what ends up in serialized health summaries.
    """

    # Component is operating normally.
    HEALTHY = "healthy"
    # Component works but shows degraded metrics.
    WARNING = "warning"
    # Component is failing or the probe itself raised.
    CRITICAL = "critical"
    # Status could not be determined.
    UNKNOWN = "unknown"
|
||||
|
||||
@dataclass
class HealthCheck:
    """Outcome of a single health probe against one component."""

    # Name of the probed component, e.g. "scheduler" or "database".
    component: str
    # Severity reported by the probe.
    status: HealthStatus
    # Human-readable description of the outcome.
    message: str
    # Probe-specific data; failed probes store the exception text under 'error'.
    details: Dict[str, Any]
    # When the result was produced (call sites pass naive datetime.utcnow()).
    timestamp: datetime
|
||||
|
||||
class ScheduleHealthChecker:
    """Schedule health monitoring system.

    Runs a fixed set of probes against a ContentScheduler on a timer, keeps
    the most recent results in memory, and attempts simple recovery actions
    for components that report a critical status.
    """

    # Upper bound on the number of HealthCheck results kept in memory.
    _MAX_HISTORY = 1000

    def __init__(
        self,
        scheduler,
        check_interval: int = 300,  # 5 minutes
        warning_threshold: int = 3,
        critical_threshold: int = 5
    ):
        """Initialize the health checker.

        Args:
            scheduler: ContentScheduler instance to monitor.
            check_interval: Seconds to sleep between health check rounds.
            warning_threshold: Stored on the instance.
            critical_threshold: Stored on the instance.

        NOTE(review): neither threshold is consulted by any probe in this
        class; the probes use their own hard-coded limits. Confirm whether
        external code reads these attributes before removing them.
        """
        self.logger = logger
        self.scheduler = scheduler
        self.check_interval = check_interval
        self.warning_threshold = warning_threshold
        self.critical_threshold = critical_threshold

        # Results of past probes, oldest first, capped at _MAX_HISTORY.
        self.health_history: List[HealthCheck] = []

        # Per-category failure counters exposed through get_health_summary().
        self.failure_counts = {
            'job_execution': 0,
            'platform_publish': 0,
            'schedule_conflicts': 0,
            'resource_usage': 0
        }

        # Background asyncio task running _monitor_health(), if started.
        self.monitoring_task: Optional[asyncio.Task] = None

    @staticmethod
    def _result(
        component: str,
        status: HealthStatus,
        message: str,
        details: Dict[str, Any]
    ) -> HealthCheck:
        """Build a HealthCheck stamped with the current UTC time."""
        return HealthCheck(
            component=component,
            status=status,
            message=message,
            details=details,
            timestamp=datetime.utcnow()
        )

    async def start_monitoring(self):
        """Start the health monitoring system.

        Does nothing when a monitoring task is already running.

        Raises:
            SchedulingError: If the background task could not be created.
        """
        try:
            task = self.monitoring_task
            # A task that already finished (e.g. cancelled from outside)
            # must be replaced, otherwise monitoring could never restart.
            if task is None or task.done():
                self.monitoring_task = asyncio.create_task(self._monitor_health())
                self.logger.info("Health monitoring started")
        except Exception as e:
            self.logger.error(f"Failed to start health monitoring: {str(e)}")
            raise SchedulingError(f"Health monitoring start failed: {str(e)}") from e

    async def stop_monitoring(self):
        """Stop the health monitoring system.

        Requests cancellation of the background task without waiting for it
        to finish. Does nothing when monitoring was never started.

        Raises:
            SchedulingError: If cancelling the task failed.
        """
        try:
            if self.monitoring_task:
                self.monitoring_task.cancel()
                self.monitoring_task = None
                self.logger.info("Health monitoring stopped")
        except Exception as e:
            self.logger.error(f"Failed to stop health monitoring: {str(e)}")
            raise SchedulingError(f"Health monitoring stop failed: {str(e)}") from e

    async def _monitor_health(self):
        """Run health check rounds forever, sleeping between rounds.

        Each round records its results in the history, trims the history to
        its cap, and hands critical results to the recovery handler. Errors
        in a round are logged and the loop continues.
        """
        while True:
            try:
                health_checks = await self._perform_health_checks()

                # Record the round and keep only the newest entries.
                self.health_history.extend(health_checks)
                if len(self.health_history) > self._MAX_HISTORY:
                    self.health_history = self.health_history[-self._MAX_HISTORY:]

                critical_checks = [
                    check for check in health_checks
                    if check.status == HealthStatus.CRITICAL
                ]
                if critical_checks:
                    await self._handle_critical_issues(critical_checks)

                await asyncio.sleep(self.check_interval)

            except asyncio.CancelledError:
                # Cancellation is the normal way to stop the loop.
                break
            except Exception as e:
                self.logger.error(f"Health monitoring error: {str(e)}")
                await asyncio.sleep(self.check_interval)

    async def _perform_health_checks(self) -> List[HealthCheck]:
        """Perform all health checks, one after another.

        Returns:
            One result per probe in a fixed order. If a probe raises
            unexpectedly, a single critical "health_checker" result is
            returned instead of the partial list.
        """
        probes = (
            self._check_scheduler_status,
            self._check_job_execution,
            self._check_platform_connectivity,
            self._check_resource_usage,
            self._check_schedule_conflicts,
            self._check_database_connection,
            self._check_job_store,
        )
        try:
            checks = []
            for probe in probes:
                checks.append(await probe())
            return checks
        except Exception as e:
            self.logger.error(f"Health check failed: {str(e)}")
            return [
                self._result(
                    "health_checker",
                    HealthStatus.CRITICAL,
                    f"Health check system error: {str(e)}",
                    {'error': str(e)}
                )
            ]

    async def _check_scheduler_status(self) -> HealthCheck:
        """Check whether the underlying scheduler is running.

        Returns:
            HEALTHY when running, CRITICAL when stopped or when the check
            itself raised. Successful checks include the job count.
        """
        try:
            is_running = self.scheduler.scheduler.running
            details = {'job_count': len(self.scheduler.scheduler.get_jobs())}

            if not is_running:
                return self._result(
                    "scheduler", HealthStatus.CRITICAL,
                    "Scheduler is not running", details
                )
            return self._result(
                "scheduler", HealthStatus.HEALTHY,
                "Scheduler is running", details
            )
        except Exception as e:
            return self._result(
                "scheduler", HealthStatus.CRITICAL,
                f"Scheduler check failed: {str(e)}", {'error': str(e)}
            )

    async def _check_job_execution(self) -> HealthCheck:
        """Check the failure rate of jobs created in the last 24 hours.

        Also overwrites the 'job_execution' failure counter with the number
        of failed jobs in that window.

        Returns:
            CRITICAL at a failure rate of 20% or more, WARNING at 10% or
            more, HEALTHY otherwise; CRITICAL when the check itself raised.
        """
        try:
            now = datetime.utcnow()
            window = timedelta(hours=24)
            recent_jobs = [
                job for job in self.scheduler.job_status.values()
                if now - job['created_at'] < window
            ]

            total_jobs = len(recent_jobs)
            failed_jobs = sum(1 for job in recent_jobs if job['status'] == 'FAILED')
            # No recent jobs counts as a zero failure rate.
            failure_rate = failed_jobs / total_jobs if total_jobs > 0 else 0

            self.failure_counts['job_execution'] = failed_jobs

            details = {
                'total_jobs': total_jobs,
                'failed_jobs': failed_jobs,
                'failure_rate': failure_rate
            }

            if failure_rate >= 0.2:  # 20% failure rate
                status, message = HealthStatus.CRITICAL, "High job failure rate detected"
            elif failure_rate >= 0.1:  # 10% failure rate
                status, message = HealthStatus.WARNING, "Elevated job failure rate"
            else:
                status, message = HealthStatus.HEALTHY, "Job execution is healthy"

            return self._result("job_execution", status, message, details)
        except Exception as e:
            return self._result(
                "job_execution", HealthStatus.CRITICAL,
                f"Job execution check failed: {str(e)}", {'error': str(e)}
            )

    async def _check_platform_connectivity(self) -> HealthCheck:
        """Check the status of every platform referenced by tracked jobs.

        A platform whose adapter lookup or status call raises is recorded as
        'error' and increments the 'platform_publish' failure counter.

        Returns:
            CRITICAL when any platform is in 'error' or the check itself
            raised, HEALTHY otherwise.
        """
        try:
            # Collect the distinct platforms of all jobs that carry a schedule.
            platforms = set()
            for job in self.scheduler.job_status.values():
                if 'schedule' in job:
                    platforms.update(job['schedule'].platforms)

            platform_status = {}
            for platform in platforms:
                try:
                    adapter = self.scheduler._get_platform_adapter(platform)
                    status = await adapter.get_platform_status()
                    platform_status[platform] = status['status']
                except Exception as e:
                    platform_status[platform] = 'error'
                    self.failure_counts['platform_publish'] += 1
                    # Keep the cause visible instead of dropping it silently.
                    self.logger.warning(
                        f"Platform status check failed for {platform}: {str(e)}"
                    )

            details = {'platform_status': platform_status}
            if 'error' in platform_status.values():
                return self._result(
                    "platform_connectivity", HealthStatus.CRITICAL,
                    "Platform connectivity issues detected", details
                )
            return self._result(
                "platform_connectivity", HealthStatus.HEALTHY,
                "Platform connectivity is healthy", details
            )
        except Exception as e:
            return self._result(
                "platform_connectivity", HealthStatus.CRITICAL,
                f"Platform connectivity check failed: {str(e)}", {'error': str(e)}
            )

    async def _check_resource_usage(self) -> HealthCheck:
        """Check CPU, memory and root-disk usage via psutil.

        Increments the 'resource_usage' failure counter when usage is
        critical.

        Returns:
            CRITICAL when any metric exceeds 90%, WARNING above 70%, HEALTHY
            otherwise; CRITICAL when the check itself raised (including when
            psutil cannot be imported).
        """
        try:
            # Imported lazily so the module loads without psutil installed.
            import psutil

            details = {
                'cpu_percent': psutil.cpu_percent(),
                'memory_percent': psutil.virtual_memory().percent,
                'disk_percent': psutil.disk_usage('/').percent
            }
            # The thresholds apply to whichever metric is highest.
            peak = max(details.values())

            if peak > 90:
                self.failure_counts['resource_usage'] += 1
                status, message = HealthStatus.CRITICAL, "High resource usage detected"
            elif peak > 70:
                status, message = HealthStatus.WARNING, "Elevated resource usage"
            else:
                status, message = HealthStatus.HEALTHY, "Resource usage is healthy"

            return self._result("resource_usage", status, message, details)
        except Exception as e:
            return self._result(
                "resource_usage", HealthStatus.CRITICAL,
                f"Resource usage check failed: {str(e)}", {'error': str(e)}
            )

    async def _check_schedule_conflicts(self) -> HealthCheck:
        """Check pending schedules for conflicts.

        Adds the number of detected conflicts to the 'schedule_conflicts'
        failure counter.

        Returns:
            WARNING when the conflict resolver reports conflicts, HEALTHY
            when it reports none, CRITICAL when the check itself raised.
        """
        try:
            pending_schedules = [
                job['schedule'] for job in self.scheduler.job_status.values()
                if job['status'] == 'PENDING'
            ]

            conflicts = await self.scheduler.conflict_resolver.detect_conflicts(
                pending_schedules
            )

            if conflicts:
                self.failure_counts['schedule_conflicts'] += len(conflicts)
                return self._result(
                    "schedule_conflicts", HealthStatus.WARNING,
                    "Schedule conflicts detected",
                    {
                        'conflict_count': len(conflicts),
                        'conflicts': [c.dict() for c in conflicts]
                    }
                )
            return self._result(
                "schedule_conflicts", HealthStatus.HEALTHY,
                "No schedule conflicts detected", {'conflict_count': 0}
            )
        except Exception as e:
            return self._result(
                "schedule_conflicts", HealthStatus.CRITICAL,
                f"Schedule conflict check failed: {str(e)}", {'error': str(e)}
            )

    async def _check_database_connection(self) -> HealthCheck:
        """Check database connectivity by running a trivial query.

        Returns:
            HEALTHY when the query succeeds, CRITICAL when opening the
            session or running the query raised.
        """
        try:
            session = self.scheduler.Session()
            try:
                # NOTE(review): SQLAlchemy 2.x rejects plain-string SQL and
                # requires text("SELECT 1"); confirm the pinned version.
                session.execute("SELECT 1")
            finally:
                # Always release the session, also when the query fails.
                session.close()

            return self._result(
                "database", HealthStatus.HEALTHY,
                "Database connection is healthy", {}
            )
        except Exception as e:
            return self._result(
                "database", HealthStatus.CRITICAL,
                f"Database connection failed: {str(e)}", {'error': str(e)}
            )

    async def _check_job_store(self) -> HealthCheck:
        """Compare the scheduler's job count with the tracked job statuses.

        Returns:
            WARNING when the two counts differ, HEALTHY when they match,
            CRITICAL when the check itself raised.
        """
        try:
            details = {
                'job_count': len(self.scheduler.scheduler.get_jobs()),
                'store_size': len(self.scheduler.job_status)
            }

            if details['job_count'] != details['store_size']:
                return self._result(
                    "job_store", HealthStatus.WARNING,
                    "Job store inconsistency detected", details
                )
            return self._result(
                "job_store", HealthStatus.HEALTHY,
                "Job store is healthy", details
            )
        except Exception as e:
            return self._result(
                "job_store", HealthStatus.CRITICAL,
                f"Job store check failed: {str(e)}", {'error': str(e)}
            )

    async def _recover_component(self, component: str):
        """Run the recovery action for one component, if one exists."""
        if component == "scheduler":
            if not self.scheduler.scheduler.running:
                await self.scheduler.start()
        elif component == "database":
            # Dropping the pooled connections is enough to force fresh
            # connections: a disposed SQLAlchemy engine stays usable and
            # rebuilds its pool on next use, so the existing Session factory
            # keeps working. (The previous code tried to rebuild the engine
            # with create_engine/sessionmaker, which are not imported here.)
            self.scheduler.engine.dispose()
        elif component == "job_store":
            await self.scheduler._recover_jobs()

    async def _handle_critical_issues(self, critical_checks: List[HealthCheck]):
        """Log critical health issues and attempt recovery.

        Recovery is attempted per check so that one failing recovery does
        not prevent the others. The failure counters are reset only when no
        recovery attempt raised. This method never raises an Exception.

        Args:
            critical_checks: Health checks whose status is CRITICAL.
        """
        try:
            for check in critical_checks:
                self.logger.error(
                    f"Critical health issue in {check.component}: {check.message}"
                )

            all_recovered = True
            for check in critical_checks:
                try:
                    await self._recover_component(check.component)
                except Exception as e:
                    all_recovered = False
                    self.logger.error(f"Failed to handle critical issues: {str(e)}")

            # Reset failure counters if recovery successful
            if all_recovered:
                self.failure_counts = {k: 0 for k in self.failure_counts}
        except Exception as e:
            self.logger.error(f"Failed to handle critical issues: {str(e)}")

    def get_health_summary(self) -> Dict[str, Any]:
        """Get health check summary.

        Returns:
            Dictionary with the overall 'status', the latest result per
            component under 'components', a copy of the failure counters,
            and 'last_check' as an ISO timestamp of the most recent recorded
            check (the current time when nothing has been recorded yet).
            When building the summary fails, a dictionary with status
            "unknown" and the error text is returned instead.
        """
        try:
            # History is oldest-first, so later entries overwrite earlier
            # ones and each component ends up with its newest result.
            latest_checks = {
                check.component: check for check in self.health_history
            }

            # NOTE(review): with an empty history this reports "healthy"
            # even though no check has run yet.
            statuses = {check.status for check in latest_checks.values()}
            if HealthStatus.CRITICAL in statuses:
                overall_status = HealthStatus.CRITICAL
            elif HealthStatus.WARNING in statuses:
                overall_status = HealthStatus.WARNING
            else:
                overall_status = HealthStatus.HEALTHY

            if self.health_history:
                last_check = self.health_history[-1].timestamp
            else:
                last_check = datetime.utcnow()

            return {
                'status': overall_status.value,
                'components': {
                    component: {
                        'status': check.status.value,
                        'message': check.message,
                        'details': check.details,
                        'timestamp': check.timestamp.isoformat()
                    }
                    for component, check in latest_checks.items()
                },
                # Copy so callers cannot mutate the live counters.
                'failure_counts': dict(self.failure_counts),
                'last_check': last_check.isoformat()
            }
        except Exception as e:
            self.logger.error(f"Failed to get health summary: {str(e)}")
            return {
                'status': HealthStatus.UNKNOWN.value,
                'error': str(e),
                'last_check': datetime.utcnow().isoformat()
            }
|
||||
Reference in New Issue
Block a user