alwrity chatbot assistant, content scheduler, and content repurposing

2025-06-02 00:00:18 +05:30
parent 889021c078
commit 5ca2fd5977
69 changed files with 13952 additions and 3279 deletions
--- a/lib/content_scheduler/core/health_checker.py
+++ b/lib/content_scheduler/core/health_checker.py
@@ -0,0 +1,584 @@
+"""
+Schedule health monitoring system.
+"""
+
+import logging
+import asyncio
+from typing import Dict, Any, List, Optional
+from datetime import datetime, timedelta
+from dataclasses import dataclass
+from enum import Enum
+
+from ..utils.error_handling import SchedulingError
+
+# Module-level logger; ScheduleHealthChecker instances reuse it as self.logger.
+logger = logging.getLogger(__name__)
+# NOTE(review): setting the level at import time replaces any level the
+# application configured for this logger name beforehand - confirm intended.
+logger.setLevel(logging.INFO)
+
+class HealthStatus(Enum):
+    """Health check status.
+
+    The string values are what ``get_health_summary`` exposes through
+    ``.value`` in the summary dictionary.
+    """
+    # Component passed its check.
+    HEALTHY = "healthy"
+    # Degraded but not failing (e.g. elevated failure rate or resource usage).
+    WARNING = "warning"
+    # Failing; the monitor loop hands these results to _handle_critical_issues.
+    CRITICAL = "critical"
+    # Reported by get_health_summary when the summary itself cannot be built.
+    UNKNOWN = "unknown"
+
+@dataclass
+class HealthCheck:
+    """Result of a single health check run against one component."""
+    # Name of the checked component, e.g. "scheduler", "database", "job_store".
+    component: str
+    # Severity assigned by the check.
+    status: HealthStatus
+    # Human-readable description of the outcome.
+    message: str
+    # Check-specific data; failed checks store the exception text under 'error'.
+    details: Dict[str, Any]
+    # Time the result was created; the checks in this module use the naive
+    # UTC value returned by datetime.utcnow().
+    timestamp: datetime
+
+class ScheduleHealthChecker:
+    """Schedule health monitoring system."""
+    
+    def __init__(
+        self,
+        scheduler,
+        check_interval: int = 300,  # 5 minutes
+        warning_threshold: int = 3,
+        critical_threshold: int = 5
+    ):
+        """Initialize the health checker.
+        
+        Args:
+            scheduler: ContentScheduler instance
+            check_interval: Health check interval in seconds
+            warning_threshold: Number of failures before warning
+            critical_threshold: Number of failures before critical
+        """
+        self.logger = logger
+        self.scheduler = scheduler
+        self.check_interval = check_interval
+        self.warning_threshold = warning_threshold
+        self.critical_threshold = critical_threshold
+        
+        # Initialize health check history
+        self.health_history = []
+        
+        # Initialize failure counters
+        self.failure_counts = {
+            'job_execution': 0,
+            'platform_publish': 0,
+            'schedule_conflicts': 0,
+            'resource_usage': 0
+        }
+        
+        # Initialize monitoring task
+        self.monitoring_task = None
+    
+    async def start_monitoring(self):
+        """Start the health monitoring system."""
+        try:
+            if not self.monitoring_task:
+                self.monitoring_task = asyncio.create_task(self._monitor_health())
+                self.logger.info("Health monitoring started")
+        except Exception as e:
+            self.logger.error(f"Failed to start health monitoring: {str(e)}")
+            raise SchedulingError(f"Health monitoring start failed: {str(e)}")
+    
+    async def stop_monitoring(self):
+        """Stop the health monitoring system."""
+        try:
+            if self.monitoring_task:
+                self.monitoring_task.cancel()
+                self.monitoring_task = None
+                self.logger.info("Health monitoring stopped")
+        except Exception as e:
+            self.logger.error(f"Failed to stop health monitoring: {str(e)}")
+            raise SchedulingError(f"Health monitoring stop failed: {str(e)}")
+    
+    async def _monitor_health(self):
+        """Monitor system health periodically."""
+        while True:
+            try:
+                # Perform health checks
+                health_checks = await self._perform_health_checks()
+                
+                # Update health history
+                self.health_history.extend(health_checks)
+                
+                # Trim history if too long
+                if len(self.health_history) > 1000:
+                    self.health_history = self.health_history[-1000:]
+                
+                # Check for critical issues
+                critical_checks = [
+                    check for check in health_checks
+                    if check.status == HealthStatus.CRITICAL
+                ]
+                
+                if critical_checks:
+                    await self._handle_critical_issues(critical_checks)
+                
+                # Wait for next check
+                await asyncio.sleep(self.check_interval)
+                
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self.logger.error(f"Health monitoring error: {str(e)}")
+                await asyncio.sleep(self.check_interval)
+    
+    async def _perform_health_checks(self) -> List[HealthCheck]:
+        """Perform all health checks.
+        
+        Returns:
+            List of health check results
+        """
+        checks = []
+        
+        try:
+            # Check scheduler status
+            checks.append(await self._check_scheduler_status())
+            
+            # Check job execution
+            checks.append(await self._check_job_execution())
+            
+            # Check platform connectivity
+            checks.append(await self._check_platform_connectivity())
+            
+            # Check resource usage
+            checks.append(await self._check_resource_usage())
+            
+            # Check schedule conflicts
+            checks.append(await self._check_schedule_conflicts())
+            
+            # Check database connection
+            checks.append(await self._check_database_connection())
+            
+            # Check job store
+            checks.append(await self._check_job_store())
+            
+            return checks
+            
+        except Exception as e:
+            self.logger.error(f"Health check failed: {str(e)}")
+            return [
+                HealthCheck(
+                    component="health_checker",
+                    status=HealthStatus.CRITICAL,
+                    message=f"Health check system error: {str(e)}",
+                    details={'error': str(e)},
+                    timestamp=datetime.utcnow()
+                )
+            ]
+    
+    async def _check_scheduler_status(self) -> HealthCheck:
+        """Check scheduler status.
+        
+        Returns:
+            Health check result
+        """
+        try:
+            is_running = self.scheduler.scheduler.running
+            job_count = len(self.scheduler.scheduler.get_jobs())
+            
+            if not is_running:
+                return HealthCheck(
+                    component="scheduler",
+                    status=HealthStatus.CRITICAL,
+                    message="Scheduler is not running",
+                    details={'job_count': job_count},
+                    timestamp=datetime.utcnow()
+                )
+            
+            return HealthCheck(
+                component="scheduler",
+                status=HealthStatus.HEALTHY,
+                message="Scheduler is running",
+                details={'job_count': job_count},
+                timestamp=datetime.utcnow()
+            )
+            
+        except Exception as e:
+            return HealthCheck(
+                component="scheduler",
+                status=HealthStatus.CRITICAL,
+                message=f"Scheduler check failed: {str(e)}",
+                details={'error': str(e)},
+                timestamp=datetime.utcnow()
+            )
+    
+    async def _check_job_execution(self) -> HealthCheck:
+        """Check job execution health.
+        
+        Returns:
+            Health check result
+        """
+        try:
+            # Get recent job history
+            recent_jobs = [
+                job for job in self.scheduler.job_status.values()
+                if datetime.utcnow() - job['created_at'] < timedelta(hours=24)
+            ]
+            
+            # Calculate failure rate
+            total_jobs = len(recent_jobs)
+            failed_jobs = len([
+                job for job in recent_jobs
+                if job['status'] == 'FAILED'
+            ])
+            
+            failure_rate = failed_jobs / total_jobs if total_jobs > 0 else 0
+            
+            # Update failure counter
+            self.failure_counts['job_execution'] = failed_jobs
+            
+            if failure_rate >= 0.2:  # 20% failure rate
+                return HealthCheck(
+                    component="job_execution",
+                    status=HealthStatus.CRITICAL,
+                    message="High job failure rate detected",
+                    details={
+                        'total_jobs': total_jobs,
+                        'failed_jobs': failed_jobs,
+                        'failure_rate': failure_rate
+                    },
+                    timestamp=datetime.utcnow()
+                )
+            elif failure_rate >= 0.1:  # 10% failure rate
+                return HealthCheck(
+                    component="job_execution",
+                    status=HealthStatus.WARNING,
+                    message="Elevated job failure rate",
+                    details={
+                        'total_jobs': total_jobs,
+                        'failed_jobs': failed_jobs,
+                        'failure_rate': failure_rate
+                    },
+                    timestamp=datetime.utcnow()
+                )
+            
+            return HealthCheck(
+                component="job_execution",
+                status=HealthStatus.HEALTHY,
+                message="Job execution is healthy",
+                details={
+                    'total_jobs': total_jobs,
+                    'failed_jobs': failed_jobs,
+                    'failure_rate': failure_rate
+                },
+                timestamp=datetime.utcnow()
+            )
+            
+        except Exception as e:
+            return HealthCheck(
+                component="job_execution",
+                status=HealthStatus.CRITICAL,
+                message=f"Job execution check failed: {str(e)}",
+                details={'error': str(e)},
+                timestamp=datetime.utcnow()
+            )
+    
+    async def _check_platform_connectivity(self) -> HealthCheck:
+        """Check platform connectivity.
+        
+        Returns:
+            Health check result
+        """
+        try:
+            # Get unique platforms from recent jobs
+            platforms = set()
+            for job in self.scheduler.job_status.values():
+                if 'schedule' in job:
+                    platforms.update(job['schedule'].platforms)
+            
+            # Check each platform
+            platform_status = {}
+            for platform in platforms:
+                try:
+                    adapter = self.scheduler._get_platform_adapter(platform)
+                    # Try to get platform status
+                    status = await adapter.get_platform_status()
+                    platform_status[platform] = status['status']
+                except Exception as e:
+                    platform_status[platform] = 'error'
+                    self.failure_counts['platform_publish'] += 1
+            
+            # Check overall status
+            if any(status == 'error' for status in platform_status.values()):
+                return HealthCheck(
+                    component="platform_connectivity",
+                    status=HealthStatus.CRITICAL,
+                    message="Platform connectivity issues detected",
+                    details={'platform_status': platform_status},
+                    timestamp=datetime.utcnow()
+                )
+            
+            return HealthCheck(
+                component="platform_connectivity",
+                status=HealthStatus.HEALTHY,
+                message="Platform connectivity is healthy",
+                details={'platform_status': platform_status},
+                timestamp=datetime.utcnow()
+            )
+            
+        except Exception as e:
+            return HealthCheck(
+                component="platform_connectivity",
+                status=HealthStatus.CRITICAL,
+                message=f"Platform connectivity check failed: {str(e)}",
+                details={'error': str(e)},
+                timestamp=datetime.utcnow()
+            )
+    
+    async def _check_resource_usage(self) -> HealthCheck:
+        """Check system resource usage.
+        
+        Returns:
+            Health check result
+        """
+        try:
+            import psutil
+            
+            # Get system metrics
+            cpu_percent = psutil.cpu_percent()
+            memory_percent = psutil.virtual_memory().percent
+            disk_percent = psutil.disk_usage('/').percent
+            
+            # Check thresholds
+            if cpu_percent > 90 or memory_percent > 90 or disk_percent > 90:
+                self.failure_counts['resource_usage'] += 1
+                return HealthCheck(
+                    component="resource_usage",
+                    status=HealthStatus.CRITICAL,
+                    message="High resource usage detected",
+                    details={
+                        'cpu_percent': cpu_percent,
+                        'memory_percent': memory_percent,
+                        'disk_percent': disk_percent
+                    },
+                    timestamp=datetime.utcnow()
+                )
+            elif cpu_percent > 70 or memory_percent > 70 or disk_percent > 70:
+                return HealthCheck(
+                    component="resource_usage",
+                    status=HealthStatus.WARNING,
+                    message="Elevated resource usage",
+                    details={
+                        'cpu_percent': cpu_percent,
+                        'memory_percent': memory_percent,
+                        'disk_percent': disk_percent
+                    },
+                    timestamp=datetime.utcnow()
+                )
+            
+            return HealthCheck(
+                component="resource_usage",
+                status=HealthStatus.HEALTHY,
+                message="Resource usage is healthy",
+                details={
+                    'cpu_percent': cpu_percent,
+                    'memory_percent': memory_percent,
+                    'disk_percent': disk_percent
+                },
+                timestamp=datetime.utcnow()
+            )
+            
+        except Exception as e:
+            return HealthCheck(
+                component="resource_usage",
+                status=HealthStatus.CRITICAL,
+                message=f"Resource usage check failed: {str(e)}",
+                details={'error': str(e)},
+                timestamp=datetime.utcnow()
+            )
+    
+    async def _check_schedule_conflicts(self) -> HealthCheck:
+        """Check for schedule conflicts.
+        
+        Returns:
+            Health check result
+        """
+        try:
+            # Get all pending schedules
+            pending_schedules = [
+                job['schedule'] for job in self.scheduler.job_status.values()
+                if job['status'] == 'PENDING'
+            ]
+            
+            # Check for conflicts
+            conflicts = await self.scheduler.conflict_resolver.detect_conflicts(
+                pending_schedules
+            )
+            
+            if conflicts:
+                self.failure_counts['schedule_conflicts'] += len(conflicts)
+                return HealthCheck(
+                    component="schedule_conflicts",
+                    status=HealthStatus.WARNING,
+                    message="Schedule conflicts detected",
+                    details={
+                        'conflict_count': len(conflicts),
+                        'conflicts': [c.dict() for c in conflicts]
+                    },
+                    timestamp=datetime.utcnow()
+                )
+            
+            return HealthCheck(
+                component="schedule_conflicts",
+                status=HealthStatus.HEALTHY,
+                message="No schedule conflicts detected",
+                details={'conflict_count': 0},
+                timestamp=datetime.utcnow()
+            )
+            
+        except Exception as e:
+            return HealthCheck(
+                component="schedule_conflicts",
+                status=HealthStatus.CRITICAL,
+                message=f"Schedule conflict check failed: {str(e)}",
+                details={'error': str(e)},
+                timestamp=datetime.utcnow()
+            )
+    
+    async def _check_database_connection(self) -> HealthCheck:
+        """Check database connection health.
+        
+        Returns:
+            Health check result
+        """
+        try:
+            session = self.scheduler.Session()
+            session.execute("SELECT 1")
+            session.close()
+            
+            return HealthCheck(
+                component="database",
+                status=HealthStatus.HEALTHY,
+                message="Database connection is healthy",
+                details={},
+                timestamp=datetime.utcnow()
+            )
+            
+        except Exception as e:
+            return HealthCheck(
+                component="database",
+                status=HealthStatus.CRITICAL,
+                message=f"Database connection failed: {str(e)}",
+                details={'error': str(e)},
+                timestamp=datetime.utcnow()
+            )
+    
+    async def _check_job_store(self) -> HealthCheck:
+        """Check job store health.
+        
+        Returns:
+            Health check result
+        """
+        try:
+            # Get job store statistics
+            job_count = len(self.scheduler.scheduler.get_jobs())
+            store_size = len(self.scheduler.job_status)
+            
+            if job_count != store_size:
+                return HealthCheck(
+                    component="job_store",
+                    status=HealthStatus.WARNING,
+                    message="Job store inconsistency detected",
+                    details={
+                        'job_count': job_count,
+                        'store_size': store_size
+                    },
+                    timestamp=datetime.utcnow()
+                )
+            
+            return HealthCheck(
+                component="job_store",
+                status=HealthStatus.HEALTHY,
+                message="Job store is healthy",
+                details={
+                    'job_count': job_count,
+                    'store_size': store_size
+                },
+                timestamp=datetime.utcnow()
+            )
+            
+        except Exception as e:
+            return HealthCheck(
+                component="job_store",
+                status=HealthStatus.CRITICAL,
+                message=f"Job store check failed: {str(e)}",
+                details={'error': str(e)},
+                timestamp=datetime.utcnow()
+            )
+    
+    async def _handle_critical_issues(self, critical_checks: List[HealthCheck]):
+        """Handle critical health issues.
+        
+        Args:
+            critical_checks: List of critical health checks
+        """
+        try:
+            # Log critical issues
+            for check in critical_checks:
+                self.logger.error(
+                    f"Critical health issue in {check.component}: {check.message}"
+                )
+            
+            # Attempt recovery actions
+            for check in critical_checks:
+                if check.component == "scheduler" and not self.scheduler.scheduler.running:
+                    await self.scheduler.start()
+                
+                elif check.component == "database":
+                    # Attempt to reconnect
+                    self.scheduler.engine.dispose()
+                    self.scheduler.engine = create_engine(self.scheduler.db_url)
+                    self.scheduler.Session = sessionmaker(bind=self.scheduler.engine)
+                
+                elif check.component == "job_store":
+                    # Attempt to recover job store
+                    await self.scheduler._recover_jobs()
+            
+            # Reset failure counters if recovery successful
+            self.failure_counts = {k: 0 for k in self.failure_counts}
+            
+        except Exception as e:
+            self.logger.error(f"Failed to handle critical issues: {str(e)}")
+    
+    def get_health_summary(self) -> Dict[str, Any]:
+        """Get health check summary.
+        
+        Returns:
+            Dictionary containing health summary
+        """
+        try:
+            # Get latest health checks
+            latest_checks = {
+                check.component: check
+                for check in self.health_history[-len(self.health_history):]
+            }
+            
+            # Calculate overall status
+            if any(check.status == HealthStatus.CRITICAL for check in latest_checks.values()):
+                overall_status = HealthStatus.CRITICAL
+            elif any(check.status == HealthStatus.WARNING for check in latest_checks.values()):
+                overall_status = HealthStatus.WARNING
+            else:
+                overall_status = HealthStatus.HEALTHY
+            
+            return {
+                'status': overall_status.value,
+                'components': {
+                    component: {
+                        'status': check.status.value,
+                        'message': check.message,
+                        'details': check.details,
+                        'timestamp': check.timestamp.isoformat()
+                    }
+                    for component, check in latest_checks.items()
+                },
+                'failure_counts': self.failure_counts,
+                'last_check': datetime.utcnow().isoformat()
+            }
+            
+        except Exception as e:
+            self.logger.error(f"Failed to get health summary: {str(e)}")
+            return {
+                'status': HealthStatus.UNKNOWN.value,
+                'error': str(e),
+                'last_check': datetime.utcnow().isoformat()
+            }