Files
ALwrity/backend/services/scheduler/core/exception_handler.py

396 lines
14 KiB
Python

"""
Comprehensive Exception Handling and Logging for Task Scheduler
Provides robust error handling, logging, and monitoring for the scheduler system.
"""
import traceback
import sys
from datetime import datetime
from typing import Dict, Any, Optional, Union
from enum import Enum
from sqlalchemy.orm import Session
from sqlalchemy.exc import SQLAlchemyError, OperationalError, IntegrityError
from utils.logger_utils import get_service_logger
logger = get_service_logger("scheduler_exception_handler")
class SchedulerErrorType(Enum):
"""Error types for scheduler system."""
DATABASE_ERROR = "database_error"
TASK_EXECUTION_ERROR = "task_execution_error"
TASK_LOADER_ERROR = "task_loader_error"
SCHEDULER_CONFIG_ERROR = "scheduler_config_error"
RETRY_ERROR = "retry_error"
CONCURRENCY_ERROR = "concurrency_error"
TIMEOUT_ERROR = "timeout_error"
class SchedulerErrorSeverity(Enum):
"""Severity levels for scheduler errors."""
LOW = "low"
MEDIUM = "medium"
HIGH = "high"
CRITICAL = "critical"
class SchedulerException(Exception):
"""Base exception for scheduler system errors."""
def __init__(
self,
message: str,
error_type: SchedulerErrorType,
severity: SchedulerErrorSeverity = SchedulerErrorSeverity.MEDIUM,
user_id: Optional[int] = None,
task_id: Optional[int] = None,
task_type: Optional[str] = None,
context: Optional[Dict[str, Any]] = None,
original_error: Optional[Exception] = None
):
self.message = message
self.error_type = error_type
self.severity = severity
self.user_id = user_id
self.task_id = task_id
self.task_type = task_type
self.context = context or {}
self.original_error = original_error
self.timestamp = datetime.utcnow()
# Capture stack trace if original error provided
self.stack_trace = None
if self.original_error:
try:
exc_type, exc_value, exc_traceback = sys.exc_info()
if exc_traceback:
self.stack_trace = ''.join(traceback.format_exception(
exc_type, exc_value, exc_traceback
))
else:
self.stack_trace = traceback.format_exception(
type(self.original_error),
self.original_error,
self.original_error.__traceback__
)
except Exception:
self.stack_trace = str(self.original_error)
super().__init__(message)
def to_dict(self) -> Dict[str, Any]:
"""Convert exception to dictionary for logging/storage."""
return {
"message": self.message,
"error_type": self.error_type.value,
"severity": self.severity.value,
"user_id": self.user_id,
"task_id": self.task_id,
"task_type": self.task_type,
"context": self.context,
"timestamp": self.timestamp.isoformat() if isinstance(self.timestamp, datetime) else self.timestamp,
"original_error": str(self.original_error) if self.original_error else None,
"stack_trace": self.stack_trace
}
def __str__(self):
return f"[{self.error_type.value}] {self.message}"
class DatabaseError(SchedulerException):
"""Exception raised for database-related errors."""
def __init__(
self,
message: str,
user_id: Optional[int] = None,
task_id: Optional[int] = None,
context: Dict[str, Any] = None,
original_error: Exception = None
):
super().__init__(
message=message,
error_type=SchedulerErrorType.DATABASE_ERROR,
severity=SchedulerErrorSeverity.CRITICAL,
user_id=user_id,
task_id=task_id,
context=context or {},
original_error=original_error
)
class TaskExecutionError(SchedulerException):
"""Exception raised for task execution failures."""
def __init__(
self,
message: str,
user_id: Optional[int] = None,
task_id: Optional[int] = None,
task_type: Optional[str] = None,
retry_count: int = 0,
execution_time_ms: Optional[int] = None,
context: Dict[str, Any] = None,
original_error: Exception = None
):
context = context or {}
context.update({
"retry_count": retry_count,
"execution_time_ms": execution_time_ms
})
super().__init__(
message=message,
error_type=SchedulerErrorType.TASK_EXECUTION_ERROR,
severity=SchedulerErrorSeverity.HIGH,
user_id=user_id,
task_id=task_id,
task_type=task_type,
context=context,
original_error=original_error
)
class TaskLoaderError(SchedulerException):
"""Exception raised for task loading failures."""
def __init__(
self,
message: str,
task_type: Optional[str] = None,
user_id: Optional[int] = None,
context: Dict[str, Any] = None,
original_error: Exception = None
):
super().__init__(
message=message,
error_type=SchedulerErrorType.TASK_LOADER_ERROR,
severity=SchedulerErrorSeverity.HIGH,
user_id=user_id,
task_type=task_type,
context=context or {},
original_error=original_error
)
class SchedulerConfigError(SchedulerException):
"""Exception raised for scheduler configuration errors."""
def __init__(
self,
message: str,
context: Dict[str, Any] = None,
original_error: Exception = None
):
super().__init__(
message=message,
error_type=SchedulerErrorType.SCHEDULER_CONFIG_ERROR,
severity=SchedulerErrorSeverity.CRITICAL,
context=context or {},
original_error=original_error
)
class SchedulerExceptionHandler:
"""Comprehensive exception handler for the scheduler system."""
def __init__(self, db: Session = None):
self.db = db
self.logger = logger
def handle_exception(
self,
error: Union[Exception, SchedulerException],
context: Dict[str, Any] = None,
log_level: str = "error"
) -> Dict[str, Any]:
"""Handle and log scheduler exceptions."""
context = context or {}
# Convert regular exceptions to SchedulerException
if not isinstance(error, SchedulerException):
error = SchedulerException(
message=str(error),
error_type=self._classify_error(error),
severity=self._determine_severity(error),
context=context,
original_error=error
)
# Log the error
error_data = error.to_dict()
error_data.update(context)
log_message = f"Scheduler Error: {error.message}"
if log_level == "critical" or error.severity == SchedulerErrorSeverity.CRITICAL:
self.logger.critical(log_message, extra={"error_data": error_data})
elif log_level == "error" or error.severity == SchedulerErrorSeverity.HIGH:
self.logger.error(log_message, extra={"error_data": error_data})
elif log_level == "warning" or error.severity == SchedulerErrorSeverity.MEDIUM:
self.logger.warning(log_message, extra={"error_data": error_data})
else:
self.logger.info(log_message, extra={"error_data": error_data})
# Store critical errors in database for alerting
if error.severity in [SchedulerErrorSeverity.HIGH, SchedulerErrorSeverity.CRITICAL]:
self._store_error_alert(error)
# Return formatted error response
return self._format_error_response(error)
def _classify_error(self, error: Exception) -> SchedulerErrorType:
"""Classify an exception into a scheduler error type."""
error_str = str(error).lower()
error_type_name = type(error).__name__.lower()
# Database errors
if isinstance(error, (SQLAlchemyError, OperationalError, IntegrityError)):
return SchedulerErrorType.DATABASE_ERROR
if "database" in error_str or "sql" in error_type_name or "connection" in error_str:
return SchedulerErrorType.DATABASE_ERROR
# Timeout errors
if "timeout" in error_str or "timed out" in error_str:
return SchedulerErrorType.TIMEOUT_ERROR
# Concurrency errors
if "concurrent" in error_str or "race" in error_str or "lock" in error_str:
return SchedulerErrorType.CONCURRENCY_ERROR
# Task execution errors
if "task" in error_str and "execut" in error_str:
return SchedulerErrorType.TASK_EXECUTION_ERROR
# Task loader errors
if "load" in error_str and "task" in error_str:
return SchedulerErrorType.TASK_LOADER_ERROR
# Retry errors
if "retry" in error_str:
return SchedulerErrorType.RETRY_ERROR
# Config errors
if "config" in error_str or "scheduler" in error_str and "init" in error_str:
return SchedulerErrorType.SCHEDULER_CONFIG_ERROR
# Default to task execution error for unknown errors
return SchedulerErrorType.TASK_EXECUTION_ERROR
def _determine_severity(self, error: Exception) -> SchedulerErrorSeverity:
"""Determine the severity of an error."""
error_str = str(error).lower()
error_type = type(error)
# Critical errors
if isinstance(error, (SQLAlchemyError, OperationalError, ConnectionError)):
return SchedulerErrorSeverity.CRITICAL
if "database" in error_str or "connection" in error_str:
return SchedulerErrorSeverity.CRITICAL
# High severity errors
if "timeout" in error_str or "concurrent" in error_str:
return SchedulerErrorSeverity.HIGH
if isinstance(error, (KeyError, AttributeError)) and "config" in error_str:
return SchedulerErrorSeverity.HIGH
# Medium severity errors
if "task" in error_str or "execution" in error_str:
return SchedulerErrorSeverity.MEDIUM
# Default to low
return SchedulerErrorSeverity.LOW
def _store_error_alert(self, error: SchedulerException):
"""Store critical errors in database for alerting."""
if not self.db:
return
try:
# Import here to avoid circular dependencies
from models.monitoring_models import TaskExecutionLog
# Store as failed execution log if we have task_id (even without user_id for system errors)
if error.task_id:
try:
execution_log = TaskExecutionLog(
task_id=error.task_id,
user_id=error.user_id, # Can be None for system-level errors
execution_date=error.timestamp,
status='failed',
error_message=error.message,
result_data={
"error_type": error.error_type.value,
"severity": error.severity.value,
"context": error.context,
"stack_trace": error.stack_trace,
"task_type": error.task_type
}
)
self.db.add(execution_log)
self.db.commit()
self.logger.info(f"Stored error alert in execution log for task {error.task_id}")
except Exception as e:
self.logger.error(f"Failed to store error in execution log: {e}")
self.db.rollback()
# Note: For errors without task_id, we rely on structured logging only
# Future: Could create a separate scheduler_error_logs table for system-level errors
except Exception as e:
self.logger.error(f"Failed to store error alert: {e}")
def _format_error_response(self, error: SchedulerException) -> Dict[str, Any]:
"""Format error for API response or logging."""
response = {
"success": False,
"error": {
"type": error.error_type.value,
"message": error.message,
"severity": error.severity.value,
"timestamp": error.timestamp.isoformat() if isinstance(error.timestamp, datetime) else str(error.timestamp),
"user_id": error.user_id,
"task_id": error.task_id,
"task_type": error.task_type
}
}
# Add context for debugging (non-sensitive info only)
if error.context:
safe_context = {
k: v for k, v in error.context.items()
if k not in ["password", "token", "key", "secret", "credential"]
}
response["error"]["context"] = safe_context
# Add user-friendly message based on error type
user_messages = {
SchedulerErrorType.DATABASE_ERROR:
"A database error occurred while processing the task. Please try again later.",
SchedulerErrorType.TASK_EXECUTION_ERROR:
"The task failed to execute. Please check the task configuration and try again.",
SchedulerErrorType.TASK_LOADER_ERROR:
"Failed to load tasks. The scheduler may be experiencing issues.",
SchedulerErrorType.SCHEDULER_CONFIG_ERROR:
"The scheduler configuration is invalid. Contact support.",
SchedulerErrorType.RETRY_ERROR:
"Task retry failed. The task will be rescheduled.",
SchedulerErrorType.CONCURRENCY_ERROR:
"A concurrency issue occurred. The task will be retried.",
SchedulerErrorType.TIMEOUT_ERROR:
"The task execution timed out. The task will be retried."
}
response["error"]["user_message"] = user_messages.get(
error.error_type,
"An error occurred while processing the task."
)
return response