404 lines
15 KiB
Python
404 lines
15 KiB
Python
"""
|
|
Comprehensive Exception Handling and Logging for Task Scheduler
|
|
Provides robust error handling, logging, and monitoring for the scheduler system.
|
|
"""
|
|
|
|
import traceback
|
|
import sys
|
|
from datetime import datetime
|
|
from typing import Dict, Any, Optional, Union
|
|
from enum import Enum
|
|
from sqlalchemy.orm import Session
|
|
from sqlalchemy.exc import SQLAlchemyError, OperationalError, IntegrityError
|
|
|
|
from utils.logger_utils import get_service_logger
|
|
|
|
logger = get_service_logger("scheduler_exception_handler")
|
|
|
|
|
|
class SchedulerErrorType(Enum):
|
|
"""Error types for scheduler system."""
|
|
DATABASE_ERROR = "database_error"
|
|
TASK_EXECUTION_ERROR = "task_execution_error"
|
|
TASK_LOADER_ERROR = "task_loader_error"
|
|
SCHEDULER_CONFIG_ERROR = "scheduler_config_error"
|
|
RETRY_ERROR = "retry_error"
|
|
CONCURRENCY_ERROR = "concurrency_error"
|
|
TIMEOUT_ERROR = "timeout_error"
|
|
|
|
|
|
class SchedulerErrorSeverity(Enum):
|
|
"""Severity levels for scheduler errors."""
|
|
LOW = "low"
|
|
MEDIUM = "medium"
|
|
HIGH = "high"
|
|
CRITICAL = "critical"
|
|
|
|
|
|
class SchedulerException(Exception):
|
|
"""Base exception for scheduler system errors."""
|
|
|
|
def __init__(
|
|
self,
|
|
message: str,
|
|
error_type: SchedulerErrorType,
|
|
severity: SchedulerErrorSeverity = SchedulerErrorSeverity.MEDIUM,
|
|
user_id: Optional[int] = None,
|
|
task_id: Optional[int] = None,
|
|
task_type: Optional[str] = None,
|
|
context: Optional[Dict[str, Any]] = None,
|
|
original_error: Optional[Exception] = None
|
|
):
|
|
self.message = message
|
|
self.error_type = error_type
|
|
self.severity = severity
|
|
self.user_id = user_id
|
|
self.task_id = task_id
|
|
self.task_type = task_type
|
|
self.context = context or {}
|
|
self.original_error = original_error
|
|
self.timestamp = datetime.utcnow()
|
|
|
|
# Capture stack trace if original error provided
|
|
self.stack_trace = None
|
|
if self.original_error:
|
|
try:
|
|
exc_type, exc_value, exc_traceback = sys.exc_info()
|
|
if exc_traceback:
|
|
self.stack_trace = ''.join(traceback.format_exception(
|
|
exc_type, exc_value, exc_traceback
|
|
))
|
|
else:
|
|
self.stack_trace = traceback.format_exception(
|
|
type(self.original_error),
|
|
self.original_error,
|
|
self.original_error.__traceback__
|
|
)
|
|
except Exception:
|
|
self.stack_trace = str(self.original_error)
|
|
|
|
super().__init__(message)
|
|
|
|
def to_dict(self) -> Dict[str, Any]:
|
|
"""Convert exception to dictionary for logging/storage."""
|
|
return {
|
|
"message": self.message,
|
|
"error_type": self.error_type.value,
|
|
"severity": self.severity.value,
|
|
"user_id": self.user_id,
|
|
"task_id": self.task_id,
|
|
"task_type": self.task_type,
|
|
"context": self.context,
|
|
"timestamp": self.timestamp.isoformat() if isinstance(self.timestamp, datetime) else self.timestamp,
|
|
"original_error": str(self.original_error) if self.original_error else None,
|
|
"stack_trace": self.stack_trace
|
|
}
|
|
|
|
def __str__(self):
|
|
return f"[{self.error_type.value}] {self.message}"
|
|
|
|
|
|
class DatabaseError(SchedulerException):
|
|
"""Exception raised for database-related errors."""
|
|
|
|
def __init__(
|
|
self,
|
|
message: str,
|
|
user_id: Optional[int] = None,
|
|
task_id: Optional[int] = None,
|
|
task_type: Optional[str] = None,
|
|
context: Dict[str, Any] = None,
|
|
original_error: Exception = None
|
|
):
|
|
super().__init__(
|
|
message=message,
|
|
error_type=SchedulerErrorType.DATABASE_ERROR,
|
|
severity=SchedulerErrorSeverity.CRITICAL,
|
|
user_id=user_id,
|
|
task_id=task_id,
|
|
task_type=task_type,
|
|
context=context or {},
|
|
original_error=original_error
|
|
)
|
|
|
|
|
|
class TaskExecutionError(SchedulerException):
|
|
"""Exception raised for task execution failures."""
|
|
|
|
def __init__(
|
|
self,
|
|
message: str,
|
|
user_id: Optional[int] = None,
|
|
task_id: Optional[int] = None,
|
|
task_type: Optional[str] = None,
|
|
retry_count: int = 0,
|
|
execution_time_ms: Optional[int] = None,
|
|
context: Dict[str, Any] = None,
|
|
original_error: Exception = None
|
|
):
|
|
context = context or {}
|
|
context.update({
|
|
"retry_count": retry_count,
|
|
"execution_time_ms": execution_time_ms
|
|
})
|
|
|
|
super().__init__(
|
|
message=message,
|
|
error_type=SchedulerErrorType.TASK_EXECUTION_ERROR,
|
|
severity=SchedulerErrorSeverity.HIGH,
|
|
user_id=user_id,
|
|
task_id=task_id,
|
|
task_type=task_type,
|
|
context=context,
|
|
original_error=original_error
|
|
)
|
|
|
|
|
|
class TaskLoaderError(SchedulerException):
|
|
"""Exception raised for task loading failures."""
|
|
|
|
def __init__(
|
|
self,
|
|
message: str,
|
|
task_type: Optional[str] = None,
|
|
user_id: Optional[int] = None,
|
|
context: Dict[str, Any] = None,
|
|
original_error: Exception = None
|
|
):
|
|
super().__init__(
|
|
message=message,
|
|
error_type=SchedulerErrorType.TASK_LOADER_ERROR,
|
|
severity=SchedulerErrorSeverity.HIGH,
|
|
user_id=user_id,
|
|
task_type=task_type,
|
|
context=context or {},
|
|
original_error=original_error
|
|
)
|
|
|
|
|
|
class SchedulerConfigError(SchedulerException):
|
|
"""Exception raised for scheduler configuration errors."""
|
|
|
|
def __init__(
|
|
self,
|
|
message: str,
|
|
user_id: Optional[int] = None,
|
|
task_id: Optional[int] = None,
|
|
task_type: Optional[str] = None,
|
|
context: Dict[str, Any] = None,
|
|
original_error: Exception = None
|
|
):
|
|
super().__init__(
|
|
message=message,
|
|
error_type=SchedulerErrorType.SCHEDULER_CONFIG_ERROR,
|
|
severity=SchedulerErrorSeverity.CRITICAL,
|
|
user_id=user_id,
|
|
task_id=task_id,
|
|
task_type=task_type,
|
|
context=context or {},
|
|
original_error=original_error
|
|
)
|
|
|
|
|
|
class SchedulerExceptionHandler:
|
|
"""Comprehensive exception handler for the scheduler system."""
|
|
|
|
def __init__(self, db: Session = None):
|
|
self.db = db
|
|
self.logger = logger
|
|
|
|
def handle_exception(
|
|
self,
|
|
error: Union[Exception, SchedulerException],
|
|
context: Dict[str, Any] = None,
|
|
log_level: str = "error"
|
|
) -> Dict[str, Any]:
|
|
"""Handle and log scheduler exceptions."""
|
|
|
|
context = context or {}
|
|
|
|
# Convert regular exceptions to SchedulerException
|
|
if not isinstance(error, SchedulerException):
|
|
error = SchedulerException(
|
|
message=str(error),
|
|
error_type=self._classify_error(error),
|
|
severity=self._determine_severity(error),
|
|
context=context,
|
|
original_error=error
|
|
)
|
|
|
|
# Log the error
|
|
error_data = error.to_dict()
|
|
error_data.update(context)
|
|
|
|
log_message = f"Scheduler Error: {error.message}"
|
|
|
|
if log_level == "critical" or error.severity == SchedulerErrorSeverity.CRITICAL:
|
|
self.logger.critical(log_message, extra={"error_data": error_data})
|
|
elif log_level == "error" or error.severity == SchedulerErrorSeverity.HIGH:
|
|
self.logger.error(log_message, extra={"error_data": error_data})
|
|
elif log_level == "warning" or error.severity == SchedulerErrorSeverity.MEDIUM:
|
|
self.logger.warning(log_message, extra={"error_data": error_data})
|
|
else:
|
|
self.logger.info(log_message, extra={"error_data": error_data})
|
|
|
|
# Store critical errors in database for alerting
|
|
if error.severity in [SchedulerErrorSeverity.HIGH, SchedulerErrorSeverity.CRITICAL]:
|
|
self._store_error_alert(error)
|
|
|
|
# Return formatted error response
|
|
return self._format_error_response(error)
|
|
|
|
def _classify_error(self, error: Exception) -> SchedulerErrorType:
|
|
"""Classify an exception into a scheduler error type."""
|
|
|
|
error_str = str(error).lower()
|
|
error_type_name = type(error).__name__.lower()
|
|
|
|
# Database errors
|
|
if isinstance(error, (SQLAlchemyError, OperationalError, IntegrityError)):
|
|
return SchedulerErrorType.DATABASE_ERROR
|
|
if "database" in error_str or "sql" in error_type_name or "connection" in error_str:
|
|
return SchedulerErrorType.DATABASE_ERROR
|
|
|
|
# Timeout errors
|
|
if "timeout" in error_str or "timed out" in error_str:
|
|
return SchedulerErrorType.TIMEOUT_ERROR
|
|
|
|
# Concurrency errors
|
|
if "concurrent" in error_str or "race" in error_str or "lock" in error_str:
|
|
return SchedulerErrorType.CONCURRENCY_ERROR
|
|
|
|
# Task execution errors
|
|
if "task" in error_str and "execut" in error_str:
|
|
return SchedulerErrorType.TASK_EXECUTION_ERROR
|
|
|
|
# Task loader errors
|
|
if "load" in error_str and "task" in error_str:
|
|
return SchedulerErrorType.TASK_LOADER_ERROR
|
|
|
|
# Retry errors
|
|
if "retry" in error_str:
|
|
return SchedulerErrorType.RETRY_ERROR
|
|
|
|
# Config errors
|
|
if "config" in error_str or "scheduler" in error_str and "init" in error_str:
|
|
return SchedulerErrorType.SCHEDULER_CONFIG_ERROR
|
|
|
|
# Default to task execution error for unknown errors
|
|
return SchedulerErrorType.TASK_EXECUTION_ERROR
|
|
|
|
def _determine_severity(self, error: Exception) -> SchedulerErrorSeverity:
|
|
"""Determine the severity of an error."""
|
|
|
|
error_str = str(error).lower()
|
|
error_type = type(error)
|
|
|
|
# Critical errors
|
|
if isinstance(error, (SQLAlchemyError, OperationalError, ConnectionError)):
|
|
return SchedulerErrorSeverity.CRITICAL
|
|
if "database" in error_str or "connection" in error_str:
|
|
return SchedulerErrorSeverity.CRITICAL
|
|
|
|
# High severity errors
|
|
if "timeout" in error_str or "concurrent" in error_str:
|
|
return SchedulerErrorSeverity.HIGH
|
|
if isinstance(error, (KeyError, AttributeError)) and "config" in error_str:
|
|
return SchedulerErrorSeverity.HIGH
|
|
|
|
# Medium severity errors
|
|
if "task" in error_str or "execution" in error_str:
|
|
return SchedulerErrorSeverity.MEDIUM
|
|
|
|
# Default to low
|
|
return SchedulerErrorSeverity.LOW
|
|
|
|
def _store_error_alert(self, error: SchedulerException):
|
|
"""Store critical errors in database for alerting."""
|
|
|
|
if not self.db:
|
|
return
|
|
|
|
try:
|
|
# Import here to avoid circular dependencies
|
|
from models.monitoring_models import TaskExecutionLog
|
|
|
|
# Store as failed execution log if we have task_id (even without user_id for system errors)
|
|
if error.task_id:
|
|
try:
|
|
execution_log = TaskExecutionLog(
|
|
task_id=error.task_id,
|
|
user_id=error.user_id, # Can be None for system-level errors
|
|
execution_date=error.timestamp,
|
|
status='failed',
|
|
error_message=error.message,
|
|
result_data={
|
|
"error_type": error.error_type.value,
|
|
"severity": error.severity.value,
|
|
"context": error.context,
|
|
"stack_trace": error.stack_trace,
|
|
"task_type": error.task_type
|
|
}
|
|
)
|
|
self.db.add(execution_log)
|
|
self.db.commit()
|
|
self.logger.info(f"Stored error alert in execution log for task {error.task_id}")
|
|
except Exception as e:
|
|
self.logger.error(f"Failed to store error in execution log: {e}")
|
|
self.db.rollback()
|
|
# Note: For errors without task_id, we rely on structured logging only
|
|
# Future: Could create a separate scheduler_error_logs table for system-level errors
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Failed to store error alert: {e}")
|
|
|
|
def _format_error_response(self, error: SchedulerException) -> Dict[str, Any]:
|
|
"""Format error for API response or logging."""
|
|
|
|
response = {
|
|
"success": False,
|
|
"error": {
|
|
"type": error.error_type.value,
|
|
"message": error.message,
|
|
"severity": error.severity.value,
|
|
"timestamp": error.timestamp.isoformat() if isinstance(error.timestamp, datetime) else str(error.timestamp),
|
|
"user_id": error.user_id,
|
|
"task_id": error.task_id,
|
|
"task_type": error.task_type
|
|
}
|
|
}
|
|
|
|
# Add context for debugging (non-sensitive info only)
|
|
if error.context:
|
|
safe_context = {
|
|
k: v for k, v in error.context.items()
|
|
if k not in ["password", "token", "key", "secret", "credential"]
|
|
}
|
|
response["error"]["context"] = safe_context
|
|
|
|
# Add user-friendly message based on error type
|
|
user_messages = {
|
|
SchedulerErrorType.DATABASE_ERROR:
|
|
"A database error occurred while processing the task. Please try again later.",
|
|
SchedulerErrorType.TASK_EXECUTION_ERROR:
|
|
"The task failed to execute. Please check the task configuration and try again.",
|
|
SchedulerErrorType.TASK_LOADER_ERROR:
|
|
"Failed to load tasks. The scheduler may be experiencing issues.",
|
|
SchedulerErrorType.SCHEDULER_CONFIG_ERROR:
|
|
"The scheduler configuration is invalid. Contact support.",
|
|
SchedulerErrorType.RETRY_ERROR:
|
|
"Task retry failed. The task will be rescheduled.",
|
|
SchedulerErrorType.CONCURRENCY_ERROR:
|
|
"A concurrency issue occurred. The task will be retried.",
|
|
SchedulerErrorType.TIMEOUT_ERROR:
|
|
"The task execution timed out. The task will be retried."
|
|
}
|
|
|
|
response["error"]["user_message"] = user_messages.get(
|
|
error.error_type,
|
|
"An error occurred while processing the task."
|
|
)
|
|
|
|
return response
|
|
|