Subscription dashboard improvements, AI text generation limit, and other fixes.
This commit is contained in:
395
backend/services/scheduler/core/exception_handler.py
Normal file
395
backend/services/scheduler/core/exception_handler.py
Normal file
@@ -0,0 +1,395 @@
|
||||
"""
|
||||
Comprehensive Exception Handling and Logging for Task Scheduler
|
||||
Provides robust error handling, logging, and monitoring for the scheduler system.
|
||||
"""
|
||||
|
||||
import traceback
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, Optional, Union
|
||||
from enum import Enum
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy.exc import SQLAlchemyError, OperationalError, IntegrityError
|
||||
|
||||
from utils.logger_utils import get_service_logger
|
||||
|
||||
logger = get_service_logger("scheduler_exception_handler")
|
||||
|
||||
|
||||
class SchedulerErrorType(Enum):
|
||||
"""Error types for scheduler system."""
|
||||
DATABASE_ERROR = "database_error"
|
||||
TASK_EXECUTION_ERROR = "task_execution_error"
|
||||
TASK_LOADER_ERROR = "task_loader_error"
|
||||
SCHEDULER_CONFIG_ERROR = "scheduler_config_error"
|
||||
RETRY_ERROR = "retry_error"
|
||||
CONCURRENCY_ERROR = "concurrency_error"
|
||||
TIMEOUT_ERROR = "timeout_error"
|
||||
|
||||
|
||||
class SchedulerErrorSeverity(Enum):
|
||||
"""Severity levels for scheduler errors."""
|
||||
LOW = "low"
|
||||
MEDIUM = "medium"
|
||||
HIGH = "high"
|
||||
CRITICAL = "critical"
|
||||
|
||||
|
||||
class SchedulerException(Exception):
|
||||
"""Base exception for scheduler system errors."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
message: str,
|
||||
error_type: SchedulerErrorType,
|
||||
severity: SchedulerErrorSeverity = SchedulerErrorSeverity.MEDIUM,
|
||||
user_id: Optional[int] = None,
|
||||
task_id: Optional[int] = None,
|
||||
task_type: Optional[str] = None,
|
||||
context: Optional[Dict[str, Any]] = None,
|
||||
original_error: Optional[Exception] = None
|
||||
):
|
||||
self.message = message
|
||||
self.error_type = error_type
|
||||
self.severity = severity
|
||||
self.user_id = user_id
|
||||
self.task_id = task_id
|
||||
self.task_type = task_type
|
||||
self.context = context or {}
|
||||
self.original_error = original_error
|
||||
self.timestamp = datetime.utcnow()
|
||||
|
||||
# Capture stack trace if original error provided
|
||||
self.stack_trace = None
|
||||
if self.original_error:
|
||||
try:
|
||||
exc_type, exc_value, exc_traceback = sys.exc_info()
|
||||
if exc_traceback:
|
||||
self.stack_trace = ''.join(traceback.format_exception(
|
||||
exc_type, exc_value, exc_traceback
|
||||
))
|
||||
else:
|
||||
self.stack_trace = traceback.format_exception(
|
||||
type(self.original_error),
|
||||
self.original_error,
|
||||
self.original_error.__traceback__
|
||||
)
|
||||
except Exception:
|
||||
self.stack_trace = str(self.original_error)
|
||||
|
||||
super().__init__(message)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert exception to dictionary for logging/storage."""
|
||||
return {
|
||||
"message": self.message,
|
||||
"error_type": self.error_type.value,
|
||||
"severity": self.severity.value,
|
||||
"user_id": self.user_id,
|
||||
"task_id": self.task_id,
|
||||
"task_type": self.task_type,
|
||||
"context": self.context,
|
||||
"timestamp": self.timestamp.isoformat() if isinstance(self.timestamp, datetime) else self.timestamp,
|
||||
"original_error": str(self.original_error) if self.original_error else None,
|
||||
"stack_trace": self.stack_trace
|
||||
}
|
||||
|
||||
def __str__(self):
|
||||
return f"[{self.error_type.value}] {self.message}"
|
||||
|
||||
|
||||
class DatabaseError(SchedulerException):
|
||||
"""Exception raised for database-related errors."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
message: str,
|
||||
user_id: Optional[int] = None,
|
||||
task_id: Optional[int] = None,
|
||||
context: Dict[str, Any] = None,
|
||||
original_error: Exception = None
|
||||
):
|
||||
super().__init__(
|
||||
message=message,
|
||||
error_type=SchedulerErrorType.DATABASE_ERROR,
|
||||
severity=SchedulerErrorSeverity.CRITICAL,
|
||||
user_id=user_id,
|
||||
task_id=task_id,
|
||||
context=context or {},
|
||||
original_error=original_error
|
||||
)
|
||||
|
||||
|
||||
class TaskExecutionError(SchedulerException):
|
||||
"""Exception raised for task execution failures."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
message: str,
|
||||
user_id: Optional[int] = None,
|
||||
task_id: Optional[int] = None,
|
||||
task_type: Optional[str] = None,
|
||||
retry_count: int = 0,
|
||||
execution_time_ms: Optional[int] = None,
|
||||
context: Dict[str, Any] = None,
|
||||
original_error: Exception = None
|
||||
):
|
||||
context = context or {}
|
||||
context.update({
|
||||
"retry_count": retry_count,
|
||||
"execution_time_ms": execution_time_ms
|
||||
})
|
||||
|
||||
super().__init__(
|
||||
message=message,
|
||||
error_type=SchedulerErrorType.TASK_EXECUTION_ERROR,
|
||||
severity=SchedulerErrorSeverity.HIGH,
|
||||
user_id=user_id,
|
||||
task_id=task_id,
|
||||
task_type=task_type,
|
||||
context=context,
|
||||
original_error=original_error
|
||||
)
|
||||
|
||||
|
||||
class TaskLoaderError(SchedulerException):
|
||||
"""Exception raised for task loading failures."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
message: str,
|
||||
task_type: Optional[str] = None,
|
||||
user_id: Optional[int] = None,
|
||||
context: Dict[str, Any] = None,
|
||||
original_error: Exception = None
|
||||
):
|
||||
super().__init__(
|
||||
message=message,
|
||||
error_type=SchedulerErrorType.TASK_LOADER_ERROR,
|
||||
severity=SchedulerErrorSeverity.HIGH,
|
||||
user_id=user_id,
|
||||
task_type=task_type,
|
||||
context=context or {},
|
||||
original_error=original_error
|
||||
)
|
||||
|
||||
|
||||
class SchedulerConfigError(SchedulerException):
|
||||
"""Exception raised for scheduler configuration errors."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
message: str,
|
||||
context: Dict[str, Any] = None,
|
||||
original_error: Exception = None
|
||||
):
|
||||
super().__init__(
|
||||
message=message,
|
||||
error_type=SchedulerErrorType.SCHEDULER_CONFIG_ERROR,
|
||||
severity=SchedulerErrorSeverity.CRITICAL,
|
||||
context=context or {},
|
||||
original_error=original_error
|
||||
)
|
||||
|
||||
|
||||
class SchedulerExceptionHandler:
|
||||
"""Comprehensive exception handler for the scheduler system."""
|
||||
|
||||
def __init__(self, db: Session = None):
|
||||
self.db = db
|
||||
self.logger = logger
|
||||
|
||||
def handle_exception(
|
||||
self,
|
||||
error: Union[Exception, SchedulerException],
|
||||
context: Dict[str, Any] = None,
|
||||
log_level: str = "error"
|
||||
) -> Dict[str, Any]:
|
||||
"""Handle and log scheduler exceptions."""
|
||||
|
||||
context = context or {}
|
||||
|
||||
# Convert regular exceptions to SchedulerException
|
||||
if not isinstance(error, SchedulerException):
|
||||
error = SchedulerException(
|
||||
message=str(error),
|
||||
error_type=self._classify_error(error),
|
||||
severity=self._determine_severity(error),
|
||||
context=context,
|
||||
original_error=error
|
||||
)
|
||||
|
||||
# Log the error
|
||||
error_data = error.to_dict()
|
||||
error_data.update(context)
|
||||
|
||||
log_message = f"Scheduler Error: {error.message}"
|
||||
|
||||
if log_level == "critical" or error.severity == SchedulerErrorSeverity.CRITICAL:
|
||||
self.logger.critical(log_message, extra={"error_data": error_data})
|
||||
elif log_level == "error" or error.severity == SchedulerErrorSeverity.HIGH:
|
||||
self.logger.error(log_message, extra={"error_data": error_data})
|
||||
elif log_level == "warning" or error.severity == SchedulerErrorSeverity.MEDIUM:
|
||||
self.logger.warning(log_message, extra={"error_data": error_data})
|
||||
else:
|
||||
self.logger.info(log_message, extra={"error_data": error_data})
|
||||
|
||||
# Store critical errors in database for alerting
|
||||
if error.severity in [SchedulerErrorSeverity.HIGH, SchedulerErrorSeverity.CRITICAL]:
|
||||
self._store_error_alert(error)
|
||||
|
||||
# Return formatted error response
|
||||
return self._format_error_response(error)
|
||||
|
||||
def _classify_error(self, error: Exception) -> SchedulerErrorType:
|
||||
"""Classify an exception into a scheduler error type."""
|
||||
|
||||
error_str = str(error).lower()
|
||||
error_type_name = type(error).__name__.lower()
|
||||
|
||||
# Database errors
|
||||
if isinstance(error, (SQLAlchemyError, OperationalError, IntegrityError)):
|
||||
return SchedulerErrorType.DATABASE_ERROR
|
||||
if "database" in error_str or "sql" in error_type_name or "connection" in error_str:
|
||||
return SchedulerErrorType.DATABASE_ERROR
|
||||
|
||||
# Timeout errors
|
||||
if "timeout" in error_str or "timed out" in error_str:
|
||||
return SchedulerErrorType.TIMEOUT_ERROR
|
||||
|
||||
# Concurrency errors
|
||||
if "concurrent" in error_str or "race" in error_str or "lock" in error_str:
|
||||
return SchedulerErrorType.CONCURRENCY_ERROR
|
||||
|
||||
# Task execution errors
|
||||
if "task" in error_str and "execut" in error_str:
|
||||
return SchedulerErrorType.TASK_EXECUTION_ERROR
|
||||
|
||||
# Task loader errors
|
||||
if "load" in error_str and "task" in error_str:
|
||||
return SchedulerErrorType.TASK_LOADER_ERROR
|
||||
|
||||
# Retry errors
|
||||
if "retry" in error_str:
|
||||
return SchedulerErrorType.RETRY_ERROR
|
||||
|
||||
# Config errors
|
||||
if "config" in error_str or "scheduler" in error_str and "init" in error_str:
|
||||
return SchedulerErrorType.SCHEDULER_CONFIG_ERROR
|
||||
|
||||
# Default to task execution error for unknown errors
|
||||
return SchedulerErrorType.TASK_EXECUTION_ERROR
|
||||
|
||||
def _determine_severity(self, error: Exception) -> SchedulerErrorSeverity:
|
||||
"""Determine the severity of an error."""
|
||||
|
||||
error_str = str(error).lower()
|
||||
error_type = type(error)
|
||||
|
||||
# Critical errors
|
||||
if isinstance(error, (SQLAlchemyError, OperationalError, ConnectionError)):
|
||||
return SchedulerErrorSeverity.CRITICAL
|
||||
if "database" in error_str or "connection" in error_str:
|
||||
return SchedulerErrorSeverity.CRITICAL
|
||||
|
||||
# High severity errors
|
||||
if "timeout" in error_str or "concurrent" in error_str:
|
||||
return SchedulerErrorSeverity.HIGH
|
||||
if isinstance(error, (KeyError, AttributeError)) and "config" in error_str:
|
||||
return SchedulerErrorSeverity.HIGH
|
||||
|
||||
# Medium severity errors
|
||||
if "task" in error_str or "execution" in error_str:
|
||||
return SchedulerErrorSeverity.MEDIUM
|
||||
|
||||
# Default to low
|
||||
return SchedulerErrorSeverity.LOW
|
||||
|
||||
def _store_error_alert(self, error: SchedulerException):
|
||||
"""Store critical errors in database for alerting."""
|
||||
|
||||
if not self.db:
|
||||
return
|
||||
|
||||
try:
|
||||
# Import here to avoid circular dependencies
|
||||
from models.monitoring_models import TaskExecutionLog
|
||||
|
||||
# Store as failed execution log if we have task_id (even without user_id for system errors)
|
||||
if error.task_id:
|
||||
try:
|
||||
execution_log = TaskExecutionLog(
|
||||
task_id=error.task_id,
|
||||
user_id=error.user_id, # Can be None for system-level errors
|
||||
execution_date=error.timestamp,
|
||||
status='failed',
|
||||
error_message=error.message,
|
||||
result_data={
|
||||
"error_type": error.error_type.value,
|
||||
"severity": error.severity.value,
|
||||
"context": error.context,
|
||||
"stack_trace": error.stack_trace,
|
||||
"task_type": error.task_type
|
||||
}
|
||||
)
|
||||
self.db.add(execution_log)
|
||||
self.db.commit()
|
||||
self.logger.info(f"Stored error alert in execution log for task {error.task_id}")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to store error in execution log: {e}")
|
||||
self.db.rollback()
|
||||
# Note: For errors without task_id, we rely on structured logging only
|
||||
# Future: Could create a separate scheduler_error_logs table for system-level errors
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to store error alert: {e}")
|
||||
|
||||
def _format_error_response(self, error: SchedulerException) -> Dict[str, Any]:
|
||||
"""Format error for API response or logging."""
|
||||
|
||||
response = {
|
||||
"success": False,
|
||||
"error": {
|
||||
"type": error.error_type.value,
|
||||
"message": error.message,
|
||||
"severity": error.severity.value,
|
||||
"timestamp": error.timestamp.isoformat() if isinstance(error.timestamp, datetime) else str(error.timestamp),
|
||||
"user_id": error.user_id,
|
||||
"task_id": error.task_id,
|
||||
"task_type": error.task_type
|
||||
}
|
||||
}
|
||||
|
||||
# Add context for debugging (non-sensitive info only)
|
||||
if error.context:
|
||||
safe_context = {
|
||||
k: v for k, v in error.context.items()
|
||||
if k not in ["password", "token", "key", "secret", "credential"]
|
||||
}
|
||||
response["error"]["context"] = safe_context
|
||||
|
||||
# Add user-friendly message based on error type
|
||||
user_messages = {
|
||||
SchedulerErrorType.DATABASE_ERROR:
|
||||
"A database error occurred while processing the task. Please try again later.",
|
||||
SchedulerErrorType.TASK_EXECUTION_ERROR:
|
||||
"The task failed to execute. Please check the task configuration and try again.",
|
||||
SchedulerErrorType.TASK_LOADER_ERROR:
|
||||
"Failed to load tasks. The scheduler may be experiencing issues.",
|
||||
SchedulerErrorType.SCHEDULER_CONFIG_ERROR:
|
||||
"The scheduler configuration is invalid. Contact support.",
|
||||
SchedulerErrorType.RETRY_ERROR:
|
||||
"Task retry failed. The task will be rescheduled.",
|
||||
SchedulerErrorType.CONCURRENCY_ERROR:
|
||||
"A concurrency issue occurred. The task will be retried.",
|
||||
SchedulerErrorType.TIMEOUT_ERROR:
|
||||
"The task execution timed out. The task will be retried."
|
||||
}
|
||||
|
||||
response["error"]["user_message"] = user_messages.get(
|
||||
error.error_type,
|
||||
"An error occurred while processing the task."
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
Reference in New Issue
Block a user