story writer backend migration complete, Blog writer SEO and story writer backend migration complete, Blog writer SEO and story writer frontend migration complete

This commit is contained in:
ajaysi
2025-11-13 16:14:26 +05:30
parent 7191c7e7f0
commit 3b9356e2c8
124 changed files with 20055 additions and 1208 deletions

View File

@@ -0,0 +1,378 @@
"""
Failure Detection Service
Analyzes execution logs to detect failure patterns and mark tasks for human intervention.
"""
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
from sqlalchemy.orm import Session
from enum import Enum
import json
from utils.logger_utils import get_service_logger
# Module-level logger for the "failure_detection" service; every
# FailureDetectionService instance stores a reference to it as `self.logger`.
logger = get_service_logger("failure_detection")
class FailureReason(Enum):
    """Coarse category assigned to a task's recent failures.

    Members:
        API_LIMIT: HTTP 429, rate limits, quota exceeded.
        AUTH_ERROR: HTTP 401/403, expired or invalid token.
        NETWORK_ERROR: Connection errors and timeouts.
        CONFIG_ERROR: Missing configuration or invalid parameters.
        UNKNOWN: Anything that matches none of the categories above.
    """

    API_LIMIT = "api_limit"
    AUTH_ERROR = "auth_error"
    NETWORK_ERROR = "network_error"
    CONFIG_ERROR = "config_error"
    UNKNOWN = "unknown"
class FailurePattern:
    """Plain value holder describing the failure history of one task.

    Attributes:
        task_id: ID of the analysed task.
        task_type: Task type string (e.g. "oauth_token_monitoring").
        user_id: Owner of the task.
        consecutive_failures: Failed runs counted from the newest log backwards.
        recent_failures: Failed runs inside the recent look-back window.
        failure_reason: Category assigned to the recent failures.
        last_failure_time: Timestamp of the newest failed run, if any.
        error_patterns: Distinct error-message excerpts from recent failures.
        should_cool_off: True when the task should stop running automatically.
    """

    def __init__(
        self,
        task_id: int,
        task_type: str,
        user_id: str,
        consecutive_failures: int,
        recent_failures: int,
        failure_reason: FailureReason,
        last_failure_time: Optional[datetime],
        error_patterns: List[str],
        should_cool_off: bool,
    ):
        # Which task this pattern belongs to.
        self.user_id = user_id
        self.task_type = task_type
        self.task_id = task_id

        # Failure counters.
        self.recent_failures = recent_failures
        self.consecutive_failures = consecutive_failures

        # Diagnosis details.
        self.error_patterns = error_patterns
        self.last_failure_time = last_failure_time
        self.failure_reason = failure_reason

        # Final verdict.
        self.should_cool_off = should_cool_off
class FailureDetectionService:
    """Detect failure patterns in task execution logs.

    The service reads a task's execution logs (newest first), counts
    consecutive and recent failures, classifies the dominant failure reason
    and reports whether the task should be cooled off, i.e. handed over for
    human intervention instead of being retried automatically.
    """

    # Cool-off thresholds
    CONSECUTIVE_FAILURE_THRESHOLD = 3  # 3 consecutive failures
    RECENT_FAILURE_THRESHOLD = 5  # 5 failures inside the recent window
    RECENT_FAILURE_WINDOW_DAYS = 7  # Look-back window for "recent" failures
    COOL_OFF_PERIOD_DAYS = 7  # Cool-off period; not referenced inside this class

    # Status strings this service compares against.
    FAILED_STATUS = "failed"
    INTERVENTION_STATUS = "needs_intervention"

    # Limits applied when inspecting failed runs.
    MAX_FAILURES_INSPECTED = 5  # Newest failed runs looked at for classification
    MAX_ERROR_PATTERNS = 3  # Distinct error excerpts reported
    ERROR_PATTERN_LENGTH = 100  # Characters kept from each error message

    def __init__(self, db: Session):
        """Store the SQLAlchemy session used for all queries.

        Args:
            db: Session used to query task and execution-log models.
        """
        self.db = db
        self.logger = logger

    def analyze_task_failures(
        self,
        task_id: int,
        task_type: str,
        user_id: str
    ) -> Optional[FailurePattern]:
        """Analyze failure patterns for a specific task.

        Args:
            task_id: Task ID.
            task_type: Task type (oauth_token_monitoring, website_analysis, etc.).
            user_id: User ID.

        Returns:
            A FailurePattern whenever the task has at least one execution log;
            its ``should_cool_off`` flag tells whether a threshold was reached.
            None when the task has no logs or when the analysis itself fails
            (the error is logged, never raised).
        """
        try:
            execution_logs = self._get_execution_logs(task_id, task_type)
            if not execution_logs:
                return None

            consecutive_failures = self._count_consecutive_failures(execution_logs)
            recent_failures = self._count_recent_failures(
                execution_logs, days=self.RECENT_FAILURE_WINDOW_DAYS
            )
            failure_reason = self._classify_failure_reason(execution_logs)
            error_patterns = self._extract_error_patterns(execution_logs)
            last_failure_time = self._get_last_failure_time(execution_logs)

            # Either threshold alone is enough to stop automatic retries.
            should_cool_off = (
                consecutive_failures >= self.CONSECUTIVE_FAILURE_THRESHOLD or
                recent_failures >= self.RECENT_FAILURE_THRESHOLD
            )

            if should_cool_off:
                self.logger.warning(
                    f"Failure pattern detected for task {task_id} ({task_type}): "
                    f"consecutive={consecutive_failures}, recent={recent_failures}, "
                    f"reason={failure_reason.value}"
                )

            return FailurePattern(
                task_id=task_id,
                task_type=task_type,
                user_id=user_id,
                consecutive_failures=consecutive_failures,
                recent_failures=recent_failures,
                failure_reason=failure_reason,
                last_failure_time=last_failure_time,
                error_patterns=error_patterns,
                should_cool_off=should_cool_off
            )
        except Exception as e:
            # Best-effort analysis: callers treat None as "no pattern available".
            self.logger.error(f"Error analyzing task failures for task {task_id}: {e}", exc_info=True)
            return None

    def _resolve_log_model(self, task_type: str):
        """Return the execution-log model class that stores logs for ``task_type``.

        Imports are done lazily inside each branch, as in the rest of this
        module, so only the model that is actually needed gets imported.
        """
        if task_type == "oauth_token_monitoring":
            from models.oauth_token_monitoring_models import OAuthTokenExecutionLog
            return OAuthTokenExecutionLog
        if task_type == "website_analysis":
            from models.website_analysis_monitoring_models import WebsiteAnalysisExecutionLog
            return WebsiteAnalysisExecutionLog
        if task_type in ("gsc_insights", "bing_insights", "platform_insights"):
            from models.platform_insights_monitoring_models import PlatformInsightsExecutionLog
            return PlatformInsightsExecutionLog
        # Fallback to monitoring_task execution logs
        from models.monitoring_models import TaskExecutionLog
        return TaskExecutionLog

    @staticmethod
    def _serialize_log(log: Any) -> Dict[str, Any]:
        """Copy the fields used by the analysis out of one log row."""
        return {
            "status": log.status,
            "error_message": log.error_message,
            "execution_date": log.execution_date,
            "result_data": log.result_data
        }

    def _get_execution_logs(self, task_id: int, task_type: str) -> List[Dict[str, Any]]:
        """Get execution logs for a task, newest first.

        Returns:
            One dict per log row with the keys ``status``, ``error_message``,
            ``execution_date`` and ``result_data``. An empty list when the
            query fails (the error is logged).
        """
        try:
            log_model = self._resolve_log_model(task_type)
            rows = (
                self.db.query(log_model)
                .filter(log_model.task_id == task_id)
                .order_by(log_model.execution_date.desc())
                .all()
            )
            return [self._serialize_log(row) for row in rows]
        except Exception as e:
            self.logger.error(f"Error getting execution logs for task {task_id}: {e}", exc_info=True)
            return []

    def _recent_failed_logs(self, logs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Return the newest failed logs, capped at MAX_FAILURES_INSPECTED."""
        failed = [log for log in logs if log["status"] == self.FAILED_STATUS]
        return failed[:self.MAX_FAILURES_INSPECTED]

    def _count_consecutive_failures(self, logs: List[Dict[str, Any]]) -> int:
        """Count consecutive failures from most recent.

        ``logs`` must be ordered newest first; counting stops at the first log
        whose status is anything other than "failed".
        """
        count = 0
        for log in logs:
            if log["status"] != self.FAILED_STATUS:
                break
            count += 1
        return count

    @staticmethod
    def _to_naive_utc(moment: Any) -> Optional[datetime]:
        """Normalize a timestamp to a naive UTC datetime.

        Timezone-aware values are converted to UTC and stripped of tzinfo;
        naive values are returned unchanged (the cutoff they are compared with
        is naive UTC). Anything that is not a datetime yields None.
        """
        from datetime import timezone  # not part of the module-level imports

        if not isinstance(moment, datetime):
            return None
        if moment.tzinfo is not None and moment.utcoffset() is not None:
            return moment.astimezone(timezone.utc).replace(tzinfo=None)
        return moment

    def _count_recent_failures(self, logs: List[Dict[str, Any]], days: int = 7) -> int:
        """Count failures in the last N days.

        Logs without a usable ``execution_date`` are skipped instead of raising,
        so a single malformed row cannot abort the whole analysis.
        """
        from datetime import timezone  # not part of the module-level imports

        # Naive UTC "now", equivalent to the deprecated datetime.utcnow().
        cutoff = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=days)
        recent = 0
        for log in logs:
            if log["status"] != self.FAILED_STATUS:
                continue
            executed_at = self._to_naive_utc(log.get("execution_date"))
            if executed_at is not None and executed_at >= cutoff:
                recent += 1
        return recent

    @staticmethod
    def _result_indicates_api_limit(result_data: Dict[str, Any]) -> bool:
        """Tell whether a log's ``result_data`` dict points at an API limit."""
        # Accept the status both as the int 429 and as the string "429".
        if str(result_data.get("error_status")) == "429":
            return True

        flattened = str(result_data).lower()
        if "limit" in flattened and "reached" in flattened:
            return True

        # Check for usage info indicating limits
        usage_info = result_data.get("usage_info", {})
        if isinstance(usage_info, dict):
            try:
                usage_percentage = float(usage_info.get("usage_percentage") or 0)
            except (TypeError, ValueError):
                # Non-numeric value: treat as "no usage information".
                usage_percentage = 0.0
            return usage_percentage >= 100
        return False

    def _classify_failure_reason(self, logs: List[Dict[str, Any]]) -> FailureReason:
        """Classify the primary failure reason from error messages.

        The newest failed logs are inspected in order; the first log that
        matches any category decides the result. Within one log the checks run
        in this order: API limit, auth, network, config.
        """
        for log in self._recent_failed_logs(logs):
            error_message = (log.get("error_message") or "").lower()
            result_data = log.get("result_data") or {}

            # Check for API limits (429)
            if "429" in error_message or "rate limit" in error_message or "limit reached" in error_message:
                return FailureReason.API_LIMIT
            if isinstance(result_data, dict) and self._result_indicates_api_limit(result_data):
                return FailureReason.API_LIMIT

            # Check for auth errors
            if any(marker in error_message for marker in ("401", "403", "unauthorized", "forbidden")):
                return FailureReason.AUTH_ERROR
            if "token" in error_message and ("expired" in error_message or "invalid" in error_message):
                return FailureReason.AUTH_ERROR

            # Check for network errors
            if any(marker in error_message for marker in ("timeout", "connection", "network")):
                return FailureReason.NETWORK_ERROR

            # Check for config errors
            if any(marker in error_message for marker in ("config", "missing", "invalid")):
                return FailureReason.CONFIG_ERROR

        return FailureReason.UNKNOWN

    def _extract_error_patterns(self, logs: List[Dict[str, Any]]) -> List[str]:
        """Extract distinct error-message excerpts from the newest failed logs.

        Each excerpt is the first ERROR_PATTERN_LENGTH characters of the
        message, stripped; at most MAX_ERROR_PATTERNS excerpts are returned.
        """
        patterns: List[str] = []
        for log in self._recent_failed_logs(logs):
            error_message = log.get("error_message") or ""
            if not error_message:
                continue
            pattern = error_message[:self.ERROR_PATTERN_LENGTH].strip()
            if pattern and pattern not in patterns:
                patterns.append(pattern)
        return patterns[:self.MAX_ERROR_PATTERNS]

    def _get_last_failure_time(self, logs: List[Dict[str, Any]]) -> Optional[datetime]:
        """Get the timestamp of the most recent failure (logs are newest first)."""
        for log in logs:
            if log["status"] == self.FAILED_STATUS:
                return log["execution_date"]
        return None

    def _collect_intervention_tasks(
        self,
        task_model,
        user_id: Optional[str],
        resolve_task_type,
        extra_field: str,
        only_task_type: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Build result entries for one task model.

        Args:
            task_model: Task model class to query for "needs_intervention" rows.
            user_id: Optional user ID filter.
            resolve_task_type: Callable mapping a task row to its task type string.
            extra_field: Name of the model-specific attribute copied into each
                entry ("platform" or "website_url").
            only_task_type: When set, rows whose resolved task type differs
                are skipped.

        Returns:
            One dict per task for which a failure pattern could be computed.
        """
        query = self.db.query(task_model).filter(
            task_model.status == self.INTERVENTION_STATUS
        )
        if user_id:
            query = query.filter(task_model.user_id == user_id)

        entries: List[Dict[str, Any]] = []
        for task in query.all():
            resolved_type = resolve_task_type(task)
            if only_task_type and resolved_type != only_task_type:
                continue
            pattern = self.analyze_task_failures(task.id, resolved_type, task.user_id)
            if not pattern:
                continue
            entries.append({
                "task_id": task.id,
                "task_type": resolved_type,
                "user_id": task.user_id,
                extra_field: getattr(task, extra_field),
                "failure_pattern": {
                    "consecutive_failures": pattern.consecutive_failures,
                    "recent_failures": pattern.recent_failures,
                    "failure_reason": pattern.failure_reason.value,
                    "last_failure_time": pattern.last_failure_time.isoformat() if pattern.last_failure_time else None,
                    "error_patterns": pattern.error_patterns
                },
                "failure_reason": task.failure_reason,
                "last_failure": task.last_failure.isoformat() if task.last_failure else None
            })
        return entries

    def get_tasks_needing_intervention(
        self,
        user_id: Optional[str] = None,
        task_type: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Get all tasks that need human intervention.

        Args:
            user_id: Optional user ID filter.
            task_type: Optional task type filter. "oauth_token_monitoring" and
                "website_analysis" select the matching task model;
                "platform_insights" selects every platform-insights task; any
                other value (e.g. "gsc_insights") selects the platform-insights
                tasks whose "<platform>_insights" type equals it.

        Returns:
            List of task dictionaries with failure pattern info, ordered OAuth
            tasks first, then website analysis, then platform insights. An
            empty list when anything fails (the error is logged).
        """
        try:
            task_type = task_type or None  # empty string means "no filter"
            tasks_needing_intervention: List[Dict[str, Any]] = []

            # Check OAuth token monitoring tasks
            if task_type in (None, "oauth_token_monitoring"):
                from models.oauth_token_monitoring_models import OAuthTokenMonitoringTask
                tasks_needing_intervention.extend(
                    self._collect_intervention_tasks(
                        OAuthTokenMonitoringTask,
                        user_id,
                        resolve_task_type=lambda task: "oauth_token_monitoring",
                        extra_field="platform"
                    )
                )

            # Check website analysis tasks
            if task_type in (None, "website_analysis"):
                from models.website_analysis_monitoring_models import WebsiteAnalysisTask
                tasks_needing_intervention.extend(
                    self._collect_intervention_tasks(
                        WebsiteAnalysisTask,
                        user_id,
                        resolve_task_type=lambda task: "website_analysis",
                        extra_field="website_url"
                    )
                )

            # Check platform insights tasks
            if task_type not in ("oauth_token_monitoring", "website_analysis"):
                from models.platform_insights_monitoring_models import PlatformInsightsTask
                platform_filter = None if task_type in (None, "platform_insights") else task_type
                tasks_needing_intervention.extend(
                    self._collect_intervention_tasks(
                        PlatformInsightsTask,
                        user_id,
                        resolve_task_type=lambda task: f"{task.platform}_insights",
                        extra_field="platform",
                        only_task_type=platform_filter
                    )
                )

            return tasks_needing_intervention
        except Exception as e:
            self.logger.error(f"Error getting tasks needing intervention: {e}", exc_info=True)
            return []

View File

@@ -22,7 +22,8 @@ async def execute_task_async(
scheduler: 'TaskScheduler',
task_type: str,
task: Any,
summary: Optional[Dict[str, Any]] = None
summary: Optional[Dict[str, Any]] = None,
execution_source: str = "scheduler" # "scheduler" or "manual"
):
"""
Execute a single task asynchronously with user isolation.
@@ -98,6 +99,19 @@ async def execute_task_async(
except Exception as e:
logger.debug(f"Could not extract user_id after merge for task {task_id}: {e}")
# Check if task is in cool-off (skip if scheduler-triggered, allow if manual)
if execution_source == "scheduler":
if hasattr(task, 'status') and task.status == "needs_intervention":
logger.warning(
f"[Scheduler] ⏸️ Skipping task {task_id} - marked for human intervention. "
f"Use manual trigger to retry."
)
scheduler.stats['tasks_skipped'] += 1
if summary:
summary.setdefault('skipped', 0)
summary['skipped'] += 1
return
# Get executor for this task type
try:
executor = scheduler.registry.get_executor(task_type)

View File

@@ -86,6 +86,9 @@ class BingInsightsExecutor(TaskExecutor):
task.last_success = datetime.utcnow()
task.status = 'active'
task.failure_reason = None
# Reset failure tracking on success
task.consecutive_failures = 0
task.failure_pattern = None
# Schedule next check (7 days from now)
task.next_check = self.calculate_next_execution(
task=task,
@@ -93,11 +96,41 @@ class BingInsightsExecutor(TaskExecutor):
last_execution=task.last_check
)
else:
# Analyze failure pattern
from services.scheduler.core.failure_detection_service import FailureDetectionService
failure_detection = FailureDetectionService(db)
pattern = failure_detection.analyze_task_failures(
task.id, "bing_insights", task.user_id
)
task.last_failure = datetime.utcnow()
task.failure_reason = result.error_message
task.status = 'failed'
# Schedule retry in 1 day
task.next_check = datetime.utcnow() + timedelta(days=1)
if pattern and pattern.should_cool_off:
# Mark task for human intervention
task.status = "needs_intervention"
task.consecutive_failures = pattern.consecutive_failures
task.failure_pattern = {
"consecutive_failures": pattern.consecutive_failures,
"recent_failures": pattern.recent_failures,
"failure_reason": pattern.failure_reason.value,
"error_patterns": pattern.error_patterns,
"cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
}
# Clear next_check - task won't run automatically
task.next_check = None
self.logger.warning(
f"Task {task.id} marked for human intervention: "
f"{pattern.consecutive_failures} consecutive failures, "
f"reason: {pattern.failure_reason.value}"
)
else:
# Normal failure handling
task.status = 'failed'
task.consecutive_failures = (task.consecutive_failures or 0) + 1
# Schedule retry in 1 day
task.next_check = datetime.utcnow() + timedelta(days=1)
task.updated_at = datetime.utcnow()
db.commit()
@@ -117,12 +150,35 @@ class BingInsightsExecutor(TaskExecutor):
context="Bing insights fetch"
)
# Analyze failure pattern
from services.scheduler.core.failure_detection_service import FailureDetectionService
failure_detection = FailureDetectionService(db)
pattern = failure_detection.analyze_task_failures(
task.id, "bing_insights", task.user_id
)
# Update task
task.last_check = datetime.utcnow()
task.last_failure = datetime.utcnow()
task.failure_reason = str(e)
task.status = 'failed'
task.next_check = datetime.utcnow() + timedelta(days=1)
if pattern and pattern.should_cool_off:
# Mark task for human intervention
task.status = "needs_intervention"
task.consecutive_failures = pattern.consecutive_failures
task.failure_pattern = {
"consecutive_failures": pattern.consecutive_failures,
"recent_failures": pattern.recent_failures,
"failure_reason": pattern.failure_reason.value,
"error_patterns": pattern.error_patterns,
"cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
}
task.next_check = None
else:
task.status = 'failed'
task.consecutive_failures = (task.consecutive_failures or 0) + 1
task.next_check = datetime.utcnow() + timedelta(days=1)
task.updated_at = datetime.utcnow()
db.commit()

View File

@@ -85,6 +85,9 @@ class GSCInsightsExecutor(TaskExecutor):
task.last_success = datetime.utcnow()
task.status = 'active'
task.failure_reason = None
# Reset failure tracking on success
task.consecutive_failures = 0
task.failure_pattern = None
# Schedule next check (7 days from now)
task.next_check = self.calculate_next_execution(
task=task,
@@ -92,11 +95,41 @@ class GSCInsightsExecutor(TaskExecutor):
last_execution=task.last_check
)
else:
# Analyze failure pattern
from services.scheduler.core.failure_detection_service import FailureDetectionService
failure_detection = FailureDetectionService(db)
pattern = failure_detection.analyze_task_failures(
task.id, "gsc_insights", task.user_id
)
task.last_failure = datetime.utcnow()
task.failure_reason = result.error_message
task.status = 'failed'
# Schedule retry in 1 day
task.next_check = datetime.utcnow() + timedelta(days=1)
if pattern and pattern.should_cool_off:
# Mark task for human intervention
task.status = "needs_intervention"
task.consecutive_failures = pattern.consecutive_failures
task.failure_pattern = {
"consecutive_failures": pattern.consecutive_failures,
"recent_failures": pattern.recent_failures,
"failure_reason": pattern.failure_reason.value,
"error_patterns": pattern.error_patterns,
"cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
}
# Clear next_check - task won't run automatically
task.next_check = None
self.logger.warning(
f"Task {task.id} marked for human intervention: "
f"{pattern.consecutive_failures} consecutive failures, "
f"reason: {pattern.failure_reason.value}"
)
else:
# Normal failure handling
task.status = 'failed'
task.consecutive_failures = (task.consecutive_failures or 0) + 1
# Schedule retry in 1 day
task.next_check = datetime.utcnow() + timedelta(days=1)
task.updated_at = datetime.utcnow()
db.commit()
@@ -116,12 +149,35 @@ class GSCInsightsExecutor(TaskExecutor):
context="GSC insights fetch"
)
# Analyze failure pattern
from services.scheduler.core.failure_detection_service import FailureDetectionService
failure_detection = FailureDetectionService(db)
pattern = failure_detection.analyze_task_failures(
task.id, "gsc_insights", task.user_id
)
# Update task
task.last_check = datetime.utcnow()
task.last_failure = datetime.utcnow()
task.failure_reason = str(e)
task.status = 'failed'
task.next_check = datetime.utcnow() + timedelta(days=1)
if pattern and pattern.should_cool_off:
# Mark task for human intervention
task.status = "needs_intervention"
task.consecutive_failures = pattern.consecutive_failures
task.failure_pattern = {
"consecutive_failures": pattern.consecutive_failures,
"recent_failures": pattern.recent_failures,
"failure_reason": pattern.failure_reason.value,
"error_patterns": pattern.error_patterns,
"cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
}
task.next_check = None
else:
task.status = 'failed'
task.consecutive_failures = (task.consecutive_failures or 0) + 1
task.next_check = datetime.utcnow() + timedelta(days=1)
task.updated_at = datetime.utcnow()
db.commit()

View File

@@ -92,6 +92,9 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
task.last_success = datetime.utcnow()
task.status = 'active'
task.failure_reason = None
# Reset failure tracking on success
task.consecutive_failures = 0
task.failure_pattern = None
# Schedule next check (7 days from now)
task.next_check = self.calculate_next_execution(
task=task,
@@ -99,14 +102,44 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
last_execution=task.last_check
)
else:
# Refresh failed - mark as failed and stop automatic retries
# Analyze failure pattern
from services.scheduler.core.failure_detection_service import FailureDetectionService
failure_detection = FailureDetectionService(db)
pattern = failure_detection.analyze_task_failures(
task.id, "oauth_token_monitoring", task.user_id
)
task.last_failure = datetime.utcnow()
task.failure_reason = result.error_message
task.status = 'failed'
# Do NOT update next_check - wait for manual trigger
if pattern and pattern.should_cool_off:
# Mark task for human intervention
task.status = "needs_intervention"
task.consecutive_failures = pattern.consecutive_failures
task.failure_pattern = {
"consecutive_failures": pattern.consecutive_failures,
"recent_failures": pattern.recent_failures,
"failure_reason": pattern.failure_reason.value,
"error_patterns": pattern.error_patterns,
"cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
}
# Clear next_check - task won't run automatically
task.next_check = None
self.logger.warning(
f"Task {task.id} marked for human intervention: "
f"{pattern.consecutive_failures} consecutive failures, "
f"reason: {pattern.failure_reason.value}"
)
else:
# Normal failure handling
task.status = 'failed'
task.consecutive_failures = (task.consecutive_failures or 0) + 1
# Do NOT update next_check - wait for manual trigger
self.logger.warning(
f"OAuth token refresh failed for user {user_id}, platform {platform}. "
f"Task marked as failed. No automatic retry will be scheduled."
f"{'Task marked for human intervention' if pattern and pattern.should_cool_off else 'Task marked as failed. No automatic retry will be scheduled.'}"
)
# Create UsageAlert notification for the user

View File

@@ -106,6 +106,9 @@ class WebsiteAnalysisExecutor(TaskExecutor):
task.last_success = datetime.utcnow()
task.status = 'active'
task.failure_reason = None
# Reset failure tracking on success
task.consecutive_failures = 0
task.failure_pattern = None
# Schedule next check based on frequency_days
task.next_check = self.calculate_next_execution(
task=task,
@@ -123,17 +126,48 @@ class WebsiteAnalysisExecutor(TaskExecutor):
)
return result
else:
# Analyze failure pattern
from services.scheduler.core.failure_detection_service import FailureDetectionService
failure_detection = FailureDetectionService(db)
pattern = failure_detection.analyze_task_failures(
task.id, "website_analysis", task.user_id
)
task.last_failure = datetime.utcnow()
task.failure_reason = result.error_message
task.status = 'failed'
# Do NOT update next_check - wait for manual retry
if pattern and pattern.should_cool_off:
# Mark task for human intervention
task.status = "needs_intervention"
task.consecutive_failures = pattern.consecutive_failures
task.failure_pattern = {
"consecutive_failures": pattern.consecutive_failures,
"recent_failures": pattern.recent_failures,
"failure_reason": pattern.failure_reason.value,
"error_patterns": pattern.error_patterns,
"cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
}
# Clear next_check - task won't run automatically
task.next_check = None
self.logger.warning(
f"Task {task.id} marked for human intervention: "
f"{pattern.consecutive_failures} consecutive failures, "
f"reason: {pattern.failure_reason.value}"
)
else:
# Normal failure handling
task.status = 'failed'
task.consecutive_failures = (task.consecutive_failures or 0) + 1
# Do NOT update next_check - wait for manual retry
# Commit all changes to database
db.commit()
self.logger.warning(
f"Website analysis failed for task {task.id}. "
f"Error: {result.error_message}. Waiting for manual retry."
f"Error: {result.error_message}. "
f"{'Marked for human intervention' if pattern and pattern.should_cool_off else 'Waiting for manual retry'}."
)
return result