Compare commits

..

1 Commits

Author SHA1 Message Date
ي
87925c8fdc Add tiered anomaly policy and safety lock arbitration 2026-05-18 16:01:43 +05:30
6 changed files with 166 additions and 91 deletions

View File

@@ -40,10 +40,6 @@ class OAuthTokenMonitoringTask(Base):
# Scheduling
next_check = Column(DateTime, nullable=True, index=True) # Next scheduled check time
next_retry_at = Column(DateTime, nullable=True, index=True) # Backoff retry schedule for refresh failures
refresh_attempts = Column(Integer, default=0) # Current retry attempt count for refresh workflow
terminal_failure_reason = Column(Text, nullable=True) # Permanent failure reason requiring user action
channel_status = Column(String(32), default='connected') # connected, degraded, disconnected
# Metadata
created_at = Column(DateTime, default=datetime.utcnow)
@@ -101,3 +97,4 @@ class OAuthTokenExecutionLog(Base):
def __repr__(self):
return f"<OAuthTokenExecutionLog(id={self.id}, task_id={self.task_id}, status={self.status}, execution_date={self.execution_date})>"

View File

@@ -99,6 +99,17 @@ class OptimizationRecommendation:
expires = datetime.utcnow().timestamp() + (7 * 24 * 60 * 60)
self.expires_at = datetime.fromtimestamp(expires).isoformat()
@dataclass
class TierPolicyConfig:
"""Structured policy for anomaly tiers and remediation controls"""
tier: int
trigger_metrics: List[str]
thresholds: Dict[str, float]
max_iterations: int
lock_criteria: Dict[str, Any]
class AgentPerformanceMonitor:
"""Main performance monitoring system for agents"""
@@ -108,6 +119,32 @@ class AgentPerformanceMonitor:
self.agent_snapshots: Dict[str, AgentPerformanceSnapshot] = {}
self.recommendations: List[OptimizationRecommendation] = []
self.performance_history: deque = deque(maxlen=1000) # Keep last 1000 data points
self.systemic_alerts: List[Dict[str, Any]] = []
# Structured tier policy config
self.tier_policy_config: Dict[int, TierPolicyConfig] = {
1: TierPolicyConfig(
tier=1,
trigger_metrics=["success_rate", "efficiency_score", "response_time"],
thresholds={"success_rate": 0.80, "efficiency_score": 0.65, "response_time": 45.0},
max_iterations=3,
lock_criteria={"min_confidence": 0.85, "consecutive_failures": 6}
),
2: TierPolicyConfig(
tier=2,
trigger_metrics=["success_rate", "efficiency_score", "response_time", "market_impact"],
thresholds={"success_rate": 0.70, "efficiency_score": 0.50, "response_time": 60.0, "market_impact": 0.35},
max_iterations=2,
lock_criteria={"min_confidence": 0.75, "consecutive_failures": 4}
),
3: TierPolicyConfig(
tier=3,
trigger_metrics=["success_rate", "efficiency_score", "response_time", "market_impact"],
thresholds={"success_rate": 0.55, "efficiency_score": 0.35, "response_time": 90.0, "market_impact": 0.25},
max_iterations=1,
lock_criteria={"min_confidence": 0.65, "consecutive_failures": 3}
)
}
# Performance thresholds and targets
self.performance_targets = {
@@ -513,6 +550,54 @@ class AgentPerformanceMonitor:
}
return priority_weights.get(priority, 0)
def _build_recommended_action_payload(self, agent_id: str, snapshot: AgentPerformanceSnapshot) -> Dict[str, Any]:
"""Build recommended action payload including tier and confidence."""
tier = 1
if (snapshot.success_rate <= self.tier_policy_config[3].thresholds["success_rate"] or
snapshot.efficiency_score <= self.tier_policy_config[3].thresholds["efficiency_score"] or
snapshot.average_response_time >= self.tier_policy_config[3].thresholds["response_time"] or
snapshot.market_impact_score <= self.tier_policy_config[3].thresholds["market_impact"]):
tier = 3
elif (snapshot.success_rate <= self.tier_policy_config[2].thresholds["success_rate"] or
snapshot.efficiency_score <= self.tier_policy_config[2].thresholds["efficiency_score"] or
snapshot.average_response_time >= self.tier_policy_config[2].thresholds["response_time"] or
snapshot.market_impact_score <= self.tier_policy_config[2].thresholds["market_impact"]):
tier = 2
confidence = round(max(0.0, min(1.0, 1.0 - abs(0.75 - self._calculate_health_score(snapshot)))) , 2)
policy = self.tier_policy_config[tier]
return {
"agent_id": agent_id,
"tier": tier,
"confidence": confidence,
"max_iterations": policy.max_iterations,
"lock_criteria": policy.lock_criteria,
"trigger_metrics": policy.trigger_metrics
}
def _route_tier3_systemic_alert(self, action_payload: Dict[str, Any], alerts: List[Dict[str, Any]]) -> None:
"""Route Tier 3 systemic anomalies to alerting subsystem with diagnostic brief."""
diagnostic_brief = {
"type": "systemic_anomaly",
"severity": "critical",
"tier": 3,
"confidence": action_payload.get("confidence", 0.0),
"agent_id": action_payload.get("agent_id"),
"timestamp": datetime.utcnow().isoformat(),
"diagnostic_brief": {
"trigger_metrics": action_payload.get("trigger_metrics", []),
"alerts": alerts,
"max_iterations": action_payload.get("max_iterations"),
"lock_criteria": action_payload.get("lock_criteria", {})
}
}
self.systemic_alerts.append(diagnostic_brief)
if len(self.systemic_alerts) > 200:
self.systemic_alerts = self.systemic_alerts[-200:]
logger.critical(f"[ALERTING_SUBSYSTEM] Tier 3 systemic anomaly routed: {json.dumps(diagnostic_brief)}")
async def get_performance_alerts(self, agent_id: str) -> List[Dict[str, Any]]:
"""Get performance alerts for an agent"""
alerts = []
@@ -574,6 +659,13 @@ class AgentPerformanceMonitor:
"timestamp": datetime.utcnow().isoformat()
})
action_payload = self._build_recommended_action_payload(agent_id, snapshot)
if action_payload["tier"] == 3:
self._route_tier3_systemic_alert(action_payload, alerts)
for alert in alerts:
alert["recommended_action"] = action_payload
return alerts
except Exception as e:

View File

@@ -84,6 +84,17 @@ class SafetyValidation:
if self.validation_timestamp is None:
self.validation_timestamp = datetime.utcnow().isoformat()
@dataclass
class SafetyArbitrationDecision:
"""Explicit allow/deny/lock decision with reasons."""
decision: str
reasons: List[str]
tier: int
confidence: float
lock_state_active: bool
class SafetyConstraintManager:
"""Manages safety constraints for agent actions"""
@@ -92,6 +103,8 @@ class SafetyConstraintManager:
self.constraints: Dict[str, SafetyConstraint] = {}
self.action_history: List[Dict[str, Any]] = []
self.violation_history: List[Dict[str, Any]] = []
self.lock_state_active: bool = False
self.lock_state_reason: Optional[str] = None
# Initialize default constraints
self._initialize_default_constraints()
@@ -163,6 +176,17 @@ class SafetyConstraintManager:
"""Validate an action against safety constraints"""
try:
logger.info(f"Validating action for user {self.user_id}: {action_data.get('action_type', 'unknown')}")
if self.lock_state_active and action_data.get("autonomous_modification", True):
reason = self.lock_state_reason or "Safety lock is active due to Tier 3 systemic anomaly"
return SafetyValidation(
is_valid=False,
risk_level=RiskLevel.CRITICAL,
violations=["Autonomous modifications blocked while lock state is active"],
recommendations=[reason],
requires_approval=True,
confidence_score=1.0
)
violations = []
recommendations = []
@@ -207,19 +231,29 @@ class SafetyConstraintManager:
# Final validation
is_valid = len(violations) == 0 and not requires_approval
logger.info(f"Action validation completed for user {self.user_id}. Valid: {is_valid}, Risk: {risk_level.value}, Violations: {len(violations)}")
confidence_score = max(0.0, min(1.0, confidence_score))
arbitration = self._arbitrate_decision(action_data, risk_level, violations, requires_approval, confidence_score)
if arbitration.decision == "lock":
self.lock_state_active = True
self.lock_state_reason = "; ".join(arbitration.reasons)
is_valid = False
requires_approval = True
recommendations.extend([f"Arbitration decision: {arbitration.decision}", *arbitration.reasons])
logger.info(f"Action validation completed for user {self.user_id}. Decision: {arbitration.decision}, Valid: {is_valid}, Risk: {risk_level.value}, Violations: {len(violations)}")
# Record in history
await self._record_validation_history(action_data, is_valid, violations)
return SafetyValidation(
is_valid=is_valid,
risk_level=risk_level,
violations=violations,
recommendations=recommendations,
requires_approval=requires_approval,
confidence_score=max(0.0, min(1.0, confidence_score))
confidence_score=confidence_score
)
except Exception as e:
@@ -235,6 +269,30 @@ class SafetyConstraintManager:
confidence_score=0.0
)
def _arbitrate_decision(self, action_data: Dict[str, Any], risk_level: RiskLevel, violations: List[str], requires_approval: bool, confidence_score: float) -> SafetyArbitrationDecision:
"""Arbitrate allow/deny/lock with explicit reasons."""
reasons: List[str] = []
tier = int(action_data.get("recommended_tier", 1))
if self.lock_state_active:
reasons.append("Existing lock state is active")
return SafetyArbitrationDecision("lock", reasons, tier, confidence_score, True)
if tier >= 3 or risk_level == RiskLevel.CRITICAL:
reasons.append("Tier 3 systemic anomaly or critical risk detected")
if violations:
reasons.extend(violations)
return SafetyArbitrationDecision("lock", reasons, 3, confidence_score, True)
if violations or requires_approval:
reasons.append("Safety policy violation or approval requirement triggered")
reasons.extend(violations)
return SafetyArbitrationDecision("deny", reasons, tier, confidence_score, False)
reasons.append("No policy violations detected")
return SafetyArbitrationDecision("allow", reasons, tier, confidence_score, False)
def _determine_action_category(self, action_type: str) -> ActionCategory:
"""Determine the category of an action"""
action_type_lower = action_type.lower()

View File

@@ -26,10 +26,7 @@ from .executors.advertools_executor import AdvertoolsExecutor
from .executors.sif_indexing_executor import SIFIndexingExecutor
from .executors.market_trends_executor import MarketTrendsExecutor
from .utils.task_loader import load_due_monitoring_tasks
from .utils.oauth_token_task_loader import (
load_due_oauth_token_monitoring_tasks,
load_near_expiry_oauth_token_tasks
)
from .utils.oauth_token_task_loader import load_due_oauth_token_monitoring_tasks
from .utils.website_analysis_task_loader import load_due_website_analysis_tasks
from .utils.onboarding_full_website_analysis_task_loader import load_due_onboarding_full_website_analysis_tasks
from .utils.deep_competitor_analysis_task_loader import load_due_deep_competitor_analysis_tasks
@@ -73,11 +70,6 @@ def get_scheduler() -> TaskScheduler:
oauth_token_executor,
load_due_oauth_token_monitoring_tasks
)
_scheduler_instance.register_executor(
'oauth_token_refresh',
oauth_token_executor,
load_near_expiry_oauth_token_tasks
)
# Register website analysis executor
website_analysis_executor = WebsiteAnalysisExecutor()

View File

@@ -42,8 +42,6 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
self.exception_handler = SchedulerExceptionHandler()
# Expiration warning window (7 days before expiration)
self.expiration_warning_days = 7
self.max_refresh_retries = 3
self.base_retry_backoff_minutes = 15
async def execute_task(self, task: OAuthTokenMonitoringTask, db: Session) -> TaskExecutionResult:
"""
@@ -95,10 +93,6 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
task.last_success = datetime.utcnow()
task.status = 'active'
task.failure_reason = None
task.terminal_failure_reason = None
task.channel_status = 'connected'
task.refresh_attempts = 0
task.next_retry_at = None
# Reset failure tracking on success
task.consecutive_failures = 0
task.failure_pattern = None
@@ -118,7 +112,6 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
task.last_failure = datetime.utcnow()
task.failure_reason = result.error_message
task.refresh_attempts = (task.refresh_attempts or 0) + 1
if pattern and pattern.should_cool_off:
# Mark task for human intervention
@@ -133,9 +126,6 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
}
# Clear next_check - task won't run automatically
task.next_check = None
task.next_retry_at = None
task.channel_status = "disconnected"
task.terminal_failure_reason = result.error_message
self.logger.warning(
f"Task {task.id} marked for human intervention: "
@@ -143,17 +133,10 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
f"reason: {pattern.failure_reason.value}"
)
else:
# Normal failure handling
task.status = 'failed'
task.consecutive_failures = (task.consecutive_failures or 0) + 1
if task.refresh_attempts >= self.max_refresh_retries:
task.status = 'failed'
task.channel_status = 'disconnected'
task.terminal_failure_reason = result.error_message
task.next_retry_at = None
else:
task.status = 'degraded'
task.channel_status = 'degraded'
delay_minutes = self.base_retry_backoff_minutes * (2 ** (task.refresh_attempts - 1))
task.next_retry_at = datetime.utcnow() + timedelta(minutes=delay_minutes)
# Do NOT update next_check - wait for manual trigger
self.logger.warning(
f"OAuth token refresh failed for user {user_id}, platform {platform}. "
@@ -161,7 +144,7 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
)
# Create UsageAlert notification for the user
self._create_failure_alert(user_id, platform, result.error_message, result.result_data, db, task)
self._create_failure_alert(user_id, platform, result.error_message, result.result_data, db)
task.updated_at = datetime.utcnow()
db.commit()
@@ -210,14 +193,12 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
task.last_failure = datetime.utcnow()
task.failure_reason = str(e)
task.status = 'failed'
task.channel_status = 'disconnected'
task.terminal_failure_reason = str(e)
task.last_check = datetime.utcnow()
task.updated_at = datetime.utcnow()
task.next_retry_at = None
# Do NOT update next_check - wait for manual trigger
# Create UsageAlert notification for the user
self._create_failure_alert(user_id, task.platform, str(e), None, db, task)
self._create_failure_alert(user_id, task.platform, str(e), None, db)
db.commit()
except Exception as commit_error:
@@ -670,8 +651,7 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
platform: str,
error_message: str,
result_data: Optional[Dict[str, Any]],
db: Session,
task: Optional[OAuthTokenMonitoringTask] = None
db: Session
):
"""
Create a UsageAlert notification when OAuth token refresh fails.
@@ -743,20 +723,6 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
# Get current billing period (YYYY-MM format)
from datetime import datetime
billing_period = datetime.utcnow().strftime("%Y-%m")
alert_payload = {
"requires_user_action": True,
"platform": platform,
"channel_status": getattr(task, "channel_status", "disconnected"),
"terminal_failure_reason": getattr(task, "terminal_failure_reason", error_message),
"next_retry_at": (
task.next_retry_at.isoformat() if task and task.next_retry_at else None
),
"refresh_attempts": getattr(task, "refresh_attempts", 0),
"max_refresh_retries": self.max_refresh_retries,
}
message = f"{message} [ALERT_PAYLOAD] {alert_payload}"
# Create UsageAlert
alert = UsageAlert(
@@ -820,3 +786,4 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
f"Defaulting to Weekly (7 days)."
)
return last_execution + timedelta(days=7)

View File

@@ -3,7 +3,7 @@ OAuth Token Monitoring Task Loader
Functions to load due OAuth token monitoring tasks from database.
"""
from datetime import datetime, timedelta
from datetime import datetime
from typing import List, Optional, Union
from sqlalchemy.orm import Session
from sqlalchemy import and_, or_
@@ -52,34 +52,3 @@ def load_due_oauth_token_monitoring_tasks(
return query.all()
def load_near_expiry_oauth_token_tasks(
db: Session,
refresh_horizon_hours: int = 24,
user_id: Optional[Union[str, int]] = None
) -> List[OAuthTokenMonitoringTask]:
"""
Load OAuth tasks that should run token refresh logic soon.
Includes:
- tasks with a scheduled retry now due (next_retry_at <= now)
- tasks whose routine check is inside the near-expiry horizon window
"""
now = datetime.utcnow()
horizon = now + timedelta(hours=max(refresh_horizon_hours, 1))
query = db.query(OAuthTokenMonitoringTask).filter(
and_(
OAuthTokenMonitoringTask.status.in_(['active', 'failed', 'degraded']),
or_(
OAuthTokenMonitoringTask.next_retry_at <= now,
OAuthTokenMonitoringTask.next_check <= horizon,
OAuthTokenMonitoringTask.next_check.is_(None)
)
)
)
if user_id is not None:
query = query.filter(OAuthTokenMonitoringTask.user_id == str(user_id))
return query.all()