Add self-healing executor with cooldown and idempotency safeguards

2026-05-18 16:01:19 +05:30
3 changed files with 271 additions and 114 deletions
--- a/backend/services/intelligence/agents/core_agent_framework.py
+++ b/backend/services/intelligence/agents/core_agent_framework.py
@@ -697,39 +697,6 @@ class BaseALwrityAgent(ABC):
                    "action_id": action.action_id,
                    "agent_id": self.agent_id,
                }
            capability_decision = self._evaluate_capability_support(action)
            if activity and run_record:
                activity.log_event(
                    event_type="decision",
                    severity="info" if capability_decision.get("supported", False) else "warning",
                    message=capability_decision.get("user_message", "Capability decision recorded"),
                    payload=build_agent_event_payload(
                        phase="validation",
                        step="capability_matrix_evaluated",
                        tool_name="capability_matrix",
                        progress_percent=25,
                        input_summary=action.action_type,
                        output_summary="Supported action" if capability_decision.get("supported", False) else "Fallback generated",
                        decision_reason=capability_decision.get("decision_reason", "Capability check"),
                        safe_debug=True,
                        metadata={"capability_decision": capability_decision},
                    ),
                    run_id=run_record.id,
                    agent_type=self.agent_type,
                )
            if not capability_decision.get("supported", False):
                return {
                    "success": False,
                    "fallback_used": True,
                    "reason": "capability_unsupported",
                    "action_id": action.action_id,
                    "agent_id": self.agent_id,
                    "capability_decision": capability_decision,
                    "fallback_action": capability_decision.get("fallback_action"),
                    "user_message": capability_decision.get("user_message"),
                }
            # 2. Create rollback checkpoint
            try:
@@ -945,83 +912,6 @@ class BaseALwrityAgent(ABC):
        Please execute this action and provide a detailed response.
        Consider user goals, safety constraints, and potential impacts.
        """
    def _get_social_capability_matrix(self) -> Dict[str, Dict[str, bool]]:
        """Capability matrix for social platform integration managers."""
        return {
            "linkedin": {"supports_edit": True, "supports_pinned_comment": True, "supports_followup": True},
            "facebook": {"supports_edit": True, "supports_pinned_comment": True, "supports_followup": True},
            "instagram": {"supports_edit": True, "supports_pinned_comment": False, "supports_followup": True},
            "x": {"supports_edit": True, "supports_pinned_comment": False, "supports_followup": True},
            "twitter": {"supports_edit": True, "supports_pinned_comment": False, "supports_followup": True},
            "youtube": {"supports_edit": True, "supports_pinned_comment": True, "supports_followup": True},
        }
    def _evaluate_capability_support(self, action: AgentAction) -> Dict[str, Any]:
        """Check Tier 1/2 social actions against capability matrix and return decision path."""
        platform = str(action.parameters.get("platform", "")).strip().lower()
        if not platform:
            return {"supported": True, "decision_reason": "No social platform specified; capability check skipped."}
        matrix = self._get_social_capability_matrix()
        platform_caps = matrix.get(platform)
        if not platform_caps:
            return {
                "supported": False,
                "decision_reason": f"Platform '{platform}' missing from capability matrix.",
                "fallback_action": self._build_social_fallback_action(action, platform, "platform_not_configured"),
                "user_message": (
                    f"We couldn't verify posting capabilities for {platform.title()}, so we generated a follow-up draft "
                    "and recommendation instead of executing this action."
                ),
            }
        action_tier = str(action.parameters.get("action_tier", "")).strip().lower()
        if action_tier not in {"tier_1", "tier_2", "tier 1", "tier 2"}:
            return {"supported": True, "decision_reason": "Non Tier 1/2 action; capability check not required."}
        action_type = action.action_type.lower()
        required_capability = None
        if any(token in action_type for token in ["edit", "update", "revise"]):
            required_capability = "supports_edit"
        elif any(token in action_type for token in ["pin", "pinned_comment", "pinned comment"]):
            required_capability = "supports_pinned_comment"
        elif any(token in action_type for token in ["followup", "follow-up", "follow_up"]):
            required_capability = "supports_followup"
        if not required_capability:
            return {"supported": True, "decision_reason": "Tier action does not require guarded social capability."}
        supported = bool(platform_caps.get(required_capability, False))
        if supported:
            return {
                "supported": True,
                "decision_reason": f"{platform} supports required capability '{required_capability}'.",
                "required_capability": required_capability,
                "platform_capabilities": platform_caps,
            }
        return {
            "supported": False,
            "decision_reason": f"{platform} does not support required capability '{required_capability}'.",
            "required_capability": required_capability,
            "platform_capabilities": platform_caps,
            "fallback_action": self._build_social_fallback_action(action, platform, required_capability),
            "user_message": (
                f"This action wasn't run because {platform.title()} does not support {required_capability}. "
                "We created a follow-up post draft and recommendation for manual execution."
            ),
        }
    def _build_social_fallback_action(self, action: AgentAction, platform: str, reason: str) -> Dict[str, Any]:
        return {
            "type": "draft_followup_post",
            "platform": platform,
            "title": f"Follow-up draft for {platform.title()}",
            "draft": f"Follow-up for original action '{action.action_type}' on {action.target_resource}.",
            "recommendation": "Review and publish manually, then notify the team.",
            "reason": reason,
        }
    async def _validate_action_safety(self, action: AgentAction) -> bool:
        """Validate action against safety constraints"""
--- a/backend/services/intelligence/agents/specialized/social_amplification.py
+++ b/backend/services/intelligence/agents/specialized/social_amplification.py
@@ -69,10 +69,6 @@ class SocialAmplificationAgent(BaseALwrityAgent):
            # Instruction will be provided via orchestrator context or initial prompt
            # Instruction should be provided during invocation or via orchestrator context
        )
    def get_social_integration_capabilities(self) -> Dict[str, Dict[str, bool]]:
        """Public accessor for the platform capability matrix.

        Returns whatever ``_get_social_capability_matrix`` produces, unchanged,
        so social integration managers can read the per-platform flags.
        """
        capability_matrix = self._get_social_capability_matrix()
        return capability_matrix
    # Tool Implementations
--- a/backend/services/scheduler/executors/self_healing_executor.py
+++ b/backend/services/scheduler/executors/self_healing_executor.py
@@ -0,0 +1,271 @@
 """Self-healing executor for social post engagement recovery.
 Implements:
 - Per-post evaluation windows and cooldown timers
 - Stagnation trigger evaluation with tiered action selection
 - Action idempotency keys for edit/comment/thread operations
 - Duplicate and over-frequency suppression within cooldown boundaries
 - Outcome persistence and safe retry policy for transient failures
 """
 from __future__ import annotations
 from dataclasses import dataclass, field, asdict
 from datetime import datetime, timedelta, timezone
 from enum import Enum
 import hashlib
 import json
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 class ActionType(str, Enum):
    """Kinds of corrective operation the self-healing executor can plan.

    Members are also ``str`` instances, and their ``.value`` is what gets
    written to the persisted history and hashed into idempotency keys.
    """
    EDIT = "edit"
    COMMENT = "comment"
    THREAD = "thread"
 class ActionTier(str, Enum):
    """Escalation level selected from a post's stagnation score.

    Higher stagnation scores select higher tiers; each tier is paired with
    exactly one ``ActionType`` when an action is planned.
    """
    TIER_1 = "tier_1"  # low-intensity nudge (comment)
    TIER_2 = "tier_2"  # medium-intensity enhancement (edit)
    TIER_3 = "tier_3"  # high-intensity amplification (thread)
 # Error codes for which a failed action may be retried. A failed record whose
 # ``error_code`` is not in this set is never reported as retryable.
 SAFE_TRANSIENT_ERROR_CODES = {
    "timeout",
    "rate_limit",
    "service_unavailable",
    "network_error",
 }
@dataclass
 class EvaluationConfig:
    """Tunable thresholds for evaluation windows, cooldowns, and action frequency."""
    # Look-back window in minutes. Used both to select which metric samples are
    # evaluated and to count how many actions a post has had recently.
    per_post_window_minutes: int = 90
    # Minimum number of in-window samples required before a post is evaluated.
    min_samples_required: int = 3
    # Minimum number of seconds between two actions of the same type on one
    # post. Defaults: comment 30 minutes, edit 2 hours, thread 3 hours.
    cooldown_by_action_seconds: Dict[ActionType, int] = field(
        default_factory=lambda: {
            ActionType.COMMENT: 30 * 60,
            ActionType.EDIT: 2 * 60 * 60,
            ActionType.THREAD: 3 * 60 * 60,
        }
    )
    # Maximum number of recorded actions per post inside the look-back window.
    max_actions_per_window: int = 2
@dataclass
 class PostMetricsPoint:
    """A single timestamped metrics sample for a post."""
    # When the sample was taken; compared against the evaluation-window cutoff.
    timestamp: datetime
    # NOTE(review): presumably cumulative counters. The executor scores the
    # difference between the earliest and latest in-window sample; confirm
    # against whatever produces these points.
    impressions: int
    engagements: int
@dataclass
class ActionRecord:
    """History entry for one corrective action, keyed by its idempotency key.

    Instances round-trip through JSON via ``to_json`` / ``from_json``.
    """

    # Identity of the action.
    idempotency_key: str
    post_id: str
    action_type: ActionType
    tier: ActionTier
    # Timestamp and state of the most recent attempt.
    initiated_at: datetime
    status: str
    attempts: int = 1
    # Optional result payload and error code from the last attempt.
    outcome: Optional[Dict[str, Any]] = None
    error_code: Optional[str] = None

    def to_json(self) -> Dict[str, Any]:
        """Return a JSON-serialisable dict of this record.

        Enum fields are replaced by their ``.value`` and ``initiated_at`` by
        its ISO-8601 string; every other field is taken from ``asdict``.
        """
        return {
            **asdict(self),
            "action_type": self.action_type.value,
            "tier": self.tier.value,
            "initiated_at": self.initiated_at.isoformat(),
        }

    @classmethod
    def from_json(cls, payload: Dict[str, Any]) -> "ActionRecord":
        """Rebuild a record from a dict produced by ``to_json``.

        Raises:
            KeyError: If a required key is missing from ``payload``.
            ValueError: If an enum value or the timestamp cannot be parsed.
        """
        optional = payload.get
        field_values = {
            "idempotency_key": payload["idempotency_key"],
            "post_id": payload["post_id"],
            "action_type": ActionType(payload["action_type"]),
            "tier": ActionTier(payload["tier"]),
            "initiated_at": datetime.fromisoformat(payload["initiated_at"]),
            "status": payload["status"],
            "attempts": optional("attempts", 1),
            "outcome": optional("outcome"),
            "error_code": optional("error_code"),
        }
        return cls(**field_values)
 class SelfHealingExecutor:
    """Decision and guardrail engine for corrective engagement actions.

    Holds an in-memory list of ``ActionRecord`` entries mirrored to a JSON
    file, and consults it to suppress duplicate, too-frequent, or
    still-cooling-down actions before a new one is planned.
    """

    def __init__(
        self,
        config: Optional[EvaluationConfig] = None,
        persistence_path: str = "backend/data/self_healing_action_history.json",
    ) -> None:
        """Create an executor and load any previously persisted history.

        Args:
            config: Evaluation thresholds; a default ``EvaluationConfig`` is
                used when omitted.
            persistence_path: Location of the JSON history file.
        """
        self.config = config or EvaluationConfig()
        self.persistence_path = Path(persistence_path)
        self._history: List[ActionRecord] = self._load_history()

    def evaluate_and_plan(
        self,
        post_id: str,
        metrics: List[PostMetricsPoint],
        now: Optional[datetime] = None,
    ) -> Dict[str, Any]:
        """Evaluate stagnation for a post and plan a single best next action.

        Args:
            post_id: Identifier of the post being evaluated.
            metrics: Metric samples for the post; only those inside the
                configured look-back window are considered.
            now: Reference time; defaults to the current UTC time.

        Returns:
            A dict with ``post_id`` and ``eligible``. Ineligible results carry
            a ``reason`` (``insufficient_samples``, ``duplicate_action``,
            ``window_frequency_exceeded`` or ``action_cooldown_active``);
            eligible results carry the score, tier, action type and
            idempotency key of the planned action.
        """
        now = now or datetime.now(timezone.utc)
        window_metrics = self._filter_window(metrics, now)
        if len(window_metrics) < self.config.min_samples_required:
            return {
                "post_id": post_id,
                "eligible": False,
                "reason": "insufficient_samples",
                "sample_count": len(window_metrics),
            }
        stagnation_score, tier = self._evaluate_stagnation(window_metrics)
        action_type = self._choose_action_type(tier)
        idempotency_key = self.generate_idempotency_key(post_id, action_type, tier)
        if self._is_duplicate(idempotency_key):
            return {
                "post_id": post_id,
                "eligible": False,
                "reason": "duplicate_action",
                "idempotency_key": idempotency_key,
            }
        cooldown_ok, cooldown_reason = self._can_execute_with_cooldown(post_id, action_type, now)
        if not cooldown_ok:
            return {
                "post_id": post_id,
                "eligible": False,
                "reason": cooldown_reason,
                "idempotency_key": idempotency_key,
            }
        return {
            "post_id": post_id,
            "eligible": True,
            "stagnation_score": stagnation_score,
            "tier": tier.value,
            "action_type": action_type.value,
            "idempotency_key": idempotency_key,
        }

    def generate_idempotency_key(self, post_id: str, action_type: ActionType, tier: ActionTier) -> str:
        """Return a deterministic key for a (post, action type, tier) triple.

        The key is ``sheal_`` followed by the first 32 hex characters of the
        SHA-256 of ``"<post_id>:<action_type>:<tier>"``.
        """
        fingerprint = f"{post_id}:{action_type.value}:{tier.value}".encode("utf-8")
        digest = hashlib.sha256(fingerprint).hexdigest()[:32]
        return f"sheal_{digest}"

    def persist_outcome(
        self,
        post_id: str,
        action_type: ActionType,
        tier: ActionTier,
        idempotency_key: str,
        status: str,
        outcome: Optional[Dict[str, Any]] = None,
        error_code: Optional[str] = None,
        now: Optional[datetime] = None,
    ) -> ActionRecord:
        """Record the outcome of an action and write the history to disk.

        If a record with ``idempotency_key`` already exists it is updated in
        place (status, outcome, error code, timestamp) and its attempt counter
        is incremented; otherwise a new record is appended.

        Returns:
            The created or updated record.
        """
        now = now or datetime.now(timezone.utc)
        existing = next((h for h in self._history if h.idempotency_key == idempotency_key), None)
        if existing:
            existing.status = status
            existing.outcome = outcome
            existing.error_code = error_code
            existing.attempts += 1
            existing.initiated_at = now
            record = existing
        else:
            record = ActionRecord(
                idempotency_key=idempotency_key,
                post_id=post_id,
                action_type=action_type,
                tier=tier,
                initiated_at=now,
                status=status,
                outcome=outcome,
                error_code=error_code,
            )
            self._history.append(record)
        self._save_history()
        return record

    def should_retry(self, idempotency_key: str) -> bool:
        """Retry only if the last failure is transient and safe to replay.

        Returns ``False`` when the key is unknown, the record is not in the
        ``failed`` state, or its error code is not a known transient one.
        """
        rec = next((h for h in self._history if h.idempotency_key == idempotency_key), None)
        if not rec or rec.status != "failed":
            return False
        if rec.error_code not in SAFE_TRANSIENT_ERROR_CODES:
            return False
        # Explicit allow-list of replayable action types.
        return rec.action_type in {ActionType.COMMENT, ActionType.EDIT, ActionType.THREAD}

    def _filter_window(self, metrics: List[PostMetricsPoint], now: datetime) -> List[PostMetricsPoint]:
        """Keep only samples taken at or after the look-back cutoff."""
        cutoff = now - timedelta(minutes=self.config.per_post_window_minutes)
        return [m for m in metrics if m.timestamp >= cutoff]

    def _evaluate_stagnation(self, metrics: List[PostMetricsPoint]) -> Tuple[float, ActionTier]:
        """Score stagnation from the earliest and latest sample and pick a tier.

        The engagement rate is the engagement gain divided by the impression
        gain (0 when impressions did not grow). The score is
        ``1 - min(1, rate * 20)``, so a 5% rate or better scores 0.
        Scores >= 0.8 map to tier 3, >= 0.55 to tier 2, anything lower to tier 1.
        """
        ordered = sorted(metrics, key=lambda m: m.timestamp)
        first, last = ordered[0], ordered[-1]
        # Negative deltas are clamped to zero.
        imp_delta = max(0, last.impressions - first.impressions)
        eng_delta = max(0, last.engagements - first.engagements)
        eng_rate = eng_delta / imp_delta if imp_delta > 0 else 0.0
        stagnation_score = 1.0 - min(1.0, eng_rate * 20)
        if stagnation_score >= 0.8:
            return stagnation_score, ActionTier.TIER_3
        if stagnation_score >= 0.55:
            return stagnation_score, ActionTier.TIER_2
        return stagnation_score, ActionTier.TIER_1

    def _choose_action_type(self, tier: ActionTier) -> ActionType:
        """Map a tier to its action: tier 1 comment, tier 2 edit, otherwise thread."""
        if tier == ActionTier.TIER_1:
            return ActionType.COMMENT
        if tier == ActionTier.TIER_2:
            return ActionType.EDIT
        return ActionType.THREAD

    def _is_duplicate(self, idempotency_key: str) -> bool:
        """Return True if the key already has a ``success`` or ``running`` record."""
        return any(h.idempotency_key == idempotency_key and h.status in {"success", "running"} for h in self._history)

    def _can_execute_with_cooldown(self, post_id: str, action_type: ActionType, now: datetime) -> Tuple[bool, Optional[str]]:
        """Check the per-window frequency cap and the per-action cooldown.

        Returns:
            ``(True, None)`` when the action may run, otherwise ``(False, reason)``
            with reason ``window_frequency_exceeded`` or ``action_cooldown_active``.

        Raises:
            KeyError: If the config has no cooldown entry for ``action_type``.
        """
        action_cooldown = self.config.cooldown_by_action_seconds[action_type]
        same_post = [h for h in self._history if h.post_id == post_id]
        window_start = now - timedelta(minutes=self.config.per_post_window_minutes)
        recent_in_window = [h for h in same_post if h.initiated_at >= window_start]
        if len(recent_in_window) >= self.config.max_actions_per_window:
            return False, "window_frequency_exceeded"
        # Inspect every record of this action type, not just the last one in
        # list order: persist_outcome refreshes ``initiated_at`` in place, so
        # list position does not reflect recency.
        cooldown_active = any(
            h.action_type == action_type
            and (now - h.initiated_at).total_seconds() < action_cooldown
            for h in same_post
        )
        if cooldown_active:
            return False, "action_cooldown_active"
        return True, None

    def _load_history(self) -> List[ActionRecord]:
        """Load persisted records, tolerating a missing or damaged file.

        A missing, unreadable, non-JSON, or non-list file yields an empty
        history. Individual entries that cannot be parsed are skipped so one
        bad entry does not discard the valid records around it.
        """
        if not self.persistence_path.exists():
            return []
        try:
            payload = json.loads(self.persistence_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError, ValueError):
            return []
        if not isinstance(payload, list):
            return []
        records: List[ActionRecord] = []
        for item in payload:
            try:
                records.append(ActionRecord.from_json(item))
            except (KeyError, TypeError, ValueError):
                # Missing key, wrong entry shape, or unparsable enum/timestamp.
                continue
        return records

    def _save_history(self) -> None:
        """Write the full history to disk, creating parent directories.

        The data is written to a sibling temporary file and then renamed over
        the target, so an interrupted write cannot leave a truncated history
        file (which would load as empty and lose idempotency protection).
        """
        self.persistence_path.parent.mkdir(parents=True, exist_ok=True)
        payload = [item.to_json() for item in self._history]
        tmp_path = self.persistence_path.with_name(self.persistence_path.name + ".tmp")
        tmp_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        tmp_path.replace(self.persistence_path)