Files
ALwrity/backend/services/intelligence/agents/performance_monitor.py

129 lines
4.3 KiB
Python

"""
Agent Performance Monitoring Framework for ALwrity Autonomous Marketing Agents
Tracks agent performance, efficiency, and provides optimization recommendations
"""
import asyncio
import json
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from enum import Enum
from collections import defaultdict, deque
from utils.logger_utils import get_service_logger
from services.database import get_session_for_user
logger = get_service_logger(__name__)
class AgentStatus(Enum):
IDLE = "idle"
BUSY = "busy"
ERROR = "error"
OFFLINE = "offline"
INITIALIZING = "initializing"
class PerformanceMetric(Enum):
RESPONSE_TIME = "response_time"
SUCCESS_RATE = "success_rate"
TOKEN_USAGE = "token_usage"
COST_PER_ACTION = "cost_per_action"
RESOURCE_UTILIZATION = "resource_utilization"
GOAL_COMPLETION_RATE = "goal_completion_rate"
@dataclass
class AgentPerformanceMetrics:
agent_id: str
timestamp: datetime
metrics: Dict[str, float]
context: Dict[str, Any]
class PerformanceMonitor:
"""
Monitors and analyzes agent performance metrics
"""
def __init__(self):
self.metrics_buffer = deque(maxlen=1000)
self.performance_history = defaultdict(list)
self.alert_thresholds = {
PerformanceMetric.SUCCESS_RATE: 0.8, # Alert if success rate < 80%
PerformanceMetric.RESPONSE_TIME: 30.0, # Alert if response time > 30s
PerformanceMetric.GOAL_COMPLETION_RATE: 0.7 # Alert if completion < 70%
}
async def record_metric(self,
agent_id: str,
metric_type: PerformanceMetric,
value: float,
context: Optional[Dict[str, Any]] = None):
"""Record a performance metric for an agent"""
metric_entry = AgentPerformanceMetrics(
agent_id=agent_id,
timestamp=datetime.utcnow(),
metrics={metric_type.value: value},
context=context or {}
)
self.metrics_buffer.append(metric_entry)
self.performance_history[agent_id].append(metric_entry)
# Check thresholds
await self._check_thresholds(agent_id, metric_type, value)
# Persist if needed (batching implemented in production)
# await self._persist_metric(metric_entry)
async def get_agent_performance(self, agent_id: str, time_window_minutes: int = 60) -> Dict[str, Any]:
"""Get aggregated performance metrics for an agent"""
cutoff_time = datetime.utcnow() - timedelta(minutes=time_window_minutes)
relevant_metrics = [
m for m in self.performance_history[agent_id]
if m.timestamp > cutoff_time
]
if not relevant_metrics:
return {}
aggregated = defaultdict(list)
for m in relevant_metrics:
for k, v in m.metrics.items():
aggregated[k].append(v)
result = {
"agent_id": agent_id,
"period_minutes": time_window_minutes,
"sample_size": len(relevant_metrics),
"metrics": {
k: sum(v) / len(v) for k, v in aggregated.items()
}
}
return result
async def _check_thresholds(self, agent_id: str, metric_type: PerformanceMetric, value: float):
"""Check if metric violates thresholds"""
threshold = self.alert_thresholds.get(metric_type)
if not threshold:
return
is_violation = False
if metric_type in [PerformanceMetric.SUCCESS_RATE, PerformanceMetric.GOAL_COMPLETION_RATE]:
if value < threshold:
is_violation = True
elif value > threshold:
is_violation = True
if is_violation:
logger.warning(
f"Performance alert for agent {agent_id}: "
f"{metric_type.value} = {value} (Threshold: {threshold})"
)
# Trigger alert notification (impl via notification service)
# Singleton instance
performance_monitor = PerformanceMonitor()
AgentPerformanceMonitor = PerformanceMonitor
performance_service = performance_monitor