Recovered critical missing components: PerformanceMonitor, MarketSignalDetector, and SemanticDashboard
This commit is contained in:
@@ -17,112 +17,747 @@ from services.database import get_session_for_user
|
||||
|
||||
logger = get_service_logger(__name__)
|
||||
|
||||
class AgentStatus(Enum):
|
||||
IDLE = "idle"
|
||||
BUSY = "busy"
|
||||
ERROR = "error"
|
||||
OFFLINE = "offline"
|
||||
INITIALIZING = "initializing"
|
||||
|
||||
class PerformanceMetric(Enum):
|
||||
"""Types of performance metrics tracked"""
|
||||
RESPONSE_TIME = "response_time"
|
||||
SUCCESS_RATE = "success_rate"
|
||||
TOKEN_USAGE = "token_usage"
|
||||
COST_PER_ACTION = "cost_per_action"
|
||||
RESOURCE_UTILIZATION = "resource_utilization"
|
||||
GOAL_COMPLETION_RATE = "goal_completion_rate"
|
||||
EFFICIENCY_SCORE = "efficiency_score"
|
||||
RESOURCE_USAGE = "resource_usage"
|
||||
USER_SATISFACTION = "user_satisfaction"
|
||||
MARKET_IMPACT = "market_impact"
|
||||
|
||||
class AgentStatus(Enum):
|
||||
"""Status of agent operations"""
|
||||
ACTIVE = "active"
|
||||
IDLE = "idle"
|
||||
PROCESSING = "processing"
|
||||
ERROR = "error"
|
||||
MAINTENANCE = "maintenance"
|
||||
|
||||
@dataclass
|
||||
class AgentPerformanceMetrics:
|
||||
agent_id: str
|
||||
timestamp: datetime
|
||||
metrics: Dict[str, float]
|
||||
class PerformanceDataPoint:
|
||||
"""Single performance data point"""
|
||||
timestamp: str
|
||||
metric_type: PerformanceMetric
|
||||
value: float
|
||||
context: Dict[str, Any]
|
||||
agent_id: str
|
||||
user_id: str
|
||||
|
||||
class PerformanceMonitor:
|
||||
"""
|
||||
Monitors and analyzes agent performance metrics
|
||||
"""
|
||||
@dataclass
|
||||
class AgentPerformanceSnapshot:
|
||||
"""Complete performance snapshot for an agent"""
|
||||
agent_id: str
|
||||
user_id: str
|
||||
timestamp: str
|
||||
status: AgentStatus
|
||||
total_actions: int
|
||||
successful_actions: int
|
||||
failed_actions: int
|
||||
average_response_time: float
|
||||
success_rate: float
|
||||
efficiency_score: float
|
||||
resource_usage: Dict[str, float]
|
||||
market_impact_score: float
|
||||
last_action_at: str
|
||||
|
||||
def __post_init__(self):
|
||||
if self.timestamp is None:
|
||||
self.timestamp = datetime.utcnow().isoformat()
|
||||
|
||||
@dataclass
|
||||
class PerformanceTrend:
|
||||
"""Performance trend analysis"""
|
||||
metric_type: PerformanceMetric
|
||||
trend_direction: str # "improving", "declining", "stable"
|
||||
trend_strength: float # 0.0 to 1.0
|
||||
change_rate: float # Percentage change per time unit
|
||||
confidence: float # 0.0 to 1.0
|
||||
period_start: str
|
||||
period_end: str
|
||||
|
||||
@dataclass
|
||||
class OptimizationRecommendation:
|
||||
"""Performance optimization recommendation"""
|
||||
recommendation_id: str
|
||||
agent_id: str
|
||||
user_id: str
|
||||
recommendation_type: str
|
||||
priority: str # "high", "medium", "low"
|
||||
description: str
|
||||
expected_impact: float # Expected improvement in performance
|
||||
implementation_steps: List[str]
|
||||
estimated_effort: str # "low", "medium", "high"
|
||||
created_at: str
|
||||
expires_at: str
|
||||
|
||||
def __post_init__(self):
|
||||
if self.created_at is None:
|
||||
self.created_at = datetime.utcnow().isoformat()
|
||||
if self.expires_at is None:
|
||||
# Default expiration: 7 days
|
||||
expires = datetime.utcnow().timestamp() + (7 * 24 * 60 * 60)
|
||||
self.expires_at = datetime.fromtimestamp(expires).isoformat()
|
||||
|
||||
class AgentPerformanceMonitor:
|
||||
"""Main performance monitoring system for agents"""
|
||||
|
||||
def __init__(self, user_id: str):
|
||||
self.user_id = user_id
|
||||
self.performance_data: Dict[str, List[PerformanceDataPoint]] = defaultdict(list)
|
||||
self.agent_snapshots: Dict[str, AgentPerformanceSnapshot] = {}
|
||||
self.recommendations: List[OptimizationRecommendation] = []
|
||||
self.performance_history: deque = deque(maxlen=1000) # Keep last 1000 data points
|
||||
|
||||
# Performance thresholds and targets
|
||||
self.performance_targets = {
|
||||
"success_rate": 0.85, # 85% success rate target
|
||||
"response_time": 30.0, # 30 seconds average response time target
|
||||
"efficiency_score": 0.75, # 75% efficiency score target
|
||||
"market_impact": 0.60 # 60% market impact score target
|
||||
}
|
||||
|
||||
# Alert thresholds
|
||||
self.alert_thresholds = {
|
||||
"success_rate": 0.70, # Alert if below 70%
|
||||
"response_time": 60.0, # Alert if above 60 seconds
|
||||
"efficiency_score": 0.50, # Alert if below 50%
|
||||
"market_impact": 0.30 # Alert if below 30%
|
||||
}
|
||||
|
||||
logger.info(f"Initialized AgentPerformanceMonitor for user: {user_id}")
|
||||
|
||||
async def record_performance_data(self, agent_id: str, metric_type: PerformanceMetric, value: float, context: Dict[str, Any] = None) -> bool:
|
||||
"""Record a performance data point"""
|
||||
try:
|
||||
if context is None:
|
||||
context = {}
|
||||
|
||||
data_point = PerformanceDataPoint(
|
||||
timestamp=datetime.utcnow().isoformat(),
|
||||
metric_type=metric_type,
|
||||
value=value,
|
||||
context=context,
|
||||
agent_id=agent_id,
|
||||
user_id=self.user_id
|
||||
)
|
||||
|
||||
# Store in performance data
|
||||
self.performance_data[agent_id].append(data_point)
|
||||
self.performance_history.append(data_point)
|
||||
|
||||
# Keep only recent data (last 24 hours for real-time analysis)
|
||||
cutoff_time = datetime.utcnow().timestamp() - (24 * 60 * 60)
|
||||
self.performance_data[agent_id] = [
|
||||
dp for dp in self.performance_data[agent_id]
|
||||
if datetime.fromisoformat(dp.timestamp).timestamp() > cutoff_time
|
||||
]
|
||||
|
||||
logger.debug(f"Recorded performance data for agent {agent_id}: {metric_type.value} = {value}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error recording performance data for agent {agent_id}: {e}")
|
||||
return False
|
||||
|
||||
async def update_agent_snapshot(self, agent_id: str, status: AgentStatus, action_result: Dict[str, Any] = None) -> AgentPerformanceSnapshot:
|
||||
"""Update performance snapshot for an agent"""
|
||||
try:
|
||||
# Get recent performance data
|
||||
recent_data = self.performance_data[agent_id]
|
||||
|
||||
# Calculate metrics from recent data
|
||||
total_actions = len([dp for dp in recent_data if dp.metric_type == PerformanceMetric.SUCCESS_RATE])
|
||||
successful_actions = len([dp for dp in recent_data if dp.metric_type == PerformanceMetric.SUCCESS_RATE and dp.value > 0.5])
|
||||
failed_actions = total_actions - successful_actions
|
||||
|
||||
# Calculate average response time
|
||||
response_time_data = [dp.value for dp in recent_data if dp.metric_type == PerformanceMetric.RESPONSE_TIME]
|
||||
avg_response_time = sum(response_time_data) / len(response_time_data) if response_time_data else 0.0
|
||||
|
||||
# Calculate success rate
|
||||
success_rate = successful_actions / total_actions if total_actions > 0 else 0.0
|
||||
|
||||
# Calculate efficiency score
|
||||
efficiency_data = [dp.value for dp in recent_data if dp.metric_type == PerformanceMetric.EFFICIENCY_SCORE]
|
||||
avg_efficiency = sum(efficiency_data) / len(efficiency_data) if efficiency_data else 0.0
|
||||
|
||||
# Calculate market impact
|
||||
market_impact_data = [dp.value for dp in recent_data if dp.metric_type == PerformanceMetric.MARKET_IMPACT]
|
||||
avg_market_impact = sum(market_impact_data) / len(market_impact_data) if market_impact_data else 0.0
|
||||
|
||||
# Get resource usage
|
||||
resource_usage = self._calculate_resource_usage(agent_id, recent_data)
|
||||
|
||||
# Get last action time
|
||||
last_action_at = max([dp.timestamp for dp in recent_data], default=datetime.utcnow().isoformat()) if recent_data else datetime.utcnow().isoformat()
|
||||
|
||||
# Create snapshot
|
||||
snapshot = AgentPerformanceSnapshot(
|
||||
agent_id=agent_id,
|
||||
user_id=self.user_id,
|
||||
timestamp=datetime.utcnow().isoformat(),
|
||||
status=status,
|
||||
total_actions=total_actions,
|
||||
successful_actions=successful_actions,
|
||||
failed_actions=failed_actions,
|
||||
average_response_time=avg_response_time,
|
||||
success_rate=success_rate,
|
||||
efficiency_score=avg_efficiency,
|
||||
resource_usage=resource_usage,
|
||||
market_impact_score=avg_market_impact,
|
||||
last_action_at=last_action_at
|
||||
)
|
||||
|
||||
self.agent_snapshots[agent_id] = snapshot
|
||||
|
||||
logger.info(f"Updated performance snapshot for agent {agent_id}: success_rate={success_rate:.2f}, efficiency={avg_efficiency:.2f}")
|
||||
return snapshot
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating performance snapshot for agent {agent_id}: {e}")
|
||||
# Return a default snapshot
|
||||
return AgentPerformanceSnapshot(
|
||||
agent_id=agent_id,
|
||||
user_id=self.user_id,
|
||||
timestamp=datetime.utcnow().isoformat(),
|
||||
status=AgentStatus.ERROR,
|
||||
total_actions=0,
|
||||
successful_actions=0,
|
||||
failed_actions=0,
|
||||
average_response_time=0.0,
|
||||
success_rate=0.0,
|
||||
efficiency_score=0.0,
|
||||
resource_usage={},
|
||||
market_impact_score=0.0,
|
||||
last_action_at=datetime.utcnow().isoformat()
|
||||
)
|
||||
|
||||
def _calculate_resource_usage(self, agent_id: str, recent_data: List[PerformanceDataPoint]) -> Dict[str, float]:
|
||||
"""Calculate resource usage metrics"""
|
||||
resource_usage = {
|
||||
"cpu_usage": 0.0,
|
||||
"memory_usage": 0.0,
|
||||
"api_calls": 0,
|
||||
"processing_time": 0.0
|
||||
}
|
||||
|
||||
try:
|
||||
# Extract resource usage from context
|
||||
for dp in recent_data:
|
||||
if dp.metric_type == PerformanceMetric.RESOURCE_USAGE and dp.context:
|
||||
resource_usage["cpu_usage"] = max(resource_usage["cpu_usage"], dp.context.get("cpu_usage", 0.0))
|
||||
resource_usage["memory_usage"] = max(resource_usage["memory_usage"], dp.context.get("memory_usage", 0.0))
|
||||
resource_usage["api_calls"] += dp.context.get("api_calls", 0)
|
||||
resource_usage["processing_time"] += dp.context.get("processing_time", 0.0)
|
||||
|
||||
# Calculate averages if multiple data points
|
||||
if len(recent_data) > 0:
|
||||
resource_usage["processing_time"] = resource_usage["processing_time"] / len(recent_data)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calculating resource usage for agent {agent_id}: {e}")
|
||||
|
||||
return resource_usage
|
||||
|
||||
async def analyze_performance_trends(self, agent_id: str, period_hours: int = 24) -> List[PerformanceTrend]:
|
||||
"""Analyze performance trends for an agent"""
|
||||
try:
|
||||
cutoff_time = datetime.utcnow().timestamp() - (period_hours * 60 * 60)
|
||||
agent_data = [
|
||||
dp for dp in self.performance_data[agent_id]
|
||||
if datetime.fromisoformat(dp.timestamp).timestamp() > cutoff_time
|
||||
]
|
||||
|
||||
if len(agent_data) < 5: # Need at least 5 data points for trend analysis
|
||||
return []
|
||||
|
||||
trends = []
|
||||
|
||||
# Analyze trends for each metric type
|
||||
for metric_type in PerformanceMetric:
|
||||
metric_data = [dp for dp in agent_data if dp.metric_type == metric_type]
|
||||
|
||||
if len(metric_data) < 3: # Need at least 3 points for trend
|
||||
continue
|
||||
|
||||
# Sort by timestamp
|
||||
metric_data.sort(key=lambda x: x.timestamp)
|
||||
|
||||
# Calculate trend
|
||||
trend_result = self._calculate_trend(metric_data)
|
||||
|
||||
if trend_result:
|
||||
trend = PerformanceTrend(
|
||||
metric_type=metric_type,
|
||||
trend_direction=trend_result["direction"],
|
||||
trend_strength=trend_result["strength"],
|
||||
change_rate=trend_result["change_rate"],
|
||||
confidence=trend_result["confidence"],
|
||||
period_start=metric_data[0].timestamp,
|
||||
period_end=metric_data[-1].timestamp
|
||||
)
|
||||
trends.append(trend)
|
||||
|
||||
logger.info(f"Analyzed performance trends for agent {agent_id}: found {len(trends)} trends")
|
||||
return trends
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error analyzing performance trends for agent {agent_id}: {e}")
|
||||
return []
|
||||
|
||||
def _calculate_trend(self, data_points: List[PerformanceDataPoint]) -> Optional[Dict[str, Any]]:
|
||||
"""Calculate trend from performance data points"""
|
||||
try:
|
||||
if len(data_points) < 3:
|
||||
return None
|
||||
|
||||
# Extract values and timestamps
|
||||
values = [dp.value for dp in data_points]
|
||||
timestamps = [datetime.fromisoformat(dp.timestamp).timestamp() for dp in data_points]
|
||||
|
||||
# Simple linear trend calculation
|
||||
n = len(values)
|
||||
sum_x = sum(timestamps)
|
||||
sum_y = sum(values)
|
||||
sum_xy = sum(x * y for x, y in zip(timestamps, values))
|
||||
sum_x2 = sum(x * x for x in timestamps)
|
||||
|
||||
# Calculate slope and intercept
|
||||
slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x2 - sum_x * sum_x)
|
||||
intercept = (sum_y - slope * sum_x) / n
|
||||
|
||||
# Calculate correlation coefficient (confidence)
|
||||
mean_y = sum_y / n
|
||||
ss_tot = sum((y - mean_y) ** 2 for y in values)
|
||||
ss_res = sum((y - (slope * x + intercept)) ** 2 for x, y in zip(timestamps, values))
|
||||
r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0
|
||||
|
||||
# Determine trend direction
|
||||
if abs(slope) < 0.001: # Nearly flat
|
||||
direction = "stable"
|
||||
strength = 0.0
|
||||
elif slope > 0:
|
||||
direction = "improving"
|
||||
strength = min(1.0, abs(slope) * 100) # Scale slope to 0-1
|
||||
else:
|
||||
direction = "declining"
|
||||
strength = min(1.0, abs(slope) * 100)
|
||||
|
||||
# Calculate change rate (percentage change per hour)
|
||||
time_span = timestamps[-1] - timestamps[0]
|
||||
if time_span > 0:
|
||||
change_rate = (slope * 3600) / (values[0] if values[0] != 0 else 1) * 100 # Per hour
|
||||
else:
|
||||
change_rate = 0.0
|
||||
|
||||
return {
|
||||
"direction": direction,
|
||||
"strength": strength,
|
||||
"change_rate": change_rate,
|
||||
"confidence": r_squared
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calculating trend: {e}")
|
||||
return None
|
||||
|
||||
async def generate_optimization_recommendations(self, agent_id: str) -> List[OptimizationRecommendation]:
|
||||
"""Generate optimization recommendations for an agent"""
|
||||
try:
|
||||
recommendations = []
|
||||
|
||||
# Get current snapshot
|
||||
snapshot = self.agent_snapshots.get(agent_id)
|
||||
if not snapshot:
|
||||
return []
|
||||
|
||||
# Get performance trends
|
||||
trends = await self.analyze_performance_trends(agent_id)
|
||||
|
||||
# Generate recommendations based on performance analysis
|
||||
|
||||
# 1. Success rate recommendations
|
||||
if snapshot.success_rate < self.performance_targets["success_rate"]:
|
||||
recommendation = OptimizationRecommendation(
|
||||
recommendation_id=f"success_rate_{agent_id}_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}",
|
||||
agent_id=agent_id,
|
||||
user_id=self.user_id,
|
||||
recommendation_type="success_rate_improvement",
|
||||
priority="high" if snapshot.success_rate < self.alert_thresholds["success_rate"] else "medium",
|
||||
description=f"Agent success rate is {snapshot.success_rate:.1%}, target is {self.performance_targets['success_rate']:.1%}",
|
||||
expected_impact=self.performance_targets["success_rate"] - snapshot.success_rate,
|
||||
implementation_steps=[
|
||||
"Analyze recent failed actions to identify patterns",
|
||||
"Review error logs for common failure causes",
|
||||
"Update agent parameters or logic to address identified issues",
|
||||
"Test improvements with small batch of actions",
|
||||
"Monitor success rate improvement over time"
|
||||
],
|
||||
estimated_effort="medium"
|
||||
)
|
||||
recommendations.append(recommendation)
|
||||
|
||||
# 2. Response time recommendations
|
||||
if snapshot.average_response_time > self.performance_targets["response_time"]:
|
||||
recommendation = OptimizationRecommendation(
|
||||
recommendation_id=f"response_time_{agent_id}_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}",
|
||||
agent_id=agent_id,
|
||||
user_id=self.user_id,
|
||||
recommendation_type="response_time_optimization",
|
||||
priority="high" if snapshot.average_response_time > self.alert_thresholds["response_time"] else "medium",
|
||||
description=f"Agent average response time is {snapshot.average_response_time:.1f}s, target is {self.performance_targets['response_time']:.1f}s",
|
||||
expected_impact=(self.performance_targets["response_time"] - snapshot.average_response_time) / snapshot.average_response_time,
|
||||
implementation_steps=[
|
||||
"Profile agent execution to identify bottlenecks",
|
||||
"Optimize API calls and external service interactions",
|
||||
"Implement caching for frequently accessed data",
|
||||
"Review and optimize agent logic and decision-making",
|
||||
"Monitor response time improvement"
|
||||
],
|
||||
estimated_effort="high"
|
||||
)
|
||||
recommendations.append(recommendation)
|
||||
|
||||
# 3. Efficiency score recommendations
|
||||
if snapshot.efficiency_score < self.performance_targets["efficiency_score"]:
|
||||
recommendation = OptimizationRecommendation(
|
||||
recommendation_id=f"efficiency_{agent_id}_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}",
|
||||
agent_id=agent_id,
|
||||
user_id=self.user_id,
|
||||
recommendation_type="efficiency_improvement",
|
||||
priority="high" if snapshot.efficiency_score < self.alert_thresholds["efficiency_score"] else "medium",
|
||||
description=f"Agent efficiency score is {snapshot.efficiency_score:.2f}, target is {self.performance_targets['efficiency_score']:.2f}",
|
||||
expected_impact=self.performance_targets["efficiency_score"] - snapshot.efficiency_score,
|
||||
implementation_steps=[
|
||||
"Analyze agent decision-making patterns",
|
||||
"Identify redundant or unnecessary operations",
|
||||
"Optimize agent parameters and thresholds",
|
||||
"Implement better error handling and recovery",
|
||||
"Monitor efficiency score improvement"
|
||||
],
|
||||
estimated_effort="medium"
|
||||
)
|
||||
recommendations.append(recommendation)
|
||||
|
||||
# 4. Market impact recommendations
|
||||
if snapshot.market_impact_score < self.performance_targets["market_impact"]:
|
||||
recommendation = OptimizationRecommendation(
|
||||
recommendation_id=f"market_impact_{agent_id}_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}",
|
||||
agent_id=agent_id,
|
||||
user_id=self.user_id,
|
||||
recommendation_type="market_impact_enhancement",
|
||||
priority="medium",
|
||||
description=f"Agent market impact score is {snapshot.market_impact_score:.2f}, target is {self.performance_targets['market_impact']:.2f}",
|
||||
expected_impact=self.performance_targets["market_impact"] - snapshot.market_impact_score,
|
||||
implementation_steps=[
|
||||
"Analyze market signal detection accuracy",
|
||||
"Improve market trend analysis and prediction",
|
||||
"Enhance competitive intelligence gathering",
|
||||
"Optimize timing and execution of market actions",
|
||||
"Monitor market impact score improvement"
|
||||
],
|
||||
estimated_effort="high"
|
||||
)
|
||||
recommendations.append(recommendation)
|
||||
|
||||
# 5. Trend-based recommendations
|
||||
for trend in trends:
|
||||
if trend.trend_strength > 0.7 and trend.confidence > 0.8: # Strong trend with high confidence
|
||||
if trend.trend_direction == "declining":
|
||||
recommendation = OptimizationRecommendation(
|
||||
recommendation_id=f"trend_{trend.metric_type.value}_{agent_id}_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}",
|
||||
agent_id=agent_id,
|
||||
user_id=self.user_id,
|
||||
recommendation_type="trend_reversal",
|
||||
priority="high" if trend.trend_strength > 0.8 else "medium",
|
||||
description=f"Strong declining trend detected in {trend.metric_type.value}: {trend.change_rate:.1f}% change per hour",
|
||||
expected_impact=0.3, # Estimate 30% improvement potential
|
||||
implementation_steps=[
|
||||
f"Investigate causes of declining {trend.metric_type.value}",
|
||||
"Identify specific factors contributing to the trend",
|
||||
"Implement corrective measures based on findings",
|
||||
"Monitor trend reversal over time",
|
||||
"Adjust approach if trend continues"
|
||||
],
|
||||
estimated_effort="medium"
|
||||
)
|
||||
recommendations.append(recommendation)
|
||||
|
||||
# Sort by priority and expected impact
|
||||
recommendations.sort(key=lambda x: (self._priority_weight(x.priority), x.expected_impact), reverse=True)
|
||||
|
||||
# Keep only top 10 recommendations
|
||||
recommendations = recommendations[:10]
|
||||
|
||||
# Store recommendations
|
||||
self.recommendations.extend(recommendations)
|
||||
|
||||
# Keep only recent recommendations (last 50)
|
||||
if len(self.recommendations) > 50:
|
||||
self.recommendations = self.recommendations[-50:]
|
||||
|
||||
logger.info(f"Generated {len(recommendations)} optimization recommendations for agent {agent_id}")
|
||||
return recommendations
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating optimization recommendations for agent {agent_id}: {e}")
|
||||
return []
|
||||
|
||||
def _priority_weight(self, priority: str) -> int:
|
||||
"""Convert priority to numeric weight for sorting"""
|
||||
priority_weights = {
|
||||
"high": 3,
|
||||
"medium": 2,
|
||||
"low": 1
|
||||
}
|
||||
return priority_weights.get(priority, 0)
|
||||
|
||||
async def get_performance_alerts(self, agent_id: str) -> List[Dict[str, Any]]:
|
||||
"""Get performance alerts for an agent"""
|
||||
alerts = []
|
||||
|
||||
try:
|
||||
snapshot = self.agent_snapshots.get(agent_id)
|
||||
if not snapshot:
|
||||
return []
|
||||
|
||||
# Check success rate alert
|
||||
if snapshot.success_rate < self.alert_thresholds["success_rate"]:
|
||||
alerts.append({
|
||||
"type": "performance_alert",
|
||||
"metric": "success_rate",
|
||||
"current_value": snapshot.success_rate,
|
||||
"threshold": self.alert_thresholds["success_rate"],
|
||||
"target": self.performance_targets["success_rate"],
|
||||
"severity": "high" if snapshot.success_rate < 0.5 else "medium",
|
||||
"message": f"Agent success rate ({snapshot.success_rate:.1%}) is below alert threshold ({self.alert_thresholds['success_rate']:.1%})",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
# Check response time alert
|
||||
if snapshot.average_response_time > self.alert_thresholds["response_time"]:
|
||||
alerts.append({
|
||||
"type": "performance_alert",
|
||||
"metric": "response_time",
|
||||
"current_value": snapshot.average_response_time,
|
||||
"threshold": self.alert_thresholds["response_time"],
|
||||
"target": self.performance_targets["response_time"],
|
||||
"severity": "high" if snapshot.average_response_time > 120 else "medium",
|
||||
"message": f"Agent response time ({snapshot.average_response_time:.1f}s) exceeds alert threshold ({self.alert_thresholds['response_time']:.1f}s)",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
# Check efficiency score alert
|
||||
if snapshot.efficiency_score < self.alert_thresholds["efficiency_score"]:
|
||||
alerts.append({
|
||||
"type": "performance_alert",
|
||||
"metric": "efficiency_score",
|
||||
"current_value": snapshot.efficiency_score,
|
||||
"threshold": self.alert_thresholds["efficiency_score"],
|
||||
"target": self.performance_targets["efficiency_score"],
|
||||
"severity": "high" if snapshot.efficiency_score < 0.3 else "medium",
|
||||
"message": f"Agent efficiency score ({snapshot.efficiency_score:.2f}) is below alert threshold ({self.alert_thresholds['efficiency_score']:.2f})",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
# Check market impact alert
|
||||
if snapshot.market_impact_score < self.alert_thresholds["market_impact"]:
|
||||
alerts.append({
|
||||
"type": "performance_alert",
|
||||
"metric": "market_impact",
|
||||
"current_value": snapshot.market_impact_score,
|
||||
"threshold": self.alert_thresholds["market_impact"],
|
||||
"target": self.performance_targets["market_impact"],
|
||||
"severity": "medium",
|
||||
"message": f"Agent market impact score ({snapshot.market_impact_score:.2f}) is below alert threshold ({self.alert_thresholds['market_impact']:.2f})",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
return alerts
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting performance alerts for agent {agent_id}: {e}")
|
||||
return []
|
||||
|
||||
async def get_performance_summary(self, agent_id: str) -> Dict[str, Any]:
|
||||
"""Get comprehensive performance summary for an agent"""
|
||||
try:
|
||||
snapshot = self.agent_snapshots.get(agent_id)
|
||||
if not snapshot:
|
||||
return {}
|
||||
|
||||
# Get trends
|
||||
trends = await self.analyze_performance_trends(agent_id)
|
||||
|
||||
# Get recommendations
|
||||
recommendations = await self.generate_optimization_recommendations(agent_id)
|
||||
|
||||
# Get alerts
|
||||
alerts = await self.get_performance_alerts(agent_id)
|
||||
|
||||
# Calculate overall health score
|
||||
health_score = self._calculate_health_score(snapshot)
|
||||
|
||||
return {
|
||||
"agent_id": agent_id,
|
||||
"user_id": self.user_id,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"overall_health": health_score,
|
||||
"current_performance": asdict(snapshot),
|
||||
"performance_trends": [asdict(trend) for trend in trends],
|
||||
"optimization_recommendations": [asdict(rec) for rec in recommendations],
|
||||
"performance_alerts": alerts,
|
||||
"performance_targets": self.performance_targets,
|
||||
"alert_thresholds": self.alert_thresholds
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting performance summary for agent {agent_id}: {e}")
|
||||
return {}
|
||||
|
||||
def _calculate_health_score(self, snapshot: AgentPerformanceSnapshot) -> float:
|
||||
"""Calculate overall health score based on key metrics"""
|
||||
try:
|
||||
# Weighted scoring based on key metrics
|
||||
weights = {
|
||||
"success_rate": 0.3,
|
||||
"response_time": 0.25,
|
||||
"efficiency_score": 0.25,
|
||||
"market_impact": 0.2
|
||||
}
|
||||
|
||||
scores = {
|
||||
"success_rate": min(1.0, snapshot.success_rate / self.performance_targets["success_rate"]),
|
||||
"response_time": max(0.0, 1.0 - (snapshot.average_response_time / self.performance_targets["response_time"])),
|
||||
"efficiency_score": min(1.0, snapshot.efficiency_score / self.performance_targets["efficiency_score"]),
|
||||
"market_impact": min(1.0, snapshot.market_impact_score / self.performance_targets["market_impact"])
|
||||
}
|
||||
|
||||
# Calculate weighted health score
|
||||
health_score = sum(scores[metric] * weights[metric] for metric in weights.keys())
|
||||
|
||||
return round(health_score, 2)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calculating health score: {e}")
|
||||
return 0.0
|
||||
|
||||
def get_all_agents_performance(self) -> List[Dict[str, Any]]:
|
||||
"""Get performance summary for all agents"""
|
||||
all_performance = []
|
||||
|
||||
for agent_id, snapshot in self.agent_snapshots.items():
|
||||
performance_summary = {
|
||||
"agent_id": agent_id,
|
||||
"user_id": self.user_id,
|
||||
"status": snapshot.status.value,
|
||||
"success_rate": snapshot.success_rate,
|
||||
"efficiency_score": snapshot.efficiency_score,
|
||||
"response_time": snapshot.average_response_time,
|
||||
"market_impact": snapshot.market_impact_score,
|
||||
"total_actions": snapshot.total_actions,
|
||||
"last_action": snapshot.last_action_at,
|
||||
"health_score": self._calculate_health_score(snapshot)
|
||||
}
|
||||
all_performance.append(performance_summary)
|
||||
|
||||
return all_performance
|
||||
|
||||
# Service class for performance monitoring
|
||||
class AgentPerformanceService:
|
||||
"""Service class for agent performance monitoring operations"""
|
||||
|
||||
def __init__(self):
|
||||
self.metrics_buffer = deque(maxlen=1000)
|
||||
self.performance_history = defaultdict(list)
|
||||
self.alert_thresholds = {
|
||||
PerformanceMetric.SUCCESS_RATE: 0.8, # Alert if success rate < 80%
|
||||
PerformanceMetric.RESPONSE_TIME: 30.0, # Alert if response time > 30s
|
||||
PerformanceMetric.GOAL_COMPLETION_RATE: 0.7 # Alert if completion < 70%
|
||||
}
|
||||
self.monitors: Dict[str, AgentPerformanceMonitor] = {}
|
||||
self.global_performance_history: deque = deque(maxlen=5000) # Global history
|
||||
|
||||
async def record_metric(self,
|
||||
agent_id: str,
|
||||
metric_type: PerformanceMetric,
|
||||
value: float,
|
||||
context: Optional[Dict[str, Any]] = None):
|
||||
"""Record a performance metric for an agent"""
|
||||
metric_entry = AgentPerformanceMetrics(
|
||||
agent_id=agent_id,
|
||||
timestamp=datetime.utcnow(),
|
||||
metrics={metric_type.value: value},
|
||||
context=context or {}
|
||||
)
|
||||
async def get_monitor(self, user_id: str) -> AgentPerformanceMonitor:
|
||||
"""Get or create a performance monitor for a user"""
|
||||
if user_id not in self.monitors:
|
||||
self.monitors[user_id] = AgentPerformanceMonitor(user_id)
|
||||
return self.monitors[user_id]
|
||||
|
||||
async def record_agent_performance(self, user_id: str, agent_id: str, metric_type: PerformanceMetric, value: float, context: Dict[str, Any] = None) -> bool:
|
||||
"""Record performance data for an agent"""
|
||||
monitor = await self.get_monitor(user_id)
|
||||
success = await monitor.record_performance_data(agent_id, metric_type, value, context)
|
||||
|
||||
self.metrics_buffer.append(metric_entry)
|
||||
self.performance_history[agent_id].append(metric_entry)
|
||||
|
||||
# Check thresholds
|
||||
await self._check_thresholds(agent_id, metric_type, value)
|
||||
|
||||
# Persist if needed (batching implemented in production)
|
||||
# await self._persist_metric(metric_entry)
|
||||
|
||||
async def get_agent_performance(self, agent_id: str, time_window_minutes: int = 60) -> Dict[str, Any]:
|
||||
"""Get aggregated performance metrics for an agent"""
|
||||
cutoff_time = datetime.utcnow() - timedelta(minutes=time_window_minutes)
|
||||
relevant_metrics = [
|
||||
m for m in self.performance_history[agent_id]
|
||||
if m.timestamp > cutoff_time
|
||||
]
|
||||
|
||||
if not relevant_metrics:
|
||||
return {}
|
||||
|
||||
aggregated = defaultdict(list)
|
||||
for m in relevant_metrics:
|
||||
for k, v in m.metrics.items():
|
||||
aggregated[k].append(v)
|
||||
|
||||
result = {
|
||||
"agent_id": agent_id,
|
||||
"period_minutes": time_window_minutes,
|
||||
"sample_size": len(relevant_metrics),
|
||||
"metrics": {
|
||||
k: sum(v) / len(v) for k, v in aggregated.items()
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
async def _check_thresholds(self, agent_id: str, metric_type: PerformanceMetric, value: float):
|
||||
"""Check if metric violates thresholds"""
|
||||
threshold = self.alert_thresholds.get(metric_type)
|
||||
if not threshold:
|
||||
return
|
||||
|
||||
is_violation = False
|
||||
if metric_type in [PerformanceMetric.SUCCESS_RATE, PerformanceMetric.GOAL_COMPLETION_RATE]:
|
||||
if value < threshold:
|
||||
is_violation = True
|
||||
elif value > threshold:
|
||||
is_violation = True
|
||||
|
||||
if is_violation:
|
||||
logger.warning(
|
||||
f"Performance alert for agent {agent_id}: "
|
||||
f"{metric_type.value} = {value} (Threshold: {threshold})"
|
||||
if success:
|
||||
# Also record in global history
|
||||
data_point = PerformanceDataPoint(
|
||||
timestamp=datetime.utcnow().isoformat(),
|
||||
metric_type=metric_type,
|
||||
value=value,
|
||||
context=context or {},
|
||||
agent_id=agent_id,
|
||||
user_id=user_id
|
||||
)
|
||||
# Trigger alert notification (impl via notification service)
|
||||
self.global_performance_history.append(data_point)
|
||||
|
||||
return success
|
||||
|
||||
async def update_agent_performance_snapshot(self, user_id: str, agent_id: str, status: AgentStatus, action_result: Dict[str, Any] = None) -> AgentPerformanceSnapshot:
|
||||
"""Update performance snapshot for an agent"""
|
||||
monitor = await self.get_monitor(user_id)
|
||||
return await monitor.update_agent_snapshot(agent_id, status, action_result)
|
||||
|
||||
async def get_agent_performance_summary(self, user_id: str, agent_id: str) -> Dict[str, Any]:
|
||||
"""Get comprehensive performance summary for an agent"""
|
||||
monitor = await self.get_monitor(user_id)
|
||||
return await monitor.get_performance_summary(agent_id)
|
||||
|
||||
async def get_all_agents_performance_summary(self, user_id: str) -> List[Dict[str, Any]]:
|
||||
"""Get performance summary for all agents for a user"""
|
||||
monitor = await self.get_monitor(user_id)
|
||||
return monitor.get_all_agents_performance()
|
||||
|
||||
async def get_global_performance_stats(self) -> Dict[str, Any]:
|
||||
"""Get global performance statistics across all users and agents"""
|
||||
if not self.global_performance_history:
|
||||
return {}
|
||||
|
||||
# Calculate global statistics
|
||||
total_actions = len([dp for dp in self.global_performance_history if dp.metric_type == PerformanceMetric.SUCCESS_RATE])
|
||||
successful_actions = len([dp for dp in self.global_performance_history if dp.metric_type == PerformanceMetric.SUCCESS_RATE and dp.value > 0.5])
|
||||
|
||||
response_times = [dp.value for dp in self.global_performance_history if dp.metric_type == PerformanceMetric.RESPONSE_TIME]
|
||||
avg_response_time = sum(response_times) / len(response_times) if response_times else 0.0
|
||||
|
||||
efficiency_scores = [dp.value for dp in self.global_performance_history if dp.metric_type == PerformanceMetric.EFFICIENCY_SCORE]
|
||||
avg_efficiency = sum(efficiency_scores) / len(efficiency_scores) if efficiency_scores else 0.0
|
||||
|
||||
unique_users = len(set(dp.user_id for dp in self.global_performance_history))
|
||||
unique_agents = len(set(dp.agent_id for dp in self.global_performance_history))
|
||||
|
||||
return {
|
||||
"total_actions": total_actions,
|
||||
"successful_actions": successful_actions,
|
||||
"overall_success_rate": successful_actions / total_actions if total_actions > 0 else 0.0,
|
||||
"average_response_time": avg_response_time,
|
||||
"average_efficiency_score": avg_efficiency,
|
||||
"unique_users": unique_users,
|
||||
"unique_agents": unique_agents,
|
||||
"total_data_points": len(self.global_performance_history),
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
# Singleton instance
|
||||
performance_monitor = PerformanceMonitor()
|
||||
AgentPerformanceMonitor = PerformanceMonitor
|
||||
performance_service = performance_monitor
|
||||
# Global service instance
|
||||
performance_service = AgentPerformanceService()
|
||||
|
||||
# Convenience functions for external use
|
||||
async def record_agent_performance(user_id: str, agent_id: str, metric_type: PerformanceMetric, value: float, context: Dict[str, Any] = None) -> bool:
|
||||
"""Record performance data for an agent"""
|
||||
return await performance_service.record_agent_performance(user_id, agent_id, metric_type, value, context)
|
||||
|
||||
async def update_agent_performance_snapshot(user_id: str, agent_id: str, status: AgentStatus, action_result: Dict[str, Any] = None) -> AgentPerformanceSnapshot:
|
||||
"""Update performance snapshot for an agent"""
|
||||
return await performance_service.update_agent_performance_snapshot(user_id, agent_id, status, action_result)
|
||||
|
||||
async def get_agent_performance_summary(user_id: str, agent_id: str) -> Dict[str, Any]:
|
||||
"""Get comprehensive performance summary for an agent"""
|
||||
return await performance_service.get_agent_performance_summary(user_id, agent_id)
|
||||
|
||||
async def get_all_agents_performance_summary(user_id: str) -> List[Dict[str, Any]]:
|
||||
"""Get performance summary for all agents for a user"""
|
||||
return await performance_service.get_all_agents_performance_summary(user_id)
|
||||
Reference in New Issue
Block a user