#!/usr/bin/env python3
"""
Agent Evaluator - Multi-Agent System Performance Analysis

Takes agent execution logs (task, actions taken, results, time, tokens used)
and evaluates performance: task success rate, average cost per task, latency
distribution, error patterns, tool usage efficiency, identifies bottlenecks
and improvement opportunities.

Input: execution logs JSON
Output: performance report + bottleneck analysis + optimization recommendations
"""

|
import json
import argparse
import sys
import statistics
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import re


|
@dataclass
class ExecutionLog:
    """Single execution log entry: one task run by one agent.

    Built from raw log JSON by AgentEvaluator.parse_execution_logs, which
    supplies neutral defaults for missing fields.
    """

    task_id: str
    agent_id: str
    task_type: str
    task_description: str
    start_time: str  # timestamp string; parsed with datetime.fromisoformat downstream (may be empty)
    end_time: str    # timestamp string; parsed with datetime.fromisoformat downstream (may be empty)
    duration_ms: int
    status: str  # success, failure, partial, timeout
    actions: List[Dict[str, Any]]
    results: Dict[str, Any]
    tokens_used: Dict[str, int]  # input_tokens, output_tokens, total_tokens
    cost_usd: float
    error_details: Optional[Dict[str, Any]]  # None when the task produced no error
    tools_used: List[str]
    retry_count: int
    metadata: Dict[str, Any]

|
@dataclass
class PerformanceMetrics:
    """Aggregated performance metrics for a set of execution logs.

    Produced by AgentEvaluator.calculate_performance_metrics for the whole
    system, for a single agent, or for a single task type.
    """

    total_tasks: int
    successful_tasks: int   # logs with status == "success"
    failed_tasks: int       # logs with status == "failure"
    partial_tasks: int      # logs with status == "partial"
    timeout_tasks: int      # logs with status == "timeout"
    success_rate: float     # successful_tasks / total_tasks
    failure_rate: float     # (failed_tasks + timeout_tasks) / total_tasks
    average_duration_ms: float  # duration stats cover logs with duration_ms > 0 only
    median_duration_ms: float
    percentile_95_duration_ms: float
    min_duration_ms: int
    max_duration_ms: int
    total_tokens_used: int
    average_tokens_per_task: float
    total_cost_usd: float
    average_cost_per_task: float
    cost_per_token: float   # 0.0 when no tokens were recorded
    throughput_tasks_per_hour: float  # tasks over the wall-clock span of the log window
    error_rate: float       # fraction of logs carrying error_details
    retry_rate: float       # fraction of logs with retry_count > 0

|
@dataclass
class ErrorAnalysis:
    """Error pattern analysis for one classified error type.

    Produced by AgentEvaluator.analyze_errors; one instance per matched
    pattern class plus an "unclassified" bucket.
    """

    error_type: str              # key into the evaluator's error_patterns table, or "unclassified"
    count: int
    percentage: float            # share of all collected errors, 0-100
    affected_agents: List[str]
    affected_task_types: List[str]
    common_patterns: List[str]   # frequent words extracted from the raw error messages
    suggested_fixes: List[str]
    impact_level: str  # high, medium, low

|
@dataclass
class BottleneckAnalysis:
    """System bottleneck analysis produced by AgentEvaluator.identify_bottlenecks."""

    bottleneck_type: str  # agent, tool, communication, resource
    location: str         # agent id, tool name, or subsystem label
    severity: str  # critical, high, medium, low
    description: str
    impact_on_performance: Dict[str, float]  # named impact estimates; summed for sorting
    affected_workflows: List[str]            # task types touched by this bottleneck
    optimization_suggestions: List[str]
    estimated_improvement: Dict[str, float]  # named projected gains if addressed

|
@dataclass
class OptimizationRecommendation:
    """Performance optimization recommendation.

    NOTE: estimated_cost_savings and estimated_performance_gain are Optional
    in type but carry NO default value, so every construction site must pass
    them explicitly (use None when the figure is unknown).
    """

    category: str  # performance, cost, reliability, scalability
    priority: str  # high, medium, low
    title: str
    description: str
    implementation_effort: str  # low, medium, high
    expected_impact: Dict[str, Any]
    estimated_cost_savings: Optional[float]      # USD; None when not quantified
    estimated_performance_gain: Optional[float]  # multiplicative factor; None when not quantified
    implementation_steps: List[str]
    risks: List[str]
    prerequisites: List[str]

|
@dataclass
class EvaluationReport:
    """Complete evaluation report assembled by AgentEvaluator.generate_report."""

    summary: Dict[str, Any]                           # evaluation window + headline health figures
    system_metrics: PerformanceMetrics                # metrics over all logs
    agent_metrics: Dict[str, PerformanceMetrics]      # keyed by agent_id
    task_type_metrics: Dict[str, PerformanceMetrics]  # keyed by task_type
    tool_usage_analysis: Dict[str, Any]               # per-tool stats (usage, error rate, latency)
    error_analysis: List[ErrorAnalysis]
    bottleneck_analysis: List[BottleneckAnalysis]
    optimization_recommendations: List[OptimizationRecommendation]
    trends_analysis: Dict[str, Any]
    cost_breakdown: Dict[str, Any]
    sla_compliance: Dict[str, Any]
    metadata: Dict[str, Any]

|
class AgentEvaluator:
    """Evaluate multi-agent system performance from execution logs"""

def __init__(self):
|
|
self.error_patterns = self._define_error_patterns()
|
|
self.performance_thresholds = self._define_performance_thresholds()
|
|
self.cost_benchmarks = self._define_cost_benchmarks()
|
|
|
|
def _define_error_patterns(self) -> Dict[str, Dict[str, Any]]:
|
|
"""Define common error patterns and their classifications"""
|
|
return {
|
|
"timeout": {
|
|
"patterns": [r"timeout", r"timed out", r"deadline exceeded"],
|
|
"category": "performance",
|
|
"severity": "high",
|
|
"common_fixes": [
|
|
"Increase timeout values",
|
|
"Optimize slow operations",
|
|
"Add retry logic with exponential backoff",
|
|
"Parallelize independent operations"
|
|
]
|
|
},
|
|
"rate_limit": {
|
|
"patterns": [r"rate limit", r"too many requests", r"quota exceeded"],
|
|
"category": "resource",
|
|
"severity": "medium",
|
|
"common_fixes": [
|
|
"Implement request throttling",
|
|
"Add circuit breaker pattern",
|
|
"Use request queuing",
|
|
"Negotiate higher limits"
|
|
]
|
|
},
|
|
"authentication": {
|
|
"patterns": [r"unauthorized", r"authentication failed", r"invalid credentials"],
|
|
"category": "security",
|
|
"severity": "high",
|
|
"common_fixes": [
|
|
"Check credential rotation",
|
|
"Implement token refresh logic",
|
|
"Add authentication retry",
|
|
"Verify permission scopes"
|
|
]
|
|
},
|
|
"network": {
|
|
"patterns": [r"connection refused", r"network error", r"dns resolution"],
|
|
"category": "infrastructure",
|
|
"severity": "high",
|
|
"common_fixes": [
|
|
"Add network retry logic",
|
|
"Implement fallback endpoints",
|
|
"Use connection pooling",
|
|
"Add health checks"
|
|
]
|
|
},
|
|
"validation": {
|
|
"patterns": [r"validation error", r"invalid input", r"schema violation"],
|
|
"category": "data",
|
|
"severity": "medium",
|
|
"common_fixes": [
|
|
"Strengthen input validation",
|
|
"Add data sanitization",
|
|
"Improve error messages",
|
|
"Add input examples"
|
|
]
|
|
},
|
|
"resource": {
|
|
"patterns": [r"out of memory", r"disk full", r"cpu overload"],
|
|
"category": "resource",
|
|
"severity": "critical",
|
|
"common_fixes": [
|
|
"Scale up resources",
|
|
"Optimize memory usage",
|
|
"Add resource monitoring",
|
|
"Implement graceful degradation"
|
|
]
|
|
}
|
|
}
|
|
|
|
def _define_performance_thresholds(self) -> Dict[str, Any]:
|
|
"""Define performance thresholds for different metrics"""
|
|
return {
|
|
"success_rate": {"excellent": 0.98, "good": 0.95, "acceptable": 0.90, "poor": 0.80},
|
|
"average_duration": {"excellent": 1000, "good": 3000, "acceptable": 10000, "poor": 30000},
|
|
"error_rate": {"excellent": 0.01, "good": 0.03, "acceptable": 0.05, "poor": 0.10},
|
|
"retry_rate": {"excellent": 0.05, "good": 0.10, "acceptable": 0.20, "poor": 0.40},
|
|
"cost_per_task": {"excellent": 0.01, "good": 0.05, "acceptable": 0.10, "poor": 0.25},
|
|
"throughput": {"excellent": 100, "good": 50, "acceptable": 20, "poor": 5} # tasks per hour
|
|
}
|
|
|
|
def _define_cost_benchmarks(self) -> Dict[str, Any]:
|
|
"""Define cost benchmarks for different operations"""
|
|
return {
|
|
"token_costs": {
|
|
"gpt-4": {"input": 0.00003, "output": 0.00006},
|
|
"gpt-3.5-turbo": {"input": 0.000002, "output": 0.000002},
|
|
"claude-3": {"input": 0.000015, "output": 0.000075}
|
|
},
|
|
"operation_costs": {
|
|
"simple_task": 0.005,
|
|
"complex_task": 0.050,
|
|
"research_task": 0.020,
|
|
"analysis_task": 0.030,
|
|
"generation_task": 0.015
|
|
}
|
|
}
|
|
|
|
def parse_execution_logs(self, logs_data: List[Dict[str, Any]]) -> List[ExecutionLog]:
|
|
"""Parse raw execution logs into structured format"""
|
|
logs = []
|
|
|
|
for log_entry in logs_data:
|
|
try:
|
|
log = ExecutionLog(
|
|
task_id=log_entry.get("task_id", ""),
|
|
agent_id=log_entry.get("agent_id", ""),
|
|
task_type=log_entry.get("task_type", "unknown"),
|
|
task_description=log_entry.get("task_description", ""),
|
|
start_time=log_entry.get("start_time", ""),
|
|
end_time=log_entry.get("end_time", ""),
|
|
duration_ms=log_entry.get("duration_ms", 0),
|
|
status=log_entry.get("status", "unknown"),
|
|
actions=log_entry.get("actions", []),
|
|
results=log_entry.get("results", {}),
|
|
tokens_used=log_entry.get("tokens_used", {"total_tokens": 0}),
|
|
cost_usd=log_entry.get("cost_usd", 0.0),
|
|
error_details=log_entry.get("error_details"),
|
|
tools_used=log_entry.get("tools_used", []),
|
|
retry_count=log_entry.get("retry_count", 0),
|
|
metadata=log_entry.get("metadata", {})
|
|
)
|
|
logs.append(log)
|
|
except Exception as e:
|
|
print(f"Warning: Failed to parse log entry: {e}", file=sys.stderr)
|
|
continue
|
|
|
|
return logs
|
|
|
|
def calculate_performance_metrics(self, logs: List[ExecutionLog]) -> PerformanceMetrics:
|
|
"""Calculate performance metrics from execution logs"""
|
|
if not logs:
|
|
return PerformanceMetrics(
|
|
total_tasks=0, successful_tasks=0, failed_tasks=0, partial_tasks=0,
|
|
timeout_tasks=0, success_rate=0.0, failure_rate=0.0,
|
|
average_duration_ms=0.0, median_duration_ms=0.0, percentile_95_duration_ms=0.0,
|
|
min_duration_ms=0, max_duration_ms=0, total_tokens_used=0,
|
|
average_tokens_per_task=0.0, total_cost_usd=0.0, average_cost_per_task=0.0,
|
|
cost_per_token=0.0, throughput_tasks_per_hour=0.0, error_rate=0.0, retry_rate=0.0
|
|
)
|
|
|
|
total_tasks = len(logs)
|
|
successful_tasks = sum(1 for log in logs if log.status == "success")
|
|
failed_tasks = sum(1 for log in logs if log.status == "failure")
|
|
partial_tasks = sum(1 for log in logs if log.status == "partial")
|
|
timeout_tasks = sum(1 for log in logs if log.status == "timeout")
|
|
|
|
success_rate = successful_tasks / total_tasks if total_tasks > 0 else 0.0
|
|
failure_rate = (failed_tasks + timeout_tasks) / total_tasks if total_tasks > 0 else 0.0
|
|
|
|
durations = [log.duration_ms for log in logs if log.duration_ms > 0]
|
|
if durations:
|
|
average_duration_ms = statistics.mean(durations)
|
|
median_duration_ms = statistics.median(durations)
|
|
percentile_95_duration_ms = self._percentile(durations, 95)
|
|
min_duration_ms = min(durations)
|
|
max_duration_ms = max(durations)
|
|
else:
|
|
average_duration_ms = median_duration_ms = percentile_95_duration_ms = 0.0
|
|
min_duration_ms = max_duration_ms = 0
|
|
|
|
total_tokens = sum(log.tokens_used.get("total_tokens", 0) for log in logs)
|
|
average_tokens_per_task = total_tokens / total_tasks if total_tasks > 0 else 0.0
|
|
|
|
total_cost = sum(log.cost_usd for log in logs)
|
|
average_cost_per_task = total_cost / total_tasks if total_tasks > 0 else 0.0
|
|
cost_per_token = total_cost / total_tokens if total_tokens > 0 else 0.0
|
|
|
|
# Calculate throughput (tasks per hour)
|
|
if logs and len(logs) > 1:
|
|
start_time = min(log.start_time for log in logs if log.start_time)
|
|
end_time = max(log.end_time for log in logs if log.end_time)
|
|
if start_time and end_time:
|
|
try:
|
|
start_dt = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
|
|
end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00"))
|
|
time_diff_hours = (end_dt - start_dt).total_seconds() / 3600
|
|
throughput_tasks_per_hour = total_tasks / time_diff_hours if time_diff_hours > 0 else 0.0
|
|
except:
|
|
throughput_tasks_per_hour = 0.0
|
|
else:
|
|
throughput_tasks_per_hour = 0.0
|
|
else:
|
|
throughput_tasks_per_hour = 0.0
|
|
|
|
error_rate = sum(1 for log in logs if log.error_details) / total_tasks if total_tasks > 0 else 0.0
|
|
retry_rate = sum(1 for log in logs if log.retry_count > 0) / total_tasks if total_tasks > 0 else 0.0
|
|
|
|
return PerformanceMetrics(
|
|
total_tasks=total_tasks,
|
|
successful_tasks=successful_tasks,
|
|
failed_tasks=failed_tasks,
|
|
partial_tasks=partial_tasks,
|
|
timeout_tasks=timeout_tasks,
|
|
success_rate=success_rate,
|
|
failure_rate=failure_rate,
|
|
average_duration_ms=average_duration_ms,
|
|
median_duration_ms=median_duration_ms,
|
|
percentile_95_duration_ms=percentile_95_duration_ms,
|
|
min_duration_ms=min_duration_ms,
|
|
max_duration_ms=max_duration_ms,
|
|
total_tokens_used=total_tokens,
|
|
average_tokens_per_task=average_tokens_per_task,
|
|
total_cost_usd=total_cost,
|
|
average_cost_per_task=average_cost_per_task,
|
|
cost_per_token=cost_per_token,
|
|
throughput_tasks_per_hour=throughput_tasks_per_hour,
|
|
error_rate=error_rate,
|
|
retry_rate=retry_rate
|
|
)
|
|
|
|
def _percentile(self, data: List[float], percentile: int) -> float:
|
|
"""Calculate percentile value from data"""
|
|
if not data:
|
|
return 0.0
|
|
sorted_data = sorted(data)
|
|
index = (percentile / 100) * (len(sorted_data) - 1)
|
|
if index.is_integer():
|
|
return sorted_data[int(index)]
|
|
else:
|
|
lower_index = int(index)
|
|
upper_index = lower_index + 1
|
|
weight = index - lower_index
|
|
return sorted_data[lower_index] * (1 - weight) + sorted_data[upper_index] * weight
|
|
|
|
    def analyze_errors(self, logs: List[ExecutionLog]) -> List[ErrorAnalysis]:
        """Analyze error patterns in execution logs.

        Classifies every log carrying error_details against the regex table
        in self.error_patterns (first matching class wins), builds one
        ErrorAnalysis per matched class plus an "unclassified" bucket, and
        sorts the result so high-impact, frequent error types come first.
        """
        error_analyses = []

        # Collect all errors, keeping enough context to attribute them later.
        errors = []
        for log in logs:
            if log.error_details:
                errors.append({
                    "error": log.error_details,
                    "agent_id": log.agent_id,
                    "task_type": log.task_type,
                    "task_id": log.task_id
                })

        if not errors:
            return error_analyses

        # Group errors by pattern
        error_groups = defaultdict(list)
        unclassified_errors = []

        for error in errors:
            # str() of the raw details dict is the haystack the regexes scan.
            error_message = str(error.get("error", {})).lower()
            classified = False

            for pattern_name, pattern_info in self.error_patterns.items():
                for pattern in pattern_info["patterns"]:
                    if re.search(pattern, error_message):
                        error_groups[pattern_name].append(error)
                        classified = True
                        break
                if classified:
                    break  # first matching class wins; an error is never double-counted

            if not classified:
                unclassified_errors.append(error)

        # Analyze each error group
        total_errors = len(errors)

        for error_type, error_list in error_groups.items():
            count = len(error_list)
            percentage = (count / total_errors) * 100 if total_errors > 0 else 0.0

            affected_agents = list(set(error["agent_id"] for error in error_list))
            affected_task_types = list(set(error["task_type"] for error in error_list))

            # Extract common patterns from error messages
            common_patterns = self._extract_common_patterns([str(e["error"]) for e in error_list])

            # Get suggested fixes from the static pattern table.
            pattern_info = self.error_patterns.get(error_type, {})
            suggested_fixes = pattern_info.get("common_fixes", [])

            # Determine impact level: either high frequency or a severe
            # configured classification escalates the rating.
            if percentage > 20 or pattern_info.get("severity") == "critical":
                impact_level = "high"
            elif percentage > 10 or pattern_info.get("severity") == "high":
                impact_level = "medium"
            else:
                impact_level = "low"

            error_analysis = ErrorAnalysis(
                error_type=error_type,
                count=count,
                percentage=percentage,
                affected_agents=affected_agents,
                affected_task_types=affected_task_types,
                common_patterns=common_patterns,
                suggested_fixes=suggested_fixes,
                impact_level=impact_level
            )

            error_analyses.append(error_analysis)

        # Handle unclassified errors as their own bucket so they stay visible.
        if unclassified_errors:
            count = len(unclassified_errors)
            percentage = (count / total_errors) * 100

            error_analysis = ErrorAnalysis(
                error_type="unclassified",
                count=count,
                percentage=percentage,
                affected_agents=list(set(error["agent_id"] for error in unclassified_errors)),
                affected_task_types=list(set(error["task_type"] for error in unclassified_errors)),
                common_patterns=self._extract_common_patterns([str(e["error"]) for e in unclassified_errors]),
                suggested_fixes=["Review and classify error patterns", "Add specific error handling"],
                impact_level="medium" if percentage > 10 else "low"
            )

            error_analyses.append(error_analysis)

        # Sort by impact and count: high-impact groups first, then most frequent.
        error_analyses.sort(key=lambda x: (x.impact_level == "high", x.count), reverse=True)

        return error_analyses
|
def _extract_common_patterns(self, error_messages: List[str]) -> List[str]:
|
|
"""Extract common patterns from error messages"""
|
|
if not error_messages:
|
|
return []
|
|
|
|
# Simple pattern extraction - find common phrases
|
|
word_counts = Counter()
|
|
for message in error_messages:
|
|
words = re.findall(r'\w+', message.lower())
|
|
for word in words:
|
|
if len(word) > 3: # Ignore short words
|
|
word_counts[word] += 1
|
|
|
|
# Return most common words/patterns
|
|
common_patterns = [word for word, count in word_counts.most_common(5)
|
|
if count > 1]
|
|
|
|
return common_patterns
|
|
|
|
    def identify_bottlenecks(self, logs: List[ExecutionLog],
                             agent_metrics: Dict[str, PerformanceMetrics]) -> List[BottleneckAnalysis]:
        """Identify system bottlenecks.

        Scans four areas with fixed heuristics:
        - agents with success rate below 0.8 or average latency above 30s,
        - tools whose error rate exceeds 20%,
        - inter-agent communication when more than 5 actions exceeded 5s,
        - token usage when more than 10 tasks used over 2x the mean tokens.

        Results are sorted by severity, then by the summed impact estimate.
        Several of the impact/improvement figures are heuristic constants,
        not measurements.
        """
        bottlenecks = []

        # Agent performance bottlenecks
        for agent_id, metrics in agent_metrics.items():
            # Low success rate: below 80% flags the agent, below 50% is critical.
            if metrics.success_rate < 0.8:
                severity = "critical" if metrics.success_rate < 0.5 else "high"
                bottlenecks.append(BottleneckAnalysis(
                    bottleneck_type="agent",
                    location=agent_id,
                    severity=severity,
                    description=f"Agent {agent_id} has low success rate ({metrics.success_rate:.1%})",
                    impact_on_performance={
                        # Gap to a 95% target, expressed in percentage points.
                        "success_rate_impact": (0.95 - metrics.success_rate) * 100,
                        "cost_impact": metrics.average_cost_per_task * metrics.failed_tasks
                    },
                    affected_workflows=self._get_agent_workflows(agent_id, logs),
                    optimization_suggestions=[
                        "Review and improve agent logic",
                        "Add better error handling",
                        "Optimize tool usage",
                        "Consider agent specialization"
                    ],
                    estimated_improvement={
                        "success_rate_gain": min(0.15, 0.95 - metrics.success_rate),
                        "cost_reduction": metrics.average_cost_per_task * 0.2
                    }
                ))

            # High latency: above 30s flags the agent, above 60s escalates.
            if metrics.average_duration_ms > 30000:  # 30 seconds
                severity = "high" if metrics.average_duration_ms > 60000 else "medium"
                bottlenecks.append(BottleneckAnalysis(
                    bottleneck_type="agent",
                    location=agent_id,
                    severity=severity,
                    description=f"Agent {agent_id} has high latency ({metrics.average_duration_ms/1000:.1f}s avg)",
                    impact_on_performance={
                        # Excess over a 10s baseline, in milliseconds.
                        "latency_impact": metrics.average_duration_ms - 10000,
                        "throughput_impact": max(0, 50 - metrics.total_tasks)
                    },
                    affected_workflows=self._get_agent_workflows(agent_id, logs),
                    optimization_suggestions=[
                        "Profile and optimize slow operations",
                        "Implement caching strategies",
                        "Parallelize independent tasks",
                        "Optimize API calls"
                    ],
                    estimated_improvement={
                        "latency_reduction": min(0.5, (metrics.average_duration_ms - 10000) / metrics.average_duration_ms),
                        "throughput_gain": 1.3
                    }
                ))

        # Tool usage bottlenecks: any tool failing in over 20% of its uses.
        tool_usage = self._analyze_tool_usage(logs)
        for tool, usage_stats in tool_usage.items():
            if usage_stats.get("error_rate", 0) > 0.2:
                bottlenecks.append(BottleneckAnalysis(
                    bottleneck_type="tool",
                    location=tool,
                    severity="high" if usage_stats["error_rate"] > 0.4 else "medium",
                    description=f"Tool {tool} has high error rate ({usage_stats['error_rate']:.1%})",
                    impact_on_performance={
                        "reliability_impact": usage_stats["error_rate"] * usage_stats["usage_count"],
                        "retry_overhead": usage_stats.get("retry_count", 0) * 1000  # ms
                    },
                    affected_workflows=usage_stats.get("affected_workflows", []),
                    optimization_suggestions=[
                        "Review tool implementation",
                        "Add better error handling for tool",
                        "Implement tool fallbacks",
                        "Consider alternative tools"
                    ],
                    estimated_improvement={
                        "error_reduction": usage_stats["error_rate"] * 0.7,
                        "performance_gain": 1.2
                    }
                ))

        # Communication bottlenecks: many slow (>5s) inter-agent actions.
        communication_analysis = self._analyze_communication_patterns(logs)
        if communication_analysis.get("high_latency_communications", 0) > 5:
            bottlenecks.append(BottleneckAnalysis(
                bottleneck_type="communication",
                location="inter_agent_communication",
                severity="medium",
                description="High latency in inter-agent communications detected",
                impact_on_performance={
                    "communication_overhead": communication_analysis.get("avg_communication_latency", 0),
                    "coordination_efficiency": 0.8  # Assumed impact
                },
                affected_workflows=communication_analysis.get("affected_workflows", []),
                optimization_suggestions=[
                    "Optimize message serialization",
                    "Implement message batching",
                    "Add communication caching",
                    "Consider direct communication patterns"
                ],
                estimated_improvement={
                    "communication_latency_reduction": 0.4,
                    "overall_efficiency_gain": 1.15
                }
            ))

        # Resource bottlenecks: many tasks with unusually high token usage.
        resource_analysis = self._analyze_resource_usage(logs)
        if resource_analysis.get("high_token_usage_tasks", 0) > 10:
            bottlenecks.append(BottleneckAnalysis(
                bottleneck_type="resource",
                location="token_usage",
                severity="medium",
                description="High token usage detected in multiple tasks",
                impact_on_performance={
                    "cost_impact": resource_analysis.get("excess_token_cost", 0),
                    "latency_impact": resource_analysis.get("token_processing_overhead", 0)
                },
                affected_workflows=resource_analysis.get("high_usage_workflows", []),
                optimization_suggestions=[
                    "Optimize prompt engineering",
                    "Implement response caching",
                    "Use more efficient models for simple tasks",
                    "Add token usage monitoring"
                ],
                estimated_improvement={
                    "cost_reduction": 0.3,
                    "efficiency_gain": 1.1
                }
            ))

        # Sort bottlenecks by severity, then by total estimated impact (descending).
        severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
        bottlenecks.sort(key=lambda x: (severity_order[x.severity],
                                        -sum(x.impact_on_performance.values())))

        return bottlenecks
|
def _get_agent_workflows(self, agent_id: str, logs: List[ExecutionLog]) -> List[str]:
|
|
"""Get workflows affected by a specific agent"""
|
|
workflows = set()
|
|
for log in logs:
|
|
if log.agent_id == agent_id:
|
|
workflows.add(log.task_type)
|
|
return list(workflows)
|
|
|
|
def _analyze_tool_usage(self, logs: List[ExecutionLog]) -> Dict[str, Dict[str, Any]]:
|
|
"""Analyze tool usage patterns"""
|
|
tool_stats = defaultdict(lambda: {
|
|
"usage_count": 0,
|
|
"error_count": 0,
|
|
"total_duration": 0,
|
|
"affected_workflows": set(),
|
|
"retry_count": 0
|
|
})
|
|
|
|
for log in logs:
|
|
for tool in log.tools_used:
|
|
stats = tool_stats[tool]
|
|
stats["usage_count"] += 1
|
|
stats["total_duration"] += log.duration_ms
|
|
stats["affected_workflows"].add(log.task_type)
|
|
|
|
if log.error_details:
|
|
stats["error_count"] += 1
|
|
if log.retry_count > 0:
|
|
stats["retry_count"] += log.retry_count
|
|
|
|
# Calculate derived metrics
|
|
result = {}
|
|
for tool, stats in tool_stats.items():
|
|
result[tool] = {
|
|
"usage_count": stats["usage_count"],
|
|
"error_rate": stats["error_count"] / stats["usage_count"] if stats["usage_count"] > 0 else 0,
|
|
"avg_duration": stats["total_duration"] / stats["usage_count"] if stats["usage_count"] > 0 else 0,
|
|
"affected_workflows": list(stats["affected_workflows"]),
|
|
"retry_count": stats["retry_count"]
|
|
}
|
|
|
|
return result
|
|
|
|
def _analyze_communication_patterns(self, logs: List[ExecutionLog]) -> Dict[str, Any]:
|
|
"""Analyze communication patterns between agents"""
|
|
# This is a simplified analysis - in a real system, you'd have more detailed communication logs
|
|
communication_actions = []
|
|
for log in logs:
|
|
for action in log.actions:
|
|
if action.get("type") in ["message", "delegate", "coordinate", "respond"]:
|
|
communication_actions.append({
|
|
"duration": action.get("duration_ms", 0),
|
|
"success": action.get("success", True),
|
|
"workflow": log.task_type
|
|
})
|
|
|
|
if not communication_actions:
|
|
return {}
|
|
|
|
avg_latency = sum(action["duration"] for action in communication_actions) / len(communication_actions)
|
|
high_latency_count = sum(1 for action in communication_actions if action["duration"] > 5000)
|
|
|
|
return {
|
|
"total_communications": len(communication_actions),
|
|
"avg_communication_latency": avg_latency,
|
|
"high_latency_communications": high_latency_count,
|
|
"affected_workflows": list(set(action["workflow"] for action in communication_actions))
|
|
}
|
|
|
|
def _analyze_resource_usage(self, logs: List[ExecutionLog]) -> Dict[str, Any]:
|
|
"""Analyze resource usage patterns"""
|
|
token_usage = [log.tokens_used.get("total_tokens", 0) for log in logs]
|
|
|
|
if not token_usage:
|
|
return {}
|
|
|
|
avg_tokens = sum(token_usage) / len(token_usage)
|
|
high_usage_threshold = avg_tokens * 2
|
|
high_usage_tasks = sum(1 for tokens in token_usage if tokens > high_usage_threshold)
|
|
|
|
# Estimate excess cost
|
|
excess_tokens = sum(max(0, tokens - avg_tokens) for tokens in token_usage)
|
|
excess_cost = excess_tokens * 0.00002 # Rough estimate
|
|
|
|
return {
|
|
"avg_token_usage": avg_tokens,
|
|
"high_token_usage_tasks": high_usage_tasks,
|
|
"excess_token_cost": excess_cost,
|
|
"token_processing_overhead": high_usage_tasks * 500, # Estimated overhead in ms
|
|
"high_usage_workflows": [log.task_type for log in logs
|
|
if log.tokens_used.get("total_tokens", 0) > high_usage_threshold]
|
|
}
|
|
|
|
def generate_optimization_recommendations(self,
|
|
system_metrics: PerformanceMetrics,
|
|
error_analyses: List[ErrorAnalysis],
|
|
bottlenecks: List[BottleneckAnalysis]) -> List[OptimizationRecommendation]:
|
|
"""Generate optimization recommendations based on analysis"""
|
|
recommendations = []
|
|
|
|
# Performance optimization recommendations
|
|
if system_metrics.success_rate < 0.9:
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="reliability",
|
|
priority="high",
|
|
title="Improve System Reliability",
|
|
description=f"System success rate is {system_metrics.success_rate:.1%}, below target of 90%",
|
|
implementation_effort="medium",
|
|
expected_impact={
|
|
"success_rate_improvement": min(0.1, 0.95 - system_metrics.success_rate),
|
|
"cost_reduction": system_metrics.average_cost_per_task * 0.15
|
|
},
|
|
estimated_cost_savings=system_metrics.total_cost_usd * 0.1,
|
|
estimated_performance_gain=1.2,
|
|
implementation_steps=[
|
|
"Identify and fix top error patterns",
|
|
"Implement better error handling and retries",
|
|
"Add comprehensive monitoring and alerting",
|
|
"Implement graceful degradation patterns"
|
|
],
|
|
risks=["Temporary increase in complexity", "Potential initial performance overhead"],
|
|
prerequisites=["Error analysis completion", "Monitoring infrastructure"]
|
|
))
|
|
|
|
# Cost optimization recommendations
|
|
if system_metrics.average_cost_per_task > 0.1:
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="cost",
|
|
priority="medium",
|
|
title="Optimize Token Usage and Costs",
|
|
description=f"Average cost per task (${system_metrics.average_cost_per_task:.3f}) is above optimal range",
|
|
implementation_effort="low",
|
|
expected_impact={
|
|
"cost_reduction": system_metrics.average_cost_per_task * 0.3,
|
|
"efficiency_improvement": 1.15
|
|
},
|
|
estimated_cost_savings=system_metrics.total_cost_usd * 0.3,
|
|
estimated_performance_gain=1.05,
|
|
implementation_steps=[
|
|
"Implement prompt optimization",
|
|
"Add response caching for repeated queries",
|
|
"Use smaller models for simple tasks",
|
|
"Implement token usage monitoring and alerts"
|
|
],
|
|
risks=["Potential quality reduction with smaller models"],
|
|
prerequisites=["Token usage analysis", "Caching infrastructure"]
|
|
))
|
|
|
|
# Performance optimization recommendations
|
|
if system_metrics.average_duration_ms > 10000:
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="performance",
|
|
priority="high",
|
|
title="Reduce Task Latency",
|
|
description=f"Average task duration ({system_metrics.average_duration_ms/1000:.1f}s) exceeds target",
|
|
implementation_effort="high",
|
|
expected_impact={
|
|
"latency_reduction": min(0.5, (system_metrics.average_duration_ms - 5000) / system_metrics.average_duration_ms),
|
|
"throughput_improvement": 1.5
|
|
},
|
|
estimated_performance_gain=1.4,
|
|
implementation_steps=[
|
|
"Profile and optimize slow operations",
|
|
"Implement parallel processing where possible",
|
|
"Add caching for expensive operations",
|
|
"Optimize API calls and reduce round trips"
|
|
],
|
|
risks=["Increased system complexity", "Potential resource usage increase"],
|
|
prerequisites=["Performance profiling tools", "Caching infrastructure"]
|
|
))
|
|
|
|
# Error-based recommendations
|
|
high_impact_errors = [ea for ea in error_analyses if ea.impact_level == "high"]
|
|
if high_impact_errors:
|
|
for error_analysis in high_impact_errors[:3]: # Top 3 high impact errors
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="reliability",
|
|
priority="high",
|
|
title=f"Address {error_analysis.error_type.title()} Errors",
|
|
description=f"{error_analysis.error_type.title()} errors occur in {error_analysis.percentage:.1f}% of cases",
|
|
implementation_effort="medium",
|
|
expected_impact={
|
|
"error_reduction": error_analysis.percentage / 100,
|
|
"reliability_improvement": 1.1
|
|
},
|
|
estimated_cost_savings=system_metrics.total_cost_usd * (error_analysis.percentage / 100) * 0.5,
|
|
implementation_steps=error_analysis.suggested_fixes,
|
|
risks=["May require significant code changes"],
|
|
prerequisites=["Root cause analysis", "Testing framework"]
|
|
))
|
|
|
|
# Bottleneck-based recommendations
|
|
critical_bottlenecks = [b for b in bottlenecks if b.severity in ["critical", "high"]]
|
|
for bottleneck in critical_bottlenecks[:2]: # Top 2 critical bottlenecks
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="performance",
|
|
priority="high" if bottleneck.severity == "critical" else "medium",
|
|
title=f"Address {bottleneck.bottleneck_type.title()} Bottleneck",
|
|
description=bottleneck.description,
|
|
implementation_effort="medium",
|
|
expected_impact=bottleneck.estimated_improvement,
|
|
estimated_performance_gain=list(bottleneck.estimated_improvement.values())[0] if bottleneck.estimated_improvement else 1.1,
|
|
implementation_steps=bottleneck.optimization_suggestions,
|
|
risks=["System downtime during implementation", "Potential cascade effects"],
|
|
prerequisites=["Impact assessment", "Rollback plan"]
|
|
))
|
|
|
|
# Scalability recommendations
|
|
if system_metrics.throughput_tasks_per_hour < 20:
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="scalability",
|
|
priority="medium",
|
|
title="Improve System Scalability",
|
|
description="Current throughput indicates potential scalability issues",
|
|
implementation_effort="high",
|
|
expected_impact={
|
|
"throughput_improvement": 2.0,
|
|
"scalability_headroom": 5.0
|
|
},
|
|
estimated_performance_gain=2.0,
|
|
implementation_steps=[
|
|
"Implement horizontal scaling for agents",
|
|
"Add load balancing and resource pooling",
|
|
"Optimize resource allocation algorithms",
|
|
"Implement auto-scaling policies"
|
|
],
|
|
risks=["High implementation complexity", "Increased operational overhead"],
|
|
prerequisites=["Infrastructure scaling capability", "Monitoring and metrics"]
|
|
))
|
|
|
|
# Sort recommendations by priority and impact
|
|
priority_order = {"high": 0, "medium": 1, "low": 2}
|
|
recommendations.sort(key=lambda x: (
|
|
priority_order[x.priority],
|
|
-x.estimated_performance_gain if x.estimated_performance_gain else 0,
|
|
-x.estimated_cost_savings if x.estimated_cost_savings else 0
|
|
))
|
|
|
|
return recommendations
|
|
|
|
def generate_report(self, logs: List[ExecutionLog]) -> EvaluationReport:
|
|
"""Generate complete evaluation report"""
|
|
|
|
# Calculate system metrics
|
|
system_metrics = self.calculate_performance_metrics(logs)
|
|
|
|
# Calculate per-agent metrics
|
|
agents = set(log.agent_id for log in logs)
|
|
agent_metrics = {}
|
|
for agent_id in agents:
|
|
agent_logs = [log for log in logs if log.agent_id == agent_id]
|
|
agent_metrics[agent_id] = self.calculate_performance_metrics(agent_logs)
|
|
|
|
# Calculate per-task-type metrics
|
|
task_types = set(log.task_type for log in logs)
|
|
task_type_metrics = {}
|
|
for task_type in task_types:
|
|
task_logs = [log for log in logs if log.task_type == task_type]
|
|
task_type_metrics[task_type] = self.calculate_performance_metrics(task_logs)
|
|
|
|
# Analyze tool usage
|
|
tool_usage_analysis = self._analyze_tool_usage(logs)
|
|
|
|
# Analyze errors
|
|
error_analysis = self.analyze_errors(logs)
|
|
|
|
# Identify bottlenecks
|
|
bottleneck_analysis = self.identify_bottlenecks(logs, agent_metrics)
|
|
|
|
# Generate optimization recommendations
|
|
optimization_recommendations = self.generate_optimization_recommendations(
|
|
system_metrics, error_analysis, bottleneck_analysis)
|
|
|
|
# Generate trends analysis (simplified)
|
|
trends_analysis = self._generate_trends_analysis(logs)
|
|
|
|
# Generate cost breakdown
|
|
cost_breakdown = self._generate_cost_breakdown(logs, agent_metrics)
|
|
|
|
# Check SLA compliance
|
|
sla_compliance = self._check_sla_compliance(system_metrics)
|
|
|
|
# Create summary
|
|
summary = {
|
|
"evaluation_period": {
|
|
"start_time": min(log.start_time for log in logs if log.start_time) if logs else None,
|
|
"end_time": max(log.end_time for log in logs if log.end_time) if logs else None,
|
|
"total_duration_hours": system_metrics.total_tasks / system_metrics.throughput_tasks_per_hour if system_metrics.throughput_tasks_per_hour > 0 else 0
|
|
},
|
|
"overall_health": self._assess_overall_health(system_metrics),
|
|
"key_findings": self._extract_key_findings(system_metrics, error_analysis, bottleneck_analysis),
|
|
"critical_issues": len([b for b in bottleneck_analysis if b.severity == "critical"]),
|
|
"improvement_opportunities": len(optimization_recommendations)
|
|
}
|
|
|
|
# Create metadata
|
|
metadata = {
|
|
"generated_at": datetime.now().isoformat(),
|
|
"evaluator_version": "1.0",
|
|
"total_logs_processed": len(logs),
|
|
"agents_analyzed": len(agents),
|
|
"task_types_analyzed": len(task_types),
|
|
"analysis_completeness": "full"
|
|
}
|
|
|
|
return EvaluationReport(
|
|
summary=summary,
|
|
system_metrics=system_metrics,
|
|
agent_metrics=agent_metrics,
|
|
task_type_metrics=task_type_metrics,
|
|
tool_usage_analysis=tool_usage_analysis,
|
|
error_analysis=error_analysis,
|
|
bottleneck_analysis=bottleneck_analysis,
|
|
optimization_recommendations=optimization_recommendations,
|
|
trends_analysis=trends_analysis,
|
|
cost_breakdown=cost_breakdown,
|
|
sla_compliance=sla_compliance,
|
|
metadata=metadata
|
|
)
|
|
|
|
def _generate_trends_analysis(self, logs: List[ExecutionLog]) -> Dict[str, Any]:
|
|
"""Generate trends analysis (simplified version)"""
|
|
# Group logs by time periods (daily)
|
|
daily_metrics = defaultdict(list)
|
|
|
|
for log in logs:
|
|
if log.start_time:
|
|
try:
|
|
date = log.start_time.split('T')[0] # Extract date part
|
|
daily_metrics[date].append(log)
|
|
except:
|
|
continue
|
|
|
|
trends = {}
|
|
if len(daily_metrics) > 1:
|
|
daily_success_rates = {}
|
|
daily_avg_durations = {}
|
|
daily_costs = {}
|
|
|
|
for date, date_logs in daily_metrics.items():
|
|
if date_logs:
|
|
metrics = self.calculate_performance_metrics(date_logs)
|
|
daily_success_rates[date] = metrics.success_rate
|
|
daily_avg_durations[date] = metrics.average_duration_ms
|
|
daily_costs[date] = metrics.total_cost_usd
|
|
|
|
trends = {
|
|
"daily_success_rates": daily_success_rates,
|
|
"daily_avg_durations": daily_avg_durations,
|
|
"daily_costs": daily_costs,
|
|
"trend_direction": {
|
|
"success_rate": "stable", # Simplified
|
|
"duration": "stable",
|
|
"cost": "stable"
|
|
}
|
|
}
|
|
|
|
return trends
|
|
|
|
def _generate_cost_breakdown(self, logs: List[ExecutionLog],
|
|
agent_metrics: Dict[str, PerformanceMetrics]) -> Dict[str, Any]:
|
|
"""Generate cost breakdown analysis"""
|
|
total_cost = sum(log.cost_usd for log in logs)
|
|
|
|
# Cost by agent
|
|
agent_costs = {}
|
|
for agent_id, metrics in agent_metrics.items():
|
|
agent_costs[agent_id] = metrics.total_cost_usd
|
|
|
|
# Cost by task type
|
|
task_type_costs = defaultdict(float)
|
|
for log in logs:
|
|
task_type_costs[log.task_type] += log.cost_usd
|
|
|
|
# Token cost breakdown
|
|
total_tokens = sum(log.tokens_used.get("total_tokens", 0) for log in logs)
|
|
|
|
return {
|
|
"total_cost": total_cost,
|
|
"cost_by_agent": dict(agent_costs),
|
|
"cost_by_task_type": dict(task_type_costs),
|
|
"cost_per_token": total_cost / total_tokens if total_tokens > 0 else 0,
|
|
"top_cost_drivers": sorted(task_type_costs.items(), key=lambda x: x[1], reverse=True)[:5]
|
|
}
|
|
|
|
def _check_sla_compliance(self, metrics: PerformanceMetrics) -> Dict[str, Any]:
|
|
"""Check SLA compliance"""
|
|
thresholds = self.performance_thresholds
|
|
|
|
compliance = {
|
|
"success_rate": {
|
|
"target": 0.95,
|
|
"actual": metrics.success_rate,
|
|
"compliant": metrics.success_rate >= 0.95,
|
|
"gap": max(0, 0.95 - metrics.success_rate)
|
|
},
|
|
"average_latency": {
|
|
"target": 10000, # 10 seconds
|
|
"actual": metrics.average_duration_ms,
|
|
"compliant": metrics.average_duration_ms <= 10000,
|
|
"gap": max(0, metrics.average_duration_ms - 10000)
|
|
},
|
|
"error_rate": {
|
|
"target": 0.05, # 5%
|
|
"actual": metrics.error_rate,
|
|
"compliant": metrics.error_rate <= 0.05,
|
|
"gap": max(0, metrics.error_rate - 0.05)
|
|
}
|
|
}
|
|
|
|
overall_compliance = all(sla["compliant"] for sla in compliance.values())
|
|
|
|
return {
|
|
"overall_compliant": overall_compliance,
|
|
"sla_details": compliance,
|
|
"compliance_score": sum(1 for sla in compliance.values() if sla["compliant"]) / len(compliance)
|
|
}
|
|
|
|
def _assess_overall_health(self, metrics: PerformanceMetrics) -> str:
|
|
"""Assess overall system health"""
|
|
health_score = 0
|
|
|
|
# Success rate contribution (40%)
|
|
if metrics.success_rate >= 0.95:
|
|
health_score += 40
|
|
elif metrics.success_rate >= 0.90:
|
|
health_score += 30
|
|
elif metrics.success_rate >= 0.80:
|
|
health_score += 20
|
|
else:
|
|
health_score += 10
|
|
|
|
# Performance contribution (30%)
|
|
if metrics.average_duration_ms <= 5000:
|
|
health_score += 30
|
|
elif metrics.average_duration_ms <= 10000:
|
|
health_score += 20
|
|
elif metrics.average_duration_ms <= 30000:
|
|
health_score += 15
|
|
else:
|
|
health_score += 5
|
|
|
|
# Error rate contribution (20%)
|
|
if metrics.error_rate <= 0.02:
|
|
health_score += 20
|
|
elif metrics.error_rate <= 0.05:
|
|
health_score += 15
|
|
elif metrics.error_rate <= 0.10:
|
|
health_score += 10
|
|
else:
|
|
health_score += 0
|
|
|
|
# Cost efficiency contribution (10%)
|
|
if metrics.cost_per_token <= 0.00005:
|
|
health_score += 10
|
|
elif metrics.cost_per_token <= 0.0001:
|
|
health_score += 7
|
|
else:
|
|
health_score += 3
|
|
|
|
if health_score >= 85:
|
|
return "excellent"
|
|
elif health_score >= 70:
|
|
return "good"
|
|
elif health_score >= 50:
|
|
return "fair"
|
|
else:
|
|
return "poor"
|
|
|
|
def _extract_key_findings(self, metrics: PerformanceMetrics,
|
|
errors: List[ErrorAnalysis],
|
|
bottlenecks: List[BottleneckAnalysis]) -> List[str]:
|
|
"""Extract key findings from analysis"""
|
|
findings = []
|
|
|
|
# Performance findings
|
|
if metrics.success_rate < 0.9:
|
|
findings.append(f"Success rate ({metrics.success_rate:.1%}) below target")
|
|
|
|
if metrics.average_duration_ms > 15000:
|
|
findings.append(f"High average latency ({metrics.average_duration_ms/1000:.1f}s)")
|
|
|
|
# Error findings
|
|
high_impact_errors = [e for e in errors if e.impact_level == "high"]
|
|
if high_impact_errors:
|
|
findings.append(f"{len(high_impact_errors)} high-impact error patterns identified")
|
|
|
|
# Bottleneck findings
|
|
critical_bottlenecks = [b for b in bottlenecks if b.severity == "critical"]
|
|
if critical_bottlenecks:
|
|
findings.append(f"{len(critical_bottlenecks)} critical bottlenecks found")
|
|
|
|
# Cost findings
|
|
if metrics.cost_per_token > 0.0001:
|
|
findings.append("Token usage costs above optimal range")
|
|
|
|
return findings
|
|
|
|
|
|
def main():
    """CLI entry point: load execution logs, evaluate them, and emit reports.

    Reads a JSON file containing an "execution_logs" array, runs the full
    evaluation pipeline, writes one or more JSON report files, and prints an
    executive summary to stdout. Exits with status 1 on any failure.
    """
    parser = argparse.ArgumentParser(description="Multi-Agent System Performance Evaluator")
    parser.add_argument("input_file", help="JSON file with execution logs")
    parser.add_argument("-o", "--output", help="Output file prefix (default: evaluation_report)")
    parser.add_argument("--format", choices=["json", "both"], default="both",
                        help="Output format")
    # NOTE(review): --detailed is accepted but never consumed below; kept
    # for CLI backward compatibility until detailed output is implemented.
    parser.add_argument("--detailed", action="store_true",
                        help="Include detailed analysis in output")

    args = parser.parse_args()

    try:
        # Load execution logs
        with open(args.input_file, 'r') as f:
            logs_data = json.load(f)

        # Parse logs into ExecutionLog records
        evaluator = AgentEvaluator()
        logs = evaluator.parse_execution_logs(logs_data.get("execution_logs", []))

        if not logs:
            print("No valid execution logs found in input file", file=sys.stderr)
            sys.exit(1)

        # Generate evaluation report
        report = evaluator.generate_report(logs)

        # Serialize the dataclass tree for JSON output
        output_data = asdict(report)

        # Output files
        output_prefix = args.output or "evaluation_report"

        if args.format in ["json", "both"]:
            with open(f"{output_prefix}.json", 'w') as f:
                json.dump(output_data, f, indent=2, default=str)
            print(f"JSON report written to {output_prefix}.json")

        if args.format == "both":
            # Generate separate detailed files

            # Performance summary
            summary_data = {
                "summary": report.summary,
                "system_metrics": asdict(report.system_metrics),
                "sla_compliance": report.sla_compliance
            }
            with open(f"{output_prefix}_summary.json", 'w') as f:
                json.dump(summary_data, f, indent=2, default=str)
            print(f"Summary report written to {output_prefix}_summary.json")

            # Recommendations. default=str added for consistency with the
            # main report dump: asdict() payloads may contain non-JSON
            # values, and the other dumps already guard against that.
            recommendations_data = {
                "optimization_recommendations": [asdict(rec) for rec in report.optimization_recommendations],
                "bottleneck_analysis": [asdict(b) for b in report.bottleneck_analysis]
            }
            with open(f"{output_prefix}_recommendations.json", 'w') as f:
                json.dump(recommendations_data, f, indent=2, default=str)
            print(f"Recommendations written to {output_prefix}_recommendations.json")

            # Error analysis — likewise serialized with default=str.
            error_data = {
                "error_analysis": [asdict(e) for e in report.error_analysis],
                "error_summary": {
                    "total_errors": sum(e.count for e in report.error_analysis),
                    "high_impact_errors": len([e for e in report.error_analysis if e.impact_level == "high"])
                }
            }
            with open(f"{output_prefix}_errors.json", 'w') as f:
                json.dump(error_data, f, indent=2, default=str)
            print(f"Error analysis written to {output_prefix}_errors.json")

        # Print executive summary
        print(f"\n{'='*60}")
        print("AGENT SYSTEM EVALUATION REPORT")
        print(f"{'='*60}")
        print(f"Overall Health: {report.summary['overall_health'].upper()}")
        print(f"Total Tasks: {report.system_metrics.total_tasks}")
        print(f"Success Rate: {report.system_metrics.success_rate:.1%}")
        print(f"Average Duration: {report.system_metrics.average_duration_ms/1000:.1f}s")
        print(f"Total Cost: ${report.system_metrics.total_cost_usd:.2f}")
        print(f"Agents Analyzed: {len(report.agent_metrics)}")

        print("\nKey Findings:")
        for finding in report.summary['key_findings']:
            print(f"  • {finding}")

        print("\nTop Recommendations:")
        high_priority_recs = [r for r in report.optimization_recommendations if r.priority == "high"][:3]
        for i, rec in enumerate(high_priority_recs, 1):
            print(f"  {i}. {rec.title}")

        if report.summary['critical_issues'] > 0:
            print(f"\n⚠️ CRITICAL: {report.summary['critical_issues']} critical issues require immediate attention")

        print("\n📊 Detailed reports available in generated files")
        print(f"{'='*60}")

    except Exception as e:
        # Top-level CLI boundary: report and exit non-zero rather than
        # dumping a traceback at the user.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()