#!/usr/bin/env python3
"""
Agent Evaluator - Multi-Agent System Performance Analysis

Takes agent execution logs (task, actions taken, results, time, tokens used)
and evaluates performance: task success rate, average cost per task, latency
distribution, error patterns, tool usage efficiency, identifies bottlenecks
and improvement opportunities.

Input: execution logs JSON
Output: performance report + bottleneck analysis + optimization recommendations
"""

|
import json
import argparse
import sys
import statistics
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import re


|
@dataclass
class ExecutionLog:
    """Single execution log entry: one task run by one agent.

    Built from raw log JSON by AgentEvaluator.parse_execution_logs, which
    supplies neutral defaults for missing fields.
    """

    task_id: str
    agent_id: str
    task_type: str
    task_description: str
    start_time: str  # timestamp string; parsed with datetime.fromisoformat downstream (may be empty)
    end_time: str    # timestamp string; parsed with datetime.fromisoformat downstream (may be empty)
    duration_ms: int
    status: str  # success, failure, partial, timeout
    actions: List[Dict[str, Any]]
    results: Dict[str, Any]
    tokens_used: Dict[str, int]  # input_tokens, output_tokens, total_tokens
    cost_usd: float
    error_details: Optional[Dict[str, Any]]  # None when the task produced no error
    tools_used: List[str]
    retry_count: int
    metadata: Dict[str, Any]

|
@dataclass
class PerformanceMetrics:
    """Aggregated performance metrics for a set of execution logs.

    Produced by AgentEvaluator.calculate_performance_metrics for the whole
    system, for a single agent, or for a single task type.
    """

    total_tasks: int
    successful_tasks: int   # logs with status == "success"
    failed_tasks: int       # logs with status == "failure"
    partial_tasks: int      # logs with status == "partial"
    timeout_tasks: int      # logs with status == "timeout"
    success_rate: float     # successful_tasks / total_tasks
    failure_rate: float     # (failed_tasks + timeout_tasks) / total_tasks
    average_duration_ms: float  # duration stats cover logs with duration_ms > 0 only
    median_duration_ms: float
    percentile_95_duration_ms: float
    min_duration_ms: int
    max_duration_ms: int
    total_tokens_used: int
    average_tokens_per_task: float
    total_cost_usd: float
    average_cost_per_task: float
    cost_per_token: float   # 0.0 when no tokens were recorded
    throughput_tasks_per_hour: float  # tasks over the wall-clock span of the log window
    error_rate: float       # fraction of logs carrying error_details
    retry_rate: float       # fraction of logs with retry_count > 0

|
@dataclass
class ErrorAnalysis:
    """Error pattern analysis for one classified error type.

    Produced by AgentEvaluator.analyze_errors; one instance per matched
    pattern class plus an "unclassified" bucket.
    """

    error_type: str              # key into the evaluator's error_patterns table, or "unclassified"
    count: int
    percentage: float            # share of all collected errors, 0-100
    affected_agents: List[str]
    affected_task_types: List[str]
    common_patterns: List[str]   # frequent words extracted from the raw error messages
    suggested_fixes: List[str]
    impact_level: str  # high, medium, low

|
@dataclass
class BottleneckAnalysis:
    """System bottleneck analysis produced by AgentEvaluator.identify_bottlenecks."""

    bottleneck_type: str  # agent, tool, communication, resource
    location: str         # agent id, tool name, or subsystem label
    severity: str  # critical, high, medium, low
    description: str
    impact_on_performance: Dict[str, float]  # named impact estimates; summed for sorting
    affected_workflows: List[str]            # task types touched by this bottleneck
    optimization_suggestions: List[str]
    estimated_improvement: Dict[str, float]  # named projected gains if addressed

|
@dataclass
class OptimizationRecommendation:
    """Performance optimization recommendation.

    NOTE: estimated_cost_savings and estimated_performance_gain are Optional
    in type but carry NO default value, so every construction site must pass
    them explicitly (use None when the figure is unknown).
    """

    category: str  # performance, cost, reliability, scalability
    priority: str  # high, medium, low
    title: str
    description: str
    implementation_effort: str  # low, medium, high
    expected_impact: Dict[str, Any]
    estimated_cost_savings: Optional[float]      # USD; None when not quantified
    estimated_performance_gain: Optional[float]  # multiplicative factor; None when not quantified
    implementation_steps: List[str]
    risks: List[str]
    prerequisites: List[str]

|
@dataclass
class EvaluationReport:
    """Complete evaluation report assembled by AgentEvaluator.generate_report."""

    summary: Dict[str, Any]                           # evaluation window + headline health figures
    system_metrics: PerformanceMetrics                # metrics over all logs
    agent_metrics: Dict[str, PerformanceMetrics]      # keyed by agent_id
    task_type_metrics: Dict[str, PerformanceMetrics]  # keyed by task_type
    tool_usage_analysis: Dict[str, Any]               # per-tool stats (usage, error rate, latency)
    error_analysis: List[ErrorAnalysis]
    bottleneck_analysis: List[BottleneckAnalysis]
    optimization_recommendations: List[OptimizationRecommendation]
    trends_analysis: Dict[str, Any]
    cost_breakdown: Dict[str, Any]
    sla_compliance: Dict[str, Any]
    metadata: Dict[str, Any]

|
class AgentEvaluator:
    """Evaluate multi-agent system performance from execution logs"""

def __init__(self):
|
|
self.error_patterns = self._define_error_patterns()
|
|
self.performance_thresholds = self._define_performance_thresholds()
|
|
self.cost_benchmarks = self._define_cost_benchmarks()
|
|
|
|
def _define_error_patterns(self) -> Dict[str, Dict[str, Any]]:
|
|
"""Define common error patterns and their classifications"""
|
|
return {
|
|
"timeout": {
|
|
"patterns": [r"timeout", r"timed out", r"deadline exceeded"],
|
|
"category": "performance",
|
|
"severity": "high",
|
|
"common_fixes": [
|
|
"Increase timeout values",
|
|
"Optimize slow operations",
|
|
"Add retry logic with exponential backoff",
|
|
"Parallelize independent operations"
|
|
]
|
|
},
|
|
"rate_limit": {
|
|
"patterns": [r"rate limit", r"too many requests", r"quota exceeded"],
|
|
"category": "resource",
|
|
"severity": "medium",
|
|
"common_fixes": [
|
|
"Implement request throttling",
|
|
"Add circuit breaker pattern",
|
|
"Use request queuing",
|
|
"Negotiate higher limits"
|
|
]
|
|
},
|
|
"authentication": {
|
|
"patterns": [r"unauthorized", r"authentication failed", r"invalid credentials"],
|
|
"category": "security",
|
|
"severity": "high",
|
|
"common_fixes": [
|
|
"Check credential rotation",
|
|
"Implement token refresh logic",
|
|
"Add authentication retry",
|
|
"Verify permission scopes"
|
|
]
|
|
},
|
|
"network": {
|
|
"patterns": [r"connection refused", r"network error", r"dns resolution"],
|
|
"category": "infrastructure",
|
|
"severity": "high",
|
|
"common_fixes": [
|
|
"Add network retry logic",
|
|
"Implement fallback endpoints",
|
|
"Use connection pooling",
|
|
"Add health checks"
|
|
]
|
|
},
|
|
"validation": {
|
|
"patterns": [r"validation error", r"invalid input", r"schema violation"],
|
|
"category": "data",
|
|
"severity": "medium",
|
|
"common_fixes": [
|
|
"Strengthen input validation",
|
|
"Add data sanitization",
|
|
"Improve error messages",
|
|
"Add input examples"
|
|
]
|
|
},
|
|
"resource": {
|
|
"patterns": [r"out of memory", r"disk full", r"cpu overload"],
|
|
"category": "resource",
|
|
"severity": "critical",
|
|
"common_fixes": [
|
|
"Scale up resources",
|
|
"Optimize memory usage",
|
|
"Add resource monitoring",
|
|
"Implement graceful degradation"
|
|
]
|
|
}
|
|
}
|
|
|
|
def _define_performance_thresholds(self) -> Dict[str, Any]:
|
|
"""Define performance thresholds for different metrics"""
|
|
return {
|
|
"success_rate": {"excellent": 0.98, "good": 0.95, "acceptable": 0.90, "poor": 0.80},
|
|
"average_duration": {"excellent": 1000, "good": 3000, "acceptable": 10000, "poor": 30000},
|
|
"error_rate": {"excellent": 0.01, "good": 0.03, "acceptable": 0.05, "poor": 0.10},
|
|
"retry_rate": {"excellent": 0.05, "good": 0.10, "acceptable": 0.20, "poor": 0.40},
|
|
"cost_per_task": {"excellent": 0.01, "good": 0.05, "acceptable": 0.10, "poor": 0.25},
|
|
"throughput": {"excellent": 100, "good": 50, "acceptable": 20, "poor": 5} # tasks per hour
|
|
}
|
|
|
|
def _define_cost_benchmarks(self) -> Dict[str, Any]:
|
|
"""Define cost benchmarks for different operations"""
|
|
return {
|
|
"token_costs": {
|
|
"gpt-4": {"input": 0.00003, "output": 0.00006},
|
|
"gpt-3.5-turbo": {"input": 0.000002, "output": 0.000002},
|
|
"claude-3": {"input": 0.000015, "output": 0.000075}
|
|
},
|
|
"operation_costs": {
|
|
"simple_task": 0.005,
|
|
"complex_task": 0.050,
|
|
"research_task": 0.020,
|
|
"analysis_task": 0.030,
|
|
"generation_task": 0.015
|
|
}
|
|
}
|
|
|
|
def parse_execution_logs(self, logs_data: List[Dict[str, Any]]) -> List[ExecutionLog]:
|
|
"""Parse raw execution logs into structured format"""
|
|
logs = []
|
|
|
|
for log_entry in logs_data:
|
|
try:
|
|
log = ExecutionLog(
|
|
task_id=log_entry.get("task_id", ""),
|
|
agent_id=log_entry.get("agent_id", ""),
|
|
task_type=log_entry.get("task_type", "unknown"),
|
|
task_description=log_entry.get("task_description", ""),
|
|
start_time=log_entry.get("start_time", ""),
|
|
end_time=log_entry.get("end_time", ""),
|
|
duration_ms=log_entry.get("duration_ms", 0),
|
|
status=log_entry.get("status", "unknown"),
|
|
actions=log_entry.get("actions", []),
|
|
results=log_entry.get("results", {}),
|
|
tokens_used=log_entry.get("tokens_used", {"total_tokens": 0}),
|
|
cost_usd=log_entry.get("cost_usd", 0.0),
|
|
error_details=log_entry.get("error_details"),
|
|
tools_used=log_entry.get("tools_used", []),
|
|
retry_count=log_entry.get("retry_count", 0),
|
|
metadata=log_entry.get("metadata", {})
|
|
)
|
|
logs.append(log)
|
|
except Exception as e:
|
|
print(f"Warning: Failed to parse log entry: {e}", file=sys.stderr)
|
|
continue
|
|
|
|
return logs
|
|
|
|
def calculate_performance_metrics(self, logs: List[ExecutionLog]) -> PerformanceMetrics:
|
|
"""Calculate performance metrics from execution logs"""
|
|
if not logs:
|
|
return PerformanceMetrics(
|
|
total_tasks=0, successful_tasks=0, failed_tasks=0, partial_tasks=0,
|
|
timeout_tasks=0, success_rate=0.0, failure_rate=0.0,
|
|
average_duration_ms=0.0, median_duration_ms=0.0, percentile_95_duration_ms=0.0,
|
|
min_duration_ms=0, max_duration_ms=0, total_tokens_used=0,
|
|
average_tokens_per_task=0.0, total_cost_usd=0.0, average_cost_per_task=0.0,
|
|
cost_per_token=0.0, throughput_tasks_per_hour=0.0, error_rate=0.0, retry_rate=0.0
|
|
)
|
|
|
|
total_tasks = len(logs)
|
|
successful_tasks = sum(1 for log in logs if log.status == "success")
|
|
failed_tasks = sum(1 for log in logs if log.status == "failure")
|
|
partial_tasks = sum(1 for log in logs if log.status == "partial")
|
|
timeout_tasks = sum(1 for log in logs if log.status == "timeout")
|
|
|
|
success_rate = successful_tasks / total_tasks if total_tasks > 0 else 0.0
|
|
failure_rate = (failed_tasks + timeout_tasks) / total_tasks if total_tasks > 0 else 0.0
|
|
|
|
durations = [log.duration_ms for log in logs if log.duration_ms > 0]
|
|
if durations:
|
|
average_duration_ms = statistics.mean(durations)
|
|
median_duration_ms = statistics.median(durations)
|
|
percentile_95_duration_ms = self._percentile(durations, 95)
|
|
min_duration_ms = min(durations)
|
|
max_duration_ms = max(durations)
|
|
else:
|
|
average_duration_ms = median_duration_ms = percentile_95_duration_ms = 0.0
|
|
min_duration_ms = max_duration_ms = 0
|
|
|
|
total_tokens = sum(log.tokens_used.get("total_tokens", 0) for log in logs)
|
|
average_tokens_per_task = total_tokens / total_tasks if total_tasks > 0 else 0.0
|
|
|
|
total_cost = sum(log.cost_usd for log in logs)
|
|
average_cost_per_task = total_cost / total_tasks if total_tasks > 0 else 0.0
|
|
cost_per_token = total_cost / total_tokens if total_tokens > 0 else 0.0
|
|
|
|
# Calculate throughput (tasks per hour)
|
|
if logs and len(logs) > 1:
|
|
start_time = min(log.start_time for log in logs if log.start_time)
|
|
end_time = max(log.end_time for log in logs if log.end_time)
|
|
if start_time and end_time:
|
|
try:
|
|
start_dt = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
|
|
end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00"))
|
|
time_diff_hours = (end_dt - start_dt).total_seconds() / 3600
|
|
throughput_tasks_per_hour = total_tasks / time_diff_hours if time_diff_hours > 0 else 0.0
|
|
except:
|
|
throughput_tasks_per_hour = 0.0
|
|
else:
|
|
throughput_tasks_per_hour = 0.0
|
|
else:
|
|
throughput_tasks_per_hour = 0.0
|
|
|
|
error_rate = sum(1 for log in logs if log.error_details) / total_tasks if total_tasks > 0 else 0.0
|
|
retry_rate = sum(1 for log in logs if log.retry_count > 0) / total_tasks if total_tasks > 0 else 0.0
|
|
|
|
return PerformanceMetrics(
|
|
total_tasks=total_tasks,
|
|
successful_tasks=successful_tasks,
|
|
failed_tasks=failed_tasks,
|
|
partial_tasks=partial_tasks,
|
|
timeout_tasks=timeout_tasks,
|
|
success_rate=success_rate,
|
|
failure_rate=failure_rate,
|
|
average_duration_ms=average_duration_ms,
|
|
median_duration_ms=median_duration_ms,
|
|
percentile_95_duration_ms=percentile_95_duration_ms,
|
|
min_duration_ms=min_duration_ms,
|
|
max_duration_ms=max_duration_ms,
|
|
total_tokens_used=total_tokens,
|
|
average_tokens_per_task=average_tokens_per_task,
|
|
total_cost_usd=total_cost,
|
|
average_cost_per_task=average_cost_per_task,
|
|
cost_per_token=cost_per_token,
|
|
throughput_tasks_per_hour=throughput_tasks_per_hour,
|
|
error_rate=error_rate,
|
|
retry_rate=retry_rate
|
|
)
|
|
|
|
def _percentile(self, data: List[float], percentile: int) -> float:
|
|
"""Calculate percentile value from data"""
|
|
if not data:
|
|
return 0.0
|
|
sorted_data = sorted(data)
|
|
index = (percentile / 100) * (len(sorted_data) - 1)
|
|
if index.is_integer():
|
|
return sorted_data[int(index)]
|
|
else:
|
|
lower_index = int(index)
|
|
upper_index = lower_index + 1
|
|
weight = index - lower_index
|
|
return sorted_data[lower_index] * (1 - weight) + sorted_data[upper_index] * weight
|
|
|
|
    def analyze_errors(self, logs: List[ExecutionLog]) -> List[ErrorAnalysis]:
        """Analyze error patterns in execution logs.

        Classifies every log carrying error_details against the regex table
        in self.error_patterns (first matching class wins), builds one
        ErrorAnalysis per matched class plus an "unclassified" bucket, and
        sorts the result so high-impact, frequent error types come first.
        """
        error_analyses = []

        # Collect all errors, keeping enough context to attribute them later.
        errors = []
        for log in logs:
            if log.error_details:
                errors.append({
                    "error": log.error_details,
                    "agent_id": log.agent_id,
                    "task_type": log.task_type,
                    "task_id": log.task_id
                })

        if not errors:
            return error_analyses

        # Group errors by pattern
        error_groups = defaultdict(list)
        unclassified_errors = []

        for error in errors:
            # str() of the raw details dict is the haystack the regexes scan.
            error_message = str(error.get("error", {})).lower()
            classified = False

            for pattern_name, pattern_info in self.error_patterns.items():
                for pattern in pattern_info["patterns"]:
                    if re.search(pattern, error_message):
                        error_groups[pattern_name].append(error)
                        classified = True
                        break
                if classified:
                    break  # first matching class wins; an error is never double-counted

            if not classified:
                unclassified_errors.append(error)

        # Analyze each error group
        total_errors = len(errors)

        for error_type, error_list in error_groups.items():
            count = len(error_list)
            percentage = (count / total_errors) * 100 if total_errors > 0 else 0.0

            affected_agents = list(set(error["agent_id"] for error in error_list))
            affected_task_types = list(set(error["task_type"] for error in error_list))

            # Extract common patterns from error messages
            common_patterns = self._extract_common_patterns([str(e["error"]) for e in error_list])

            # Get suggested fixes from the static pattern table.
            pattern_info = self.error_patterns.get(error_type, {})
            suggested_fixes = pattern_info.get("common_fixes", [])

            # Determine impact level: either high frequency or a severe
            # configured classification escalates the rating.
            if percentage > 20 or pattern_info.get("severity") == "critical":
                impact_level = "high"
            elif percentage > 10 or pattern_info.get("severity") == "high":
                impact_level = "medium"
            else:
                impact_level = "low"

            error_analysis = ErrorAnalysis(
                error_type=error_type,
                count=count,
                percentage=percentage,
                affected_agents=affected_agents,
                affected_task_types=affected_task_types,
                common_patterns=common_patterns,
                suggested_fixes=suggested_fixes,
                impact_level=impact_level
            )

            error_analyses.append(error_analysis)

        # Handle unclassified errors as their own bucket so they stay visible.
        if unclassified_errors:
            count = len(unclassified_errors)
            percentage = (count / total_errors) * 100

            error_analysis = ErrorAnalysis(
                error_type="unclassified",
                count=count,
                percentage=percentage,
                affected_agents=list(set(error["agent_id"] for error in unclassified_errors)),
                affected_task_types=list(set(error["task_type"] for error in unclassified_errors)),
                common_patterns=self._extract_common_patterns([str(e["error"]) for e in unclassified_errors]),
                suggested_fixes=["Review and classify error patterns", "Add specific error handling"],
                impact_level="medium" if percentage > 10 else "low"
            )

            error_analyses.append(error_analysis)

        # Sort by impact and count: high-impact groups first, then most frequent.
        error_analyses.sort(key=lambda x: (x.impact_level == "high", x.count), reverse=True)

        return error_analyses
|
def _extract_common_patterns(self, error_messages: List[str]) -> List[str]:
|
|
"""Extract common patterns from error messages"""
|
|
if not error_messages:
|
|
return []
|
|
|
|
# Simple pattern extraction - find common phrases
|
|
word_counts = Counter()
|
|
for message in error_messages:
|
|
words = re.findall(r'\w+', message.lower())
|
|
for word in words:
|
|
if len(word) > 3: # Ignore short words
|
|
word_counts[word] += 1
|
|
|
|
# Return most common words/patterns
|
|
common_patterns = [word for word, count in word_counts.most_common(5)
|
|
if count > 1]
|
|
|
|
return common_patterns
|
|
|
|
    def identify_bottlenecks(self, logs: List[ExecutionLog],
                             agent_metrics: Dict[str, PerformanceMetrics]) -> List[BottleneckAnalysis]:
        """Identify system bottlenecks.

        Scans four areas with fixed heuristics:
        - agents with success rate below 0.8 or average latency above 30s,
        - tools whose error rate exceeds 20%,
        - inter-agent communication when more than 5 actions exceeded 5s,
        - token usage when more than 10 tasks used over 2x the mean tokens.

        Results are sorted by severity, then by the summed impact estimate.
        Several of the impact/improvement figures are heuristic constants,
        not measurements.
        """
        bottlenecks = []

        # Agent performance bottlenecks
        for agent_id, metrics in agent_metrics.items():
            # Low success rate: below 80% flags the agent, below 50% is critical.
            if metrics.success_rate < 0.8:
                severity = "critical" if metrics.success_rate < 0.5 else "high"
                bottlenecks.append(BottleneckAnalysis(
                    bottleneck_type="agent",
                    location=agent_id,
                    severity=severity,
                    description=f"Agent {agent_id} has low success rate ({metrics.success_rate:.1%})",
                    impact_on_performance={
                        # Gap to a 95% target, expressed in percentage points.
                        "success_rate_impact": (0.95 - metrics.success_rate) * 100,
                        "cost_impact": metrics.average_cost_per_task * metrics.failed_tasks
                    },
                    affected_workflows=self._get_agent_workflows(agent_id, logs),
                    optimization_suggestions=[
                        "Review and improve agent logic",
                        "Add better error handling",
                        "Optimize tool usage",
                        "Consider agent specialization"
                    ],
                    estimated_improvement={
                        "success_rate_gain": min(0.15, 0.95 - metrics.success_rate),
                        "cost_reduction": metrics.average_cost_per_task * 0.2
                    }
                ))

            # High latency: above 30s flags the agent, above 60s escalates.
            if metrics.average_duration_ms > 30000:  # 30 seconds
                severity = "high" if metrics.average_duration_ms > 60000 else "medium"
                bottlenecks.append(BottleneckAnalysis(
                    bottleneck_type="agent",
                    location=agent_id,
                    severity=severity,
                    description=f"Agent {agent_id} has high latency ({metrics.average_duration_ms/1000:.1f}s avg)",
                    impact_on_performance={
                        # Excess over a 10s baseline, in milliseconds.
                        "latency_impact": metrics.average_duration_ms - 10000,
                        "throughput_impact": max(0, 50 - metrics.total_tasks)
                    },
                    affected_workflows=self._get_agent_workflows(agent_id, logs),
                    optimization_suggestions=[
                        "Profile and optimize slow operations",
                        "Implement caching strategies",
                        "Parallelize independent tasks",
                        "Optimize API calls"
                    ],
                    estimated_improvement={
                        "latency_reduction": min(0.5, (metrics.average_duration_ms - 10000) / metrics.average_duration_ms),
                        "throughput_gain": 1.3
                    }
                ))

        # Tool usage bottlenecks: any tool failing in over 20% of its uses.
        tool_usage = self._analyze_tool_usage(logs)
        for tool, usage_stats in tool_usage.items():
            if usage_stats.get("error_rate", 0) > 0.2:
                bottlenecks.append(BottleneckAnalysis(
                    bottleneck_type="tool",
                    location=tool,
                    severity="high" if usage_stats["error_rate"] > 0.4 else "medium",
                    description=f"Tool {tool} has high error rate ({usage_stats['error_rate']:.1%})",
                    impact_on_performance={
                        "reliability_impact": usage_stats["error_rate"] * usage_stats["usage_count"],
                        "retry_overhead": usage_stats.get("retry_count", 0) * 1000  # ms
                    },
                    affected_workflows=usage_stats.get("affected_workflows", []),
                    optimization_suggestions=[
                        "Review tool implementation",
                        "Add better error handling for tool",
                        "Implement tool fallbacks",
                        "Consider alternative tools"
                    ],
                    estimated_improvement={
                        "error_reduction": usage_stats["error_rate"] * 0.7,
                        "performance_gain": 1.2
                    }
                ))

        # Communication bottlenecks: many slow (>5s) inter-agent actions.
        communication_analysis = self._analyze_communication_patterns(logs)
        if communication_analysis.get("high_latency_communications", 0) > 5:
            bottlenecks.append(BottleneckAnalysis(
                bottleneck_type="communication",
                location="inter_agent_communication",
                severity="medium",
                description="High latency in inter-agent communications detected",
                impact_on_performance={
                    "communication_overhead": communication_analysis.get("avg_communication_latency", 0),
                    "coordination_efficiency": 0.8  # Assumed impact
                },
                affected_workflows=communication_analysis.get("affected_workflows", []),
                optimization_suggestions=[
                    "Optimize message serialization",
                    "Implement message batching",
                    "Add communication caching",
                    "Consider direct communication patterns"
                ],
                estimated_improvement={
                    "communication_latency_reduction": 0.4,
                    "overall_efficiency_gain": 1.15
                }
            ))

        # Resource bottlenecks: many tasks with unusually high token usage.
        resource_analysis = self._analyze_resource_usage(logs)
        if resource_analysis.get("high_token_usage_tasks", 0) > 10:
            bottlenecks.append(BottleneckAnalysis(
                bottleneck_type="resource",
                location="token_usage",
                severity="medium",
                description="High token usage detected in multiple tasks",
                impact_on_performance={
                    "cost_impact": resource_analysis.get("excess_token_cost", 0),
                    "latency_impact": resource_analysis.get("token_processing_overhead", 0)
                },
                affected_workflows=resource_analysis.get("high_usage_workflows", []),
                optimization_suggestions=[
                    "Optimize prompt engineering",
                    "Implement response caching",
                    "Use more efficient models for simple tasks",
                    "Add token usage monitoring"
                ],
                estimated_improvement={
                    "cost_reduction": 0.3,
                    "efficiency_gain": 1.1
                }
            ))

        # Sort bottlenecks by severity, then by total estimated impact (descending).
        severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
        bottlenecks.sort(key=lambda x: (severity_order[x.severity],
                                        -sum(x.impact_on_performance.values())))

        return bottlenecks
|
def _get_agent_workflows(self, agent_id: str, logs: List[ExecutionLog]) -> List[str]:
|
|
"""Get workflows affected by a specific agent"""
|
|
workflows = set()
|
|
for log in logs:
|
|
if log.agent_id == agent_id:
|
|
workflows.add(log.task_type)
|
|
return list(workflows)
|
|
|
|
def _analyze_tool_usage(self, logs: List[ExecutionLog]) -> Dict[str, Dict[str, Any]]:
|
|
"""Analyze tool usage patterns"""
|
|
tool_stats = defaultdict(lambda: {
|
|
"usage_count": 0,
|
|
"error_count": 0,
|
|
"total_duration": 0,
|
|
"affected_workflows": set(),
|
|
"retry_count": 0
|
|
})
|
|
|
|
for log in logs:
|
|
for tool in log.tools_used:
|
|
stats = tool_stats[tool]
|
|
stats["usage_count"] += 1
|
|
stats["total_duration"] += log.duration_ms
|
|
stats["affected_workflows"].add(log.task_type)
|
|
|
|
if log.error_details:
|
|
stats["error_count"] += 1
|
|
if log.retry_count > 0:
|
|
stats["retry_count"] += log.retry_count
|
|
|
|
# Calculate derived metrics
|
|
result = {}
|
|
for tool, stats in tool_stats.items():
|
|
result[tool] = {
|
|
"usage_count": stats["usage_count"],
|
|
"error_rate": stats["error_count"] / stats["usage_count"] if stats["usage_count"] > 0 else 0,
|
|
"avg_duration": stats["total_duration"] / stats["usage_count"] if stats["usage_count"] > 0 else 0,
|
|
"affected_workflows": list(stats["affected_workflows"]),
|
|
"retry_count": stats["retry_count"]
|
|
}
|
|
|
|
return result
|
|
|
|
def _analyze_communication_patterns(self, logs: List[ExecutionLog]) -> Dict[str, Any]:
|
|
"""Analyze communication patterns between agents"""
|
|
# This is a simplified analysis - in a real system, you'd have more detailed communication logs
|
|
communication_actions = []
|
|
for log in logs:
|
|
for action in log.actions:
|
|
if action.get("type") in ["message", "delegate", "coordinate", "respond"]:
|
|
communication_actions.append({
|
|
"duration": action.get("duration_ms", 0),
|
|
"success": action.get("success", True),
|
|
"workflow": log.task_type
|
|
})
|
|
|
|
if not communication_actions:
|
|
return {}
|
|
|
|
avg_latency = sum(action["duration"] for action in communication_actions) / len(communication_actions)
|
|
high_latency_count = sum(1 for action in communication_actions if action["duration"] > 5000)
|
|
|
|
return {
|
|
"total_communications": len(communication_actions),
|
|
"avg_communication_latency": avg_latency,
|
|
"high_latency_communications": high_latency_count,
|
|
"affected_workflows": list(set(action["workflow"] for action in communication_actions))
|
|
}
|
|
|
|
def _analyze_resource_usage(self, logs: List[ExecutionLog]) -> Dict[str, Any]:
|
|
"""Analyze resource usage patterns"""
|
|
token_usage = [log.tokens_used.get("total_tokens", 0) for log in logs]
|
|
|
|
if not token_usage:
|
|
return {}
|
|
|
|
avg_tokens = sum(token_usage) / len(token_usage)
|
|
high_usage_threshold = avg_tokens * 2
|
|
high_usage_tasks = sum(1 for tokens in token_usage if tokens > high_usage_threshold)
|
|
|
|
# Estimate excess cost
|
|
excess_tokens = sum(max(0, tokens - avg_tokens) for tokens in token_usage)
|
|
excess_cost = excess_tokens * 0.00002 # Rough estimate
|
|
|
|
return {
|
|
"avg_token_usage": avg_tokens,
|
|
"high_token_usage_tasks": high_usage_tasks,
|
|
"excess_token_cost": excess_cost,
|
|
"token_processing_overhead": high_usage_tasks * 500, # Estimated overhead in ms
|
|
"high_usage_workflows": [log.task_type for log in logs
|
|
if log.tokens_used.get("total_tokens", 0) > high_usage_threshold]
|
|
}
|
|
|
|
def generate_optimization_recommendations(self,
|
|
system_metrics: PerformanceMetrics,
|
|
error_analyses: List[ErrorAnalysis],
|
|
bottlenecks: List[BottleneckAnalysis]) -> List[OptimizationRecommendation]:
|
|
"""Generate optimization recommendations based on analysis"""
|
|
recommendations = []
|
|
|
|
# Performance optimization recommendations
|
|
if system_metrics.success_rate < 0.9:
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="reliability",
|
|
priority="high",
|
|
title="Improve System Reliability",
|
|
description=f"System success rate is {system_metrics.success_rate:.1%}, below target of 90%",
|
|
implementation_effort="medium",
|
|
expected_impact={
|
|
"success_rate_improvement": min(0.1, 0.95 - system_metrics.success_rate),
|
|
"cost_reduction": system_metrics.average_cost_per_task * 0.15
|
|
},
|
|
estimated_cost_savings=system_metrics.total_cost_usd * 0.1,
|
|
estimated_performance_gain=1.2,
|
|
implementation_steps=[
|
|
"Identify and fix top error patterns",
|
|
"Implement better error handling and retries",
|
|
"Add comprehensive monitoring and alerting",
|
|
"Implement graceful degradation patterns"
|
|
],
|
|
risks=["Temporary increase in complexity", "Potential initial performance overhead"],
|
|
prerequisites=["Error analysis completion", "Monitoring infrastructure"]
|
|
))
|
|
|
|
# Cost optimization recommendations
|
|
if system_metrics.average_cost_per_task > 0.1:
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="cost",
|
|
priority="medium",
|
|
title="Optimize Token Usage and Costs",
|
|
description=f"Average cost per task (${system_metrics.average_cost_per_task:.3f}) is above optimal range",
|
|
implementation_effort="low",
|
|
expected_impact={
|
|
"cost_reduction": system_metrics.average_cost_per_task * 0.3,
|
|
"efficiency_improvement": 1.15
|
|
},
|
|
estimated_cost_savings=system_metrics.total_cost_usd * 0.3,
|
|
estimated_performance_gain=1.05,
|
|
implementation_steps=[
|
|
"Implement prompt optimization",
|
|
"Add response caching for repeated queries",
|
|
"Use smaller models for simple tasks",
|
|
"Implement token usage monitoring and alerts"
|
|
],
|
|
risks=["Potential quality reduction with smaller models"],
|
|
prerequisites=["Token usage analysis", "Caching infrastructure"]
|
|
))
|
|
|
|
# Performance optimization recommendations
|
|
if system_metrics.average_duration_ms > 10000:
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="performance",
|
|
priority="high",
|
|
title="Reduce Task Latency",
|
|
description=f"Average task duration ({system_metrics.average_duration_ms/1000:.1f}s) exceeds target",
|
|
implementation_effort="high",
|
|
expected_impact={
|
|
"latency_reduction": min(0.5, (system_metrics.average_duration_ms - 5000) / system_metrics.average_duration_ms),
|
|
"throughput_improvement": 1.5
|
|
},
|
|
estimated_performance_gain=1.4,
|
|
implementation_steps=[
|
|
"Profile and optimize slow operations",
|
|
"Implement parallel processing where possible",
|
|
"Add caching for expensive operations",
|
|
"Optimize API calls and reduce round trips"
|
|
],
|
|
risks=["Increased system complexity", "Potential resource usage increase"],
|
|
prerequisites=["Performance profiling tools", "Caching infrastructure"]
|
|
))
|
|
|
|
# Error-based recommendations
|
|
high_impact_errors = [ea for ea in error_analyses if ea.impact_level == "high"]
|
|
if high_impact_errors:
|
|
for error_analysis in high_impact_errors[:3]: # Top 3 high impact errors
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="reliability",
|
|
priority="high",
|
|
title=f"Address {error_analysis.error_type.title()} Errors",
|
|
description=f"{error_analysis.error_type.title()} errors occur in {error_analysis.percentage:.1f}% of cases",
|
|
implementation_effort="medium",
|
|
expected_impact={
|
|
"error_reduction": error_analysis.percentage / 100,
|
|
"reliability_improvement": 1.1
|
|
},
|
|
estimated_cost_savings=system_metrics.total_cost_usd * (error_analysis.percentage / 100) * 0.5,
|
|
implementation_steps=error_analysis.suggested_fixes,
|
|
risks=["May require significant code changes"],
|
|
prerequisites=["Root cause analysis", "Testing framework"]
|
|
))
|
|
|
|
# Bottleneck-based recommendations
|
|
critical_bottlenecks = [b for b in bottlenecks if b.severity in ["critical", "high"]]
|
|
for bottleneck in critical_bottlenecks[:2]: # Top 2 critical bottlenecks
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="performance",
|
|
priority="high" if bottleneck.severity == "critical" else "medium",
|
|
title=f"Address {bottleneck.bottleneck_type.title()} Bottleneck",
|
|
description=bottleneck.description,
|
|
implementation_effort="medium",
|
|
expected_impact=bottleneck.estimated_improvement,
|
|
estimated_performance_gain=list(bottleneck.estimated_improvement.values())[0] if bottleneck.estimated_improvement else 1.1,
|
|
implementation_steps=bottleneck.optimization_suggestions,
|
|
risks=["System downtime during implementation", "Potential cascade effects"],
|
|
prerequisites=["Impact assessment", "Rollback plan"]
|
|
))
|
|
|
|
# Scalability recommendations
|
|
if system_metrics.throughput_tasks_per_hour < 20:
|
|
recommendations.append(OptimizationRecommendation(
|
|
category="scalability",
|
|
priority="medium",
|
|
title="Improve System Scalability",
|
|
description="Current throughput indicates potential scalability issues",
|
|
implementation_effort="high",
|
|
expected_impact={
|
|
"throughput_improvement": 2.0,
|
|
"scalability_headroom": 5.0
|
|
},
|
|
estimated_performance_gain=2.0,
|
|
implementation_steps=[
|
|
"Implement horizontal scaling for agents",
|
|
"Add load balancing and resource pooling",
|
|
"Optimize resource allocation algorithms",
|
|
"Implement auto-scaling policies"
|
|
],
|
|
risks=["High implementation complexity", "Increased operational overhead"],
|
|
prerequisites=["Infrastructure scaling capability", "Monitoring and metrics"]
|
|
))
|
|
|
|
# Sort recommendations by priority and impact
|
|
priority_order = {"high": 0, "medium": 1, "low": 2}
|
|
recommendations.sort(key=lambda x: (
|
|
priority_order[x.priority],
|
|
-x.estimated_performance_gain if x.estimated_performance_gain else 0,
|
|
-x.estimated_cost_savings if x.estimated_cost_savings else 0
|
|
))
|
|
|
|
return recommendations
|
|
|
|
def generate_report(self, logs: List[ExecutionLog]) -> EvaluationReport:
|
|
"""Generate complete evaluation report"""
|
|
|
|
# Calculate system metrics
|
|
system_metrics = self.calculate_performance_metrics(logs)
|
|
|
|
# Calculate per-agent metrics
|
|
agents = set(log.agent_id for log in logs)
|
|
agent_metrics = {}
|
|
for agent_id in agents:
|
|
agent_logs = [log for log in logs if log.agent_id == agent_id]
|
|
agent_metrics[agent_id] = self.calculate_performance_metrics(agent_logs)
|
|
|
|
# Calculate per-task-type metrics
|
|
task_types = set(log.task_type for log in logs)
|
|
task_type_metrics = {}
|
|
for task_type in task_types:
|
|
task_logs = [log for log in logs if log.task_type == task_type]
|
|
task_type_metrics[task_type] = self.calculate_performance_metrics(task_logs)
|
|
|
|
# Analyze tool usage
|
|
tool_usage_analysis = self._analyze_tool_usage(logs)
|
|
|
|
# Analyze errors
|
|
error_analysis = self.analyze_errors(logs)
|
|
|
|
# Identify bottlenecks
|
|
bottleneck_analysis = self.identify_bottlenecks(logs, agent_metrics)
|
|
|
|
# Generate optimization recommendations
|
|
optimization_recommendations = self.generate_optimization_recommendations(
|
|
system_metrics, error_analysis, bottleneck_analysis)
|
|
|
|
# Generate trends analysis (simplified)
|
|
trends_analysis = self._generate_trends_analysis(logs)
|
|
|
|
# Generate cost breakdown
|
|
cost_breakdown = self._generate_cost_breakdown(logs, agent_metrics)
|
|
|
|
# Check SLA compliance
|
|
sla_compliance = self._check_sla_compliance(system_metrics)
|
|
|
|
# Create summary
|
|
summary = {
|
|
"evaluation_period": {
|
|
"start_time": min(log.start_time for log in logs if log.start_time) if logs else None,
|
|
"end_time": max(log.end_time for log in logs if log.end_time) if logs else None,
|
|
"total_duration_hours": system_metrics.total_tasks / system_metrics.throughput_tasks_per_hour if system_metrics.throughput_tasks_per_hour > 0 else 0
|
|
},
|
|
"overall_health": self._assess_overall_health(system_metrics),
|
|
"key_findings": self._extract_key_findings(system_metrics, error_analysis, bottleneck_analysis),
|
|
"critical_issues": len([b for b in bottleneck_analysis if b.severity == "critical"]),
|
|
"improvement_opportunities": len(optimization_recommendations)
|
|
}
|
|
|
|
# Create metadata
|
|
metadata = {
|
|
"generated_at": datetime.now().isoformat(),
|
|
"evaluator_version": "1.0",
|
|
"total_logs_processed": len(logs),
|
|
"agents_analyzed": len(agents),
|
|
"task_types_analyzed": len(task_types),
|
|
"analysis_completeness": "full"
|
|
}
|
|
|
|
return EvaluationReport(
|
|
summary=summary,
|
|
system_metrics=system_metrics,
|
|
agent_metrics=agent_metrics,
|
|
task_type_metrics=task_type_metrics,
|
|
tool_usage_analysis=tool_usage_analysis,
|
|
error_analysis=error_analysis,
|
|
bottleneck_analysis=bottleneck_analysis,
|
|
optimization_recommendations=optimization_recommendations,
|
|
trends_analysis=trends_analysis,
|
|
cost_breakdown=cost_breakdown,
|
|
sla_compliance=sla_compliance,
|
|
metadata=metadata
|
|
)
|
|
|
|
def _generate_trends_analysis(self, logs: List[ExecutionLog]) -> Dict[str, Any]:
|
|
"""Generate trends analysis (simplified version)"""
|
|
# Group logs by time periods (daily)
|
|
daily_metrics = defaultdict(list)
|
|
|
|
for log in logs:
|
|
if log.start_time:
|
|
try:
|
|
date = log.start_time.split('T')[0] # Extract date part
|
|
daily_metrics[date].append(log)
|
|
except:
|
|
continue
|
|
|
|
trends = {}
|
|
if len(daily_metrics) > 1:
|
|
daily_success_rates = {}
|
|
daily_avg_durations = {}
|
|
daily_costs = {}
|
|
|
|
for date, date_logs in daily_metrics.items():
|
|
if date_logs:
|
|
metrics = self.calculate_performance_metrics(date_logs)
|
|
daily_success_rates[date] = metrics.success_rate
|
|
daily_avg_durations[date] = metrics.average_duration_ms
|
|
daily_costs[date] = metrics.total_cost_usd
|
|
|
|
trends = {
|
|
"daily_success_rates": daily_success_rates,
|
|
"daily_avg_durations": daily_avg_durations,
|
|
"daily_costs": daily_costs,
|
|
"trend_direction": {
|
|
"success_rate": "stable", # Simplified
|
|
"duration": "stable",
|
|
"cost": "stable"
|
|
}
|
|
}
|
|
|
|
return trends
|
|
|
|
def _generate_cost_breakdown(self, logs: List[ExecutionLog],
|
|
agent_metrics: Dict[str, PerformanceMetrics]) -> Dict[str, Any]:
|
|
"""Generate cost breakdown analysis"""
|
|
total_cost = sum(log.cost_usd for log in logs)
|
|
|
|
# Cost by agent
|
|
agent_costs = {}
|
|
for agent_id, metrics in agent_metrics.items():
|
|
agent_costs[agent_id] = metrics.total_cost_usd
|
|
|
|
# Cost by task type
|
|
task_type_costs = defaultdict(float)
|
|
for log in logs:
|
|
task_type_costs[log.task_type] += log.cost_usd
|
|
|
|
# Token cost breakdown
|
|
total_tokens = sum(log.tokens_used.get("total_tokens", 0) for log in logs)
|
|
|
|
return {
|
|
"total_cost": total_cost,
|
|
"cost_by_agent": dict(agent_costs),
|
|
"cost_by_task_type": dict(task_type_costs),
|
|
"cost_per_token": total_cost / total_tokens if total_tokens > 0 else 0,
|
|
"top_cost_drivers": sorted(task_type_costs.items(), key=lambda x: x[1], reverse=True)[:5]
|
|
}
|
|
|
|
def _check_sla_compliance(self, metrics: PerformanceMetrics) -> Dict[str, Any]:
|
|
"""Check SLA compliance"""
|
|
thresholds = self.performance_thresholds
|
|
|
|
compliance = {
|
|
"success_rate": {
|
|
"target": 0.95,
|
|
"actual": metrics.success_rate,
|
|
"compliant": metrics.success_rate >= 0.95,
|
|
"gap": max(0, 0.95 - metrics.success_rate)
|
|
},
|
|
"average_latency": {
|
|
"target": 10000, # 10 seconds
|
|
"actual": metrics.average_duration_ms,
|
|
"compliant": metrics.average_duration_ms <= 10000,
|
|
"gap": max(0, metrics.average_duration_ms - 10000)
|
|
},
|
|
"error_rate": {
|
|
"target": 0.05, # 5%
|
|
"actual": metrics.error_rate,
|
|
"compliant": metrics.error_rate <= 0.05,
|
|
"gap": max(0, metrics.error_rate - 0.05)
|
|
}
|
|
}
|
|
|
|
overall_compliance = all(sla["compliant"] for sla in compliance.values())
|
|
|
|
return {
|
|
"overall_compliant": overall_compliance,
|
|
"sla_details": compliance,
|
|
"compliance_score": sum(1 for sla in compliance.values() if sla["compliant"]) / len(compliance)
|
|
}
|
|
|
|
def _assess_overall_health(self, metrics: PerformanceMetrics) -> str:
|
|
"""Assess overall system health"""
|
|
health_score = 0
|
|
|
|
# Success rate contribution (40%)
|
|
if metrics.success_rate >= 0.95:
|
|
health_score += 40
|
|
elif metrics.success_rate >= 0.90:
|
|
health_score += 30
|
|
elif metrics.success_rate >= 0.80:
|
|
health_score += 20
|
|
else:
|
|
health_score += 10
|
|
|
|
# Performance contribution (30%)
|
|
if metrics.average_duration_ms <= 5000:
|
|
health_score += 30
|
|
elif metrics.average_duration_ms <= 10000:
|
|
health_score += 20
|
|
elif metrics.average_duration_ms <= 30000:
|
|
health_score += 15
|
|
else:
|
|
health_score += 5
|
|
|
|
# Error rate contribution (20%)
|
|
if metrics.error_rate <= 0.02:
|
|
health_score += 20
|
|
elif metrics.error_rate <= 0.05:
|
|
health_score += 15
|
|
elif metrics.error_rate <= 0.10:
|
|
health_score += 10
|
|
else:
|
|
health_score += 0
|
|
|
|
# Cost efficiency contribution (10%)
|
|
if metrics.cost_per_token <= 0.00005:
|
|
health_score += 10
|
|
elif metrics.cost_per_token <= 0.0001:
|
|
health_score += 7
|
|
else:
|
|
health_score += 3
|
|
|
|
if health_score >= 85:
|
|
return "excellent"
|
|
elif health_score >= 70:
|
|
return "good"
|
|
elif health_score >= 50:
|
|
return "fair"
|
|
else:
|
|
return "poor"
|
|
|
|
def _extract_key_findings(self, metrics: PerformanceMetrics,
|
|
errors: List[ErrorAnalysis],
|
|
bottlenecks: List[BottleneckAnalysis]) -> List[str]:
|
|
"""Extract key findings from analysis"""
|
|
findings = []
|
|
|
|
# Performance findings
|
|
if metrics.success_rate < 0.9:
|
|
findings.append(f"Success rate ({metrics.success_rate:.1%}) below target")
|
|
|
|
if metrics.average_duration_ms > 15000:
|
|
findings.append(f"High average latency ({metrics.average_duration_ms/1000:.1f}s)")
|
|
|
|
# Error findings
|
|
high_impact_errors = [e for e in errors if e.impact_level == "high"]
|
|
if high_impact_errors:
|
|
findings.append(f"{len(high_impact_errors)} high-impact error patterns identified")
|
|
|
|
# Bottleneck findings
|
|
critical_bottlenecks = [b for b in bottlenecks if b.severity == "critical"]
|
|
if critical_bottlenecks:
|
|
findings.append(f"{len(critical_bottlenecks)} critical bottlenecks found")
|
|
|
|
# Cost findings
|
|
if metrics.cost_per_token > 0.0001:
|
|
findings.append("Token usage costs above optimal range")
|
|
|
|
return findings
|
|
|
|
|
|
def main():
    """CLI entry point: load execution logs, evaluate them, and emit reports.

    Reads a JSON file containing an "execution_logs" array, runs the full
    evaluation pipeline, writes one or more JSON report files, and prints an
    executive summary to stdout. Exits with status 1 on any failure.
    """
    parser = argparse.ArgumentParser(description="Multi-Agent System Performance Evaluator")
    parser.add_argument("input_file", help="JSON file with execution logs")
    parser.add_argument("-o", "--output", help="Output file prefix (default: evaluation_report)")
    parser.add_argument("--format", choices=["json", "both"], default="both",
                        help="Output format")
    # NOTE(review): --detailed is accepted but never consumed below; kept
    # for CLI backward compatibility until detailed output is implemented.
    parser.add_argument("--detailed", action="store_true",
                        help="Include detailed analysis in output")

    args = parser.parse_args()

    try:
        # Load execution logs
        with open(args.input_file, 'r') as f:
            logs_data = json.load(f)

        # Parse logs into ExecutionLog records
        evaluator = AgentEvaluator()
        logs = evaluator.parse_execution_logs(logs_data.get("execution_logs", []))

        if not logs:
            print("No valid execution logs found in input file", file=sys.stderr)
            sys.exit(1)

        # Generate evaluation report
        report = evaluator.generate_report(logs)

        # Serialize the dataclass tree for JSON output
        output_data = asdict(report)

        # Output files
        output_prefix = args.output or "evaluation_report"

        if args.format in ["json", "both"]:
            with open(f"{output_prefix}.json", 'w') as f:
                json.dump(output_data, f, indent=2, default=str)
            print(f"JSON report written to {output_prefix}.json")

        if args.format == "both":
            # Generate separate detailed files

            # Performance summary
            summary_data = {
                "summary": report.summary,
                "system_metrics": asdict(report.system_metrics),
                "sla_compliance": report.sla_compliance
            }
            with open(f"{output_prefix}_summary.json", 'w') as f:
                json.dump(summary_data, f, indent=2, default=str)
            print(f"Summary report written to {output_prefix}_summary.json")

            # Recommendations. default=str added for consistency with the
            # main report dump: asdict() payloads may contain non-JSON
            # values, and the other dumps already guard against that.
            recommendations_data = {
                "optimization_recommendations": [asdict(rec) for rec in report.optimization_recommendations],
                "bottleneck_analysis": [asdict(b) for b in report.bottleneck_analysis]
            }
            with open(f"{output_prefix}_recommendations.json", 'w') as f:
                json.dump(recommendations_data, f, indent=2, default=str)
            print(f"Recommendations written to {output_prefix}_recommendations.json")

            # Error analysis — likewise serialized with default=str.
            error_data = {
                "error_analysis": [asdict(e) for e in report.error_analysis],
                "error_summary": {
                    "total_errors": sum(e.count for e in report.error_analysis),
                    "high_impact_errors": len([e for e in report.error_analysis if e.impact_level == "high"])
                }
            }
            with open(f"{output_prefix}_errors.json", 'w') as f:
                json.dump(error_data, f, indent=2, default=str)
            print(f"Error analysis written to {output_prefix}_errors.json")

        # Print executive summary
        print(f"\n{'='*60}")
        print("AGENT SYSTEM EVALUATION REPORT")
        print(f"{'='*60}")
        print(f"Overall Health: {report.summary['overall_health'].upper()}")
        print(f"Total Tasks: {report.system_metrics.total_tasks}")
        print(f"Success Rate: {report.system_metrics.success_rate:.1%}")
        print(f"Average Duration: {report.system_metrics.average_duration_ms/1000:.1f}s")
        print(f"Total Cost: ${report.system_metrics.total_cost_usd:.2f}")
        print(f"Agents Analyzed: {len(report.agent_metrics)}")

        print("\nKey Findings:")
        for finding in report.summary['key_findings']:
            print(f"  • {finding}")

        print("\nTop Recommendations:")
        high_priority_recs = [r for r in report.optimization_recommendations if r.priority == "high"][:3]
        for i, rec in enumerate(high_priority_recs, 1):
            print(f"  {i}. {rec.title}")

        if report.summary['critical_issues'] > 0:
            print(f"\n⚠️ CRITICAL: {report.summary['critical_issues']} critical issues require immediate attention")

        print("\n📊 Detailed reports available in generated files")
        print(f"{'='*60}")

    except Exception as e:
        # Top-level CLI boundary: report and exit non-zero rather than
        # dumping a traceback at the user.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()