# --- file-listing metadata (not Python source; preserved as a comment) ---
# Files
# CleanArchitecture-template/.brain/.agent/skills/engineering-advanced-skills/observability-designer/scripts/dashboard_generator.py
# 2026-03-12 15:17:52 +07:00
#
# 1219 lines
# 48 KiB
# Python
#!/usr/bin/env python3
"""
Dashboard Generator - Generate comprehensive dashboard specifications
This script generates dashboard specifications based on service/system descriptions:
- Panel layout optimized for different screen sizes and roles
- Metric queries (Prometheus-style) for comprehensive monitoring
- Visualization types appropriate for different metric types
- Drill-down paths for effective troubleshooting workflows
- Golden signals coverage (latency, traffic, errors, saturation)
- RED/USE method implementation
- Business metrics integration
Usage:
python dashboard_generator.py --input service_definition.json --output dashboard_spec.json
python dashboard_generator.py --service-type api --name "Payment Service" --output payment_dashboard.json
"""
import argparse
import json
import math
import sys
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Tuple
class DashboardGenerator:
    """Generate comprehensive dashboard specifications."""

    # Dashboard layout templates by role. 'primary_focus' drives which
    # optional panel groups _generate_panels adds (business metrics,
    # capacity); 'time_ranges' and 'default_refresh' tune the dashboard's
    # time controls per audience. 'secondary_focus' is carried as metadata.
    ROLE_LAYOUTS = {
        'sre': {
            'primary_focus': ['availability', 'latency', 'errors', 'resource_utilization'],
            'secondary_focus': ['throughput', 'capacity', 'dependencies'],
            'time_ranges': ['1h', '6h', '1d', '7d'],
            'default_refresh': '30s'
        },
        'developer': {
            'primary_focus': ['latency', 'errors', 'throughput', 'business_metrics'],
            'secondary_focus': ['resource_utilization', 'dependencies'],
            'time_ranges': ['15m', '1h', '6h', '1d'],
            'default_refresh': '1m'
        },
        'executive': {
            'primary_focus': ['availability', 'business_metrics', 'user_experience'],
            'secondary_focus': ['cost', 'capacity_trends'],
            'time_ranges': ['1d', '7d', '30d'],
            'default_refresh': '5m'
        },
        'ops': {
            'primary_focus': ['resource_utilization', 'capacity', 'alerts', 'deployments'],
            'secondary_focus': ['throughput', 'latency'],
            'time_ranges': ['5m', '30m', '2h', '1d'],
            'default_refresh': '15s'
        }
    }

    # Service type specific metric configurations. generate_documentation
    # reads 'key_metrics' to list each type's SLIs; the golden signals
    # themselves are rendered by _create_golden_signals_panels.
    SERVICE_METRICS = {
        'api': {
            'golden_signals': ['latency', 'traffic', 'errors', 'saturation'],
            'key_metrics': [
                'http_requests_total',
                'http_request_duration_seconds',
                'http_request_size_bytes',
                'http_response_size_bytes'
            ],
            'resource_metrics': ['cpu_usage', 'memory_usage', 'goroutines']
        },
        'web': {
            'golden_signals': ['latency', 'traffic', 'errors', 'saturation'],
            'key_metrics': [
                'http_requests_total',
                'http_request_duration_seconds',
                'page_load_time',
                'user_sessions'
            ],
            'resource_metrics': ['cpu_usage', 'memory_usage', 'connections']
        },
        'database': {
            'golden_signals': ['latency', 'traffic', 'errors', 'saturation'],
            'key_metrics': [
                'db_connections_active',
                'db_query_duration_seconds',
                'db_queries_total',
                'db_slow_queries_total'
            ],
            'resource_metrics': ['cpu_usage', 'memory_usage', 'disk_io', 'connections']
        },
        'queue': {
            'golden_signals': ['latency', 'traffic', 'errors', 'saturation'],
            'key_metrics': [
                'queue_depth',
                'message_processing_duration',
                'messages_published_total',
                'messages_consumed_total'
            ],
            'resource_metrics': ['cpu_usage', 'memory_usage', 'disk_usage']
        }
    }

    # Visualization type recommendations: metric category -> panel type.
    # NOTE(review): not referenced by any method in this file — presumably
    # advisory metadata for external consumers of the class; confirm.
    VISUALIZATION_TYPES = {
        'latency': 'line_chart',
        'throughput': 'line_chart',
        'error_rate': 'line_chart',
        'success_rate': 'stat',
        'resource_utilization': 'gauge',
        'queue_depth': 'bar_chart',
        'status': 'stat',
        'distribution': 'heatmap',
        'alerts': 'table',
        'logs': 'logs_panel'
    }
    def __init__(self):
        """Initialize the Dashboard Generator."""
        # NOTE(review): neither attribute is read or written by the other
        # methods in this file (they are stateless apart from the class-level
        # templates) — presumably reserved for future use; confirm.
        self.service_config = {}
        self.dashboard_spec = {}
def load_service_definition(self, file_path: str) -> Dict[str, Any]:
"""Load service definition from JSON file."""
try:
with open(file_path, 'r') as f:
return json.load(f)
except FileNotFoundError:
raise ValueError(f"Service definition file not found: {file_path}")
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON in service definition: {e}")
def create_service_definition(self, service_type: str, name: str,
criticality: str = 'medium') -> Dict[str, Any]:
"""Create a service definition from parameters."""
return {
'name': name,
'type': service_type,
'criticality': criticality,
'description': f'{name} - A {criticality} criticality {service_type} service',
'team': 'platform',
'environment': 'production',
'dependencies': [],
'tags': []
}
def generate_dashboard_specification(self, service_def: Dict[str, Any],
target_role: str = 'sre') -> Dict[str, Any]:
"""Generate comprehensive dashboard specification."""
service_name = service_def.get('name', 'Service')
service_type = service_def.get('type', 'api')
# Get role-specific configuration
role_config = self.ROLE_LAYOUTS.get(target_role, self.ROLE_LAYOUTS['sre'])
dashboard_spec = {
'metadata': {
'title': f"{service_name} - {target_role.upper()} Dashboard",
'service': service_def,
'target_role': target_role,
'generated_at': datetime.utcnow().isoformat() + 'Z',
'version': '1.0'
},
'configuration': {
'time_ranges': role_config['time_ranges'],
'default_time_range': role_config['time_ranges'][1], # Second option as default
'refresh_interval': role_config['default_refresh'],
'timezone': 'UTC',
'theme': 'dark'
},
'layout': self._generate_dashboard_layout(service_def, role_config),
'panels': self._generate_panels(service_def, role_config),
'variables': self._generate_template_variables(service_def),
'alerts_integration': self._generate_alerts_integration(service_def),
'drill_down_paths': self._generate_drill_down_paths(service_def)
}
return dashboard_spec
def _generate_dashboard_layout(self, service_def: Dict[str, Any],
role_config: Dict[str, Any]) -> Dict[str, Any]:
"""Generate dashboard layout configuration."""
return {
'grid_settings': {
'width': 24, # Grafana-style 24-column grid
'height_unit': 'px',
'cell_height': 30
},
'sections': [
{
'title': 'Service Overview',
'collapsed': False,
'y_position': 0,
'panels': ['service_status', 'slo_summary', 'error_budget']
},
{
'title': 'Golden Signals',
'collapsed': False,
'y_position': 8,
'panels': ['latency', 'traffic', 'errors', 'saturation']
},
{
'title': 'Resource Utilization',
'collapsed': False,
'y_position': 16,
'panels': ['cpu_usage', 'memory_usage', 'network_io', 'disk_io']
},
{
'title': 'Dependencies & Downstream',
'collapsed': True,
'y_position': 24,
'panels': ['dependency_status', 'downstream_latency', 'circuit_breakers']
}
]
}
def _generate_panels(self, service_def: Dict[str, Any],
role_config: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Generate dashboard panels based on service and role."""
service_name = service_def.get('name', 'service')
service_type = service_def.get('type', 'api')
panels = []
# Service Overview Panels
panels.extend(self._create_overview_panels(service_def))
# Golden Signals Panels
panels.extend(self._create_golden_signals_panels(service_def))
# Resource Utilization Panels
panels.extend(self._create_resource_panels(service_def))
# Service-specific panels
if service_type == 'api':
panels.extend(self._create_api_specific_panels(service_def))
elif service_type == 'database':
panels.extend(self._create_database_specific_panels(service_def))
elif service_type == 'queue':
panels.extend(self._create_queue_specific_panels(service_def))
# Role-specific additional panels
if 'business_metrics' in role_config['primary_focus']:
panels.extend(self._create_business_metrics_panels(service_def))
if 'capacity' in role_config['primary_focus']:
panels.extend(self._create_capacity_panels(service_def))
return panels
    def _create_overview_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Create service overview panels.

        Returns three panels placed on the dashboard's top row (y=0): a
        binary up/down status stat, a 30-day SLO summary (availability %
        and P95 latency), and an error-budget gauge against a 99.9% SLO.
        """
        service_name = service_def.get('name', 'service')
        return [
            {
                'id': 'service_status',
                'title': 'Service Status',
                'type': 'stat',
                'grid_pos': {'x': 0, 'y': 0, 'w': 6, 'h': 4},
                'targets': [
                    {
                        # Prometheus `up` metric: 1 while the scrape target is reachable.
                        'expr': f'up{{service="{service_name}"}}',
                        'legendFormat': 'Status'
                    }
                ],
                'field_config': {
                    'overrides': [
                        {
                            'matcher': {'id': 'byName', 'options': 'Status'},
                            'properties': [
                                {'id': 'color', 'value': {'mode': 'thresholds'}},
                                {'id': 'thresholds', 'value': {
                                    'steps': [
                                        {'color': 'red', 'value': 0},
                                        {'color': 'green', 'value': 1}
                                    ]
                                }},
                                # Map the raw 0/1 sample to readable DOWN/UP text.
                                {'id': 'mappings', 'value': [
                                    {'options': {'0': {'text': 'DOWN'}}, 'type': 'value'},
                                    {'options': {'1': {'text': 'UP'}}, 'type': 'value'}
                                ]}
                            ]
                        }
                    ]
                },
                'options': {
                    'orientation': 'horizontal',
                    'textMode': 'value_and_name'
                }
            },
            {
                'id': 'slo_summary',
                'title': 'SLO Achievement (30d)',
                'type': 'stat',
                'grid_pos': {'x': 6, 'y': 0, 'w': 9, 'h': 4},
                'targets': [
                    {
                        # Availability %: 1 - (5xx responses / all responses) over 30d.
                        'expr': f'(1 - (increase(http_requests_total{{service="{service_name}",code=~"5.."}}[30d]) / increase(http_requests_total{{service="{service_name}"}}[30d]))) * 100',
                        'legendFormat': 'Availability'
                    },
                    {
                        # 30-day P95 latency, converted from seconds to milliseconds.
                        'expr': f'histogram_quantile(0.95, increase(http_request_duration_seconds_bucket{{service="{service_name}"}}[30d])) * 1000',
                        'legendFormat': 'P95 Latency (ms)'
                    }
                ],
                'field_config': {
                    'defaults': {
                        'color': {'mode': 'thresholds'},
                        # NOTE(review): these availability-percent thresholds are
                        # panel defaults, so they also color the latency-in-ms
                        # target above — confirm that is intended.
                        'thresholds': {
                            'steps': [
                                {'color': 'red', 'value': 0},
                                {'color': 'yellow', 'value': 99.0},
                                {'color': 'green', 'value': 99.9}
                            ]
                        }
                    }
                },
                'options': {
                    'orientation': 'horizontal',
                    'textMode': 'value_and_name'
                }
            },
            {
                'id': 'error_budget',
                'title': 'Error Budget Remaining',
                'type': 'gauge',
                'grid_pos': {'x': 15, 'y': 0, 'w': 9, 'h': 4},
                'targets': [
                    {
                        # Remaining budget against a 99.9% SLO: (availability - 0.999)
                        # divided by the 0.001 allowance, expressed as a percentage.
                        'expr': f'(1 - (increase(http_requests_total{{service="{service_name}",code=~"5.."}}[30d]) / increase(http_requests_total{{service="{service_name}"}}[30d])) - 0.999) / 0.001 * 100',
                        'legendFormat': 'Error Budget %'
                    }
                ],
                'field_config': {
                    'defaults': {
                        'color': {'mode': 'thresholds'},
                        'min': 0,
                        'max': 100,
                        'thresholds': {
                            'steps': [
                                {'color': 'red', 'value': 0},
                                {'color': 'yellow', 'value': 25},
                                {'color': 'green', 'value': 50}
                            ]
                        },
                        'unit': 'percent'
                    }
                },
                'options': {
                    'showThresholdLabels': True,
                    'showThresholdMarkers': True
                }
            }
        ]
    def _create_golden_signals_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Create the four golden-signals panels: latency, traffic, errors, saturation.

        All queries are Prometheus expressions over 5-minute rate windows,
        filtered by the service's `service` label. Panels fill two grid
        rows (y=8 and y=14), two panels per row.
        """
        service_name = service_def.get('name', 'service')
        return [
            {
                'id': 'latency',
                'title': 'Request Latency',
                'type': 'timeseries',
                'grid_pos': {'x': 0, 'y': 8, 'w': 12, 'h': 6},
                'targets': [
                    {
                        # Quantiles from the request-duration histogram, seconds -> ms.
                        'expr': f'histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])) * 1000',
                        'legendFormat': 'P50 Latency'
                    },
                    {
                        'expr': f'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])) * 1000',
                        'legendFormat': 'P95 Latency'
                    },
                    {
                        'expr': f'histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])) * 1000',
                        'legendFormat': 'P99 Latency'
                    }
                ],
                'field_config': {
                    'defaults': {
                        'color': {'mode': 'palette-classic'},
                        'unit': 'ms',
                        'custom': {
                            'drawStyle': 'line',
                            'lineInterpolation': 'linear',
                            'lineWidth': 1,
                            'fillOpacity': 10
                        }
                    }
                },
                'options': {
                    'tooltip': {'mode': 'multi', 'sort': 'desc'},
                    'legend': {'displayMode': 'table', 'placement': 'bottom'}
                }
            },
            {
                'id': 'traffic',
                'title': 'Request Rate',
                'type': 'timeseries',
                'grid_pos': {'x': 12, 'y': 8, 'w': 12, 'h': 6},
                'targets': [
                    {
                        'expr': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))',
                        'legendFormat': 'Total RPS'
                    },
                    {
                        # Per-class rates split by HTTP status code prefix.
                        'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"2.."}}[5m]))',
                        'legendFormat': '2xx RPS'
                    },
                    {
                        'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"4.."}}[5m]))',
                        'legendFormat': '4xx RPS'
                    },
                    {
                        'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"5.."}}[5m]))',
                        'legendFormat': '5xx RPS'
                    }
                ],
                'field_config': {
                    'defaults': {
                        'color': {'mode': 'palette-classic'},
                        'unit': 'reqps',
                        'custom': {
                            'drawStyle': 'line',
                            'lineInterpolation': 'linear',
                            'lineWidth': 1,
                            'fillOpacity': 0
                        }
                    }
                },
                'options': {
                    'tooltip': {'mode': 'multi', 'sort': 'desc'},
                    'legend': {'displayMode': 'table', 'placement': 'bottom'}
                }
            },
            {
                'id': 'errors',
                'title': 'Error Rate',
                'type': 'timeseries',
                'grid_pos': {'x': 0, 'y': 14, 'w': 12, 'h': 6},
                'targets': [
                    {
                        # Share of requests answered with 5xx, as a percentage of all requests.
                        'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"5.."}}[5m])) / sum(rate(http_requests_total{{service="{service_name}"}}[5m])) * 100',
                        'legendFormat': '5xx Error Rate'
                    },
                    {
                        'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"4.."}}[5m])) / sum(rate(http_requests_total{{service="{service_name}"}}[5m])) * 100',
                        'legendFormat': '4xx Error Rate'
                    }
                ],
                'field_config': {
                    'defaults': {
                        'color': {'mode': 'palette-classic'},
                        'unit': 'percent',
                        'custom': {
                            'drawStyle': 'line',
                            'lineInterpolation': 'linear',
                            'lineWidth': 2,
                            'fillOpacity': 20
                        }
                    },
                    'overrides': [
                        {
                            # Server errors always render red for quick visual scanning.
                            'matcher': {'id': 'byName', 'options': '5xx Error Rate'},
                            'properties': [{'id': 'color', 'value': {'fixedColor': 'red'}}]
                        }
                    ]
                },
                'options': {
                    'tooltip': {'mode': 'multi', 'sort': 'desc'},
                    'legend': {'displayMode': 'table', 'placement': 'bottom'}
                }
            },
            {
                'id': 'saturation',
                'title': 'Saturation Metrics',
                'type': 'timeseries',
                'grid_pos': {'x': 12, 'y': 14, 'w': 12, 'h': 6},
                'targets': [
                    {
                        # CPU seconds consumed per second of wall clock, as a percentage.
                        'expr': f'rate(process_cpu_seconds_total{{service="{service_name}"}}[5m]) * 100',
                        'legendFormat': 'CPU Usage %'
                    },
                    {
                        # Resident memory as a share of the process's virtual-memory ceiling.
                        'expr': f'process_resident_memory_bytes{{service="{service_name}"}} / process_virtual_memory_max_bytes{{service="{service_name}"}} * 100',
                        'legendFormat': 'Memory Usage %'
                    }
                ],
                'field_config': {
                    'defaults': {
                        'color': {'mode': 'palette-classic'},
                        'unit': 'percent',
                        'max': 100,
                        'custom': {
                            'drawStyle': 'line',
                            'lineInterpolation': 'linear',
                            'lineWidth': 1,
                            'fillOpacity': 10
                        }
                    }
                },
                'options': {
                    'tooltip': {'mode': 'multi', 'sort': 'desc'},
                    'legend': {'displayMode': 'table', 'placement': 'bottom'}
                }
            }
        ]
def _create_resource_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Create resource utilization panels."""
service_name = service_def.get('name', 'service')
return [
{
'id': 'cpu_usage',
'title': 'CPU Usage',
'type': 'gauge',
'grid_pos': {'x': 0, 'y': 20, 'w': 6, 'h': 4},
'targets': [
{
'expr': f'rate(process_cpu_seconds_total{{service="{service_name}"}}[5m]) * 100',
'legendFormat': 'CPU %'
}
],
'field_config': {
'defaults': {
'color': {'mode': 'thresholds'},
'unit': 'percent',
'min': 0,
'max': 100,
'thresholds': {
'steps': [
{'color': 'green', 'value': 0},
{'color': 'yellow', 'value': 70},
{'color': 'red', 'value': 90}
]
}
}
},
'options': {
'showThresholdLabels': True,
'showThresholdMarkers': True
}
},
{
'id': 'memory_usage',
'title': 'Memory Usage',
'type': 'gauge',
'grid_pos': {'x': 6, 'y': 20, 'w': 6, 'h': 4},
'targets': [
{
'expr': f'process_resident_memory_bytes{{service="{service_name}"}} / 1024 / 1024',
'legendFormat': 'Memory MB'
}
],
'field_config': {
'defaults': {
'color': {'mode': 'thresholds'},
'unit': 'decbytes',
'thresholds': {
'steps': [
{'color': 'green', 'value': 0},
{'color': 'yellow', 'value': 512000000}, # 512MB
{'color': 'red', 'value': 1024000000} # 1GB
]
}
}
}
},
{
'id': 'network_io',
'title': 'Network I/O',
'type': 'timeseries',
'grid_pos': {'x': 12, 'y': 20, 'w': 6, 'h': 4},
'targets': [
{
'expr': f'rate(process_network_receive_bytes_total{{service="{service_name}"}}[5m])',
'legendFormat': 'RX Bytes/s'
},
{
'expr': f'rate(process_network_transmit_bytes_total{{service="{service_name}"}}[5m])',
'legendFormat': 'TX Bytes/s'
}
],
'field_config': {
'defaults': {
'color': {'mode': 'palette-classic'},
'unit': 'binBps'
}
}
},
{
'id': 'disk_io',
'title': 'Disk I/O',
'type': 'timeseries',
'grid_pos': {'x': 18, 'y': 20, 'w': 6, 'h': 4},
'targets': [
{
'expr': f'rate(process_disk_read_bytes_total{{service="{service_name}"}}[5m])',
'legendFormat': 'Read Bytes/s'
},
{
'expr': f'rate(process_disk_write_bytes_total{{service="{service_name}"}}[5m])',
'legendFormat': 'Write Bytes/s'
}
],
'field_config': {
'defaults': {
'color': {'mode': 'palette-classic'},
'unit': 'binBps'
}
}
}
]
    def _create_api_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Create API-specific panels: slowest endpoints table and request-size heatmap."""
        service_name = service_def.get('name', 'service')
        return [
            {
                'id': 'endpoint_latency',
                'title': 'Top Slowest Endpoints',
                'type': 'table',
                'grid_pos': {'x': 0, 'y': 24, 'w': 12, 'h': 6},
                'targets': [
                    {
                        # Top 10 handlers by P95 latency (seconds -> ms), as an
                        # instant table snapshot rather than a time series.
                        'expr': f'topk(10, histogram_quantile(0.95, sum by (handler) (rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])))) * 1000',
                        'legendFormat': '{{handler}}',
                        'format': 'table',
                        'instant': True
                    }
                ],
                'transformations': [
                    {
                        # Hide the Time column and give the value a readable header.
                        'id': 'organize',
                        'options': {
                            'excludeByName': {'Time': True},
                            'renameByName': {'Value': 'P95 Latency (ms)'}
                        }
                    }
                ],
                'field_config': {
                    'overrides': [
                        {
                            'matcher': {'id': 'byName', 'options': 'P95 Latency (ms)'},
                            'properties': [
                                {'id': 'color', 'value': {'mode': 'thresholds'}},
                                {'id': 'thresholds', 'value': {
                                    'steps': [
                                        {'color': 'green', 'value': 0},
                                        {'color': 'yellow', 'value': 100},
                                        {'color': 'red', 'value': 500}
                                    ]
                                }}
                            ]
                        }
                    ]
                }
            },
            {
                'id': 'request_size_distribution',
                'title': 'Request Size Distribution',
                'type': 'heatmap',
                'grid_pos': {'x': 12, 'y': 24, 'w': 12, 'h': 6},
                'targets': [
                    {
                        # Histogram buckets (by `le` label) feed the heatmap's Y axis.
                        'expr': f'sum by (le) (rate(http_request_size_bytes_bucket{{service="{service_name}"}}[5m]))',
                        'legendFormat': '{{le}}'
                    }
                ],
                'options': {
                    'calculate': True,
                    'yAxis': {'unit': 'bytes'},
                    'color': {'scheme': 'Spectral'}
                }
            }
        ]
    def _create_database_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Create database-specific panels: connections, query performance, locks."""
        service_name = service_def.get('name', 'service')
        return [
            {
                'id': 'db_connections',
                'title': 'Database Connections',
                'type': 'timeseries',
                'grid_pos': {'x': 0, 'y': 24, 'w': 8, 'h': 6},
                'targets': [
                    {
                        'expr': f'db_connections_active{{service="{service_name}"}}',
                        'legendFormat': 'Active Connections'
                    },
                    {
                        'expr': f'db_connections_idle{{service="{service_name}"}}',
                        'legendFormat': 'Idle Connections'
                    },
                    {
                        # Pool ceiling, plotted alongside usage to show headroom.
                        'expr': f'db_connections_max{{service="{service_name}"}}',
                        'legendFormat': 'Max Connections'
                    }
                ]
            },
            {
                'id': 'query_performance',
                'title': 'Query Performance',
                'type': 'timeseries',
                'grid_pos': {'x': 8, 'y': 24, 'w': 8, 'h': 6},
                'targets': [
                    {
                        'expr': f'rate(db_queries_total{{service="{service_name}"}}[5m])',
                        'legendFormat': 'Queries/sec'
                    },
                    {
                        'expr': f'rate(db_slow_queries_total{{service="{service_name}"}}[5m])',
                        'legendFormat': 'Slow Queries/sec'
                    }
                ]
            },
            {
                'id': 'db_locks',
                'title': 'Database Locks',
                'type': 'stat',
                'grid_pos': {'x': 16, 'y': 24, 'w': 8, 'h': 6},
                'targets': [
                    {
                        'expr': f'db_locks_waiting{{service="{service_name}"}}',
                        'legendFormat': 'Waiting Locks'
                    }
                ],
                'field_config': {
                    'defaults': {
                        'color': {'mode': 'thresholds'},
                        # Any waiting lock is worth attention; 5+ is critical.
                        'thresholds': {
                            'steps': [
                                {'color': 'green', 'value': 0},
                                {'color': 'yellow', 'value': 1},
                                {'color': 'red', 'value': 5}
                            ]
                        }
                    }
                }
            }
        ]
def _create_queue_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Create queue-specific panels."""
service_name = service_def.get('name', 'service')
return [
{
'id': 'queue_depth',
'title': 'Queue Depth',
'type': 'timeseries',
'grid_pos': {'x': 0, 'y': 24, 'w': 12, 'h': 6},
'targets': [
{
'expr': f'queue_depth{{service="{service_name}"}}',
'legendFormat': 'Messages in Queue'
}
]
},
{
'id': 'message_throughput',
'title': 'Message Throughput',
'type': 'timeseries',
'grid_pos': {'x': 12, 'y': 24, 'w': 12, 'h': 6},
'targets': [
{
'expr': f'rate(messages_published_total{{service="{service_name}"}}[5m])',
'legendFormat': 'Published/sec'
},
{
'expr': f'rate(messages_consumed_total{{service="{service_name}"}}[5m])',
'legendFormat': 'Consumed/sec'
}
]
}
]
    def _create_business_metrics_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Create a full-width business KPI stat row (transactions, revenue, signups).

        NOTE(review): business_transactions_total / business_transaction_value /
        user_registrations_total are application-defined metrics — confirm the
        instrumented service actually exposes them under these names.
        """
        service_name = service_def.get('name', 'service')
        return [
            {
                'id': 'business_kpis',
                'title': 'Business KPIs',
                'type': 'stat',
                'grid_pos': {'x': 0, 'y': 30, 'w': 24, 'h': 4},
                'targets': [
                    {
                        'expr': f'rate(business_transactions_total{{service="{service_name}"}}[1h])',
                        'legendFormat': 'Transactions/hour'
                    },
                    {
                        # Average transaction value multiplied by transaction rate.
                        'expr': f'avg(business_transaction_value{{service="{service_name}"}}) * rate(business_transactions_total{{service="{service_name}"}}[1h])',
                        'legendFormat': 'Revenue/hour'
                    },
                    {
                        'expr': f'rate(user_registrations_total{{service="{service_name}"}}[1h])',
                        'legendFormat': 'New Users/hour'
                    }
                ],
                'field_config': {
                    'defaults': {
                        'color': {'mode': 'palette-classic'},
                        'custom': {
                            'displayMode': 'basic'
                        }
                    }
                },
                'options': {
                    'orientation': 'horizontal',
                    'textMode': 'value_and_name'
                }
            }
        ]
def _create_capacity_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Create capacity planning panels."""
service_name = service_def.get('name', 'service')
return [
{
'id': 'capacity_trends',
'title': 'Capacity Trends (7d)',
'type': 'timeseries',
'grid_pos': {'x': 0, 'y': 34, 'w': 24, 'h': 6},
'targets': [
{
'expr': f'predict_linear(avg_over_time(rate(http_requests_total{{service="{service_name}"}}[5m])[7d:1h]), 7*24*3600)',
'legendFormat': 'Predicted Traffic (7d)'
},
{
'expr': f'predict_linear(avg_over_time(process_resident_memory_bytes{{service="{service_name}"}}[7d:1h]), 7*24*3600)',
'legendFormat': 'Predicted Memory Usage (7d)'
}
],
'field_config': {
'defaults': {
'color': {'mode': 'palette-classic'},
'custom': {
'drawStyle': 'line',
'lineStyle': {'dash': [10, 10]}
}
}
}
}
]
def _generate_template_variables(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Generate template variables for dynamic dashboard filtering."""
service_name = service_def.get('name', 'service')
return [
{
'name': 'environment',
'type': 'query',
'query': 'label_values(environment)',
'current': {'text': 'production', 'value': 'production'},
'includeAll': False,
'multi': False,
'refresh': 'on_dashboard_load'
},
{
'name': 'instance',
'type': 'query',
'query': f'label_values(up{{service="{service_name}"}}, instance)',
'current': {'text': 'All', 'value': '$__all'},
'includeAll': True,
'multi': True,
'refresh': 'on_time_range_change'
},
{
'name': 'handler',
'type': 'query',
'query': f'label_values(http_requests_total{{service="{service_name}"}}, handler)',
'current': {'text': 'All', 'value': '$__all'},
'includeAll': True,
'multi': True,
'refresh': 'on_time_range_change'
}
]
def _generate_alerts_integration(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
"""Generate alerts integration configuration."""
service_name = service_def.get('name', 'service')
return {
'alert_annotations': True,
'alert_rules_query': f'ALERTS{{service="{service_name}"}}',
'alert_panels': [
{
'title': 'Active Alerts',
'type': 'table',
'query': f'ALERTS{{service="{service_name}",alertstate="firing"}}',
'columns': ['alertname', 'severity', 'instance', 'description']
}
]
}
def _generate_drill_down_paths(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
"""Generate drill-down navigation paths."""
service_name = service_def.get('name', 'service')
return {
'service_overview': {
'from': 'service_status',
'to': 'detailed_health_dashboard',
'url': f'/d/service-health/{service_name}-health',
'params': ['var-service', 'var-environment']
},
'error_investigation': {
'from': 'errors',
'to': 'error_details_dashboard',
'url': f'/d/errors/{service_name}-errors',
'params': ['var-service', 'var-time_range']
},
'latency_analysis': {
'from': 'latency',
'to': 'trace_analysis_dashboard',
'url': f'/d/traces/{service_name}-traces',
'params': ['var-service', 'var-handler']
},
'capacity_planning': {
'from': 'saturation',
'to': 'capacity_dashboard',
'url': f'/d/capacity/{service_name}-capacity',
'params': ['var-service', 'var-time_range']
}
}
def generate_grafana_json(self, dashboard_spec: Dict[str, Any]) -> Dict[str, Any]:
"""Convert dashboard specification to Grafana JSON format."""
metadata = dashboard_spec['metadata']
config = dashboard_spec['configuration']
grafana_json = {
'dashboard': {
'id': None,
'title': metadata['title'],
'tags': [metadata['service']['type'], metadata['target_role'], 'generated'],
'timezone': config['timezone'],
'refresh': config['refresh_interval'],
'time': {
'from': 'now-1h',
'to': 'now'
},
'templating': {
'list': dashboard_spec['variables']
},
'panels': self._convert_panels_to_grafana_format(dashboard_spec['panels']),
'version': 1,
'schemaVersion': 30
},
'overwrite': True
}
return grafana_json
def _convert_panels_to_grafana_format(self, panels: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Convert panel specifications to Grafana format."""
grafana_panels = []
for panel in panels:
grafana_panel = {
'id': hash(panel['id']) % 1000, # Generate numeric ID
'title': panel['title'],
'type': panel['type'],
'gridPos': panel['grid_pos'],
'targets': panel['targets'],
'fieldConfig': panel.get('field_config', {}),
'options': panel.get('options', {}),
'transformations': panel.get('transformations', [])
}
grafana_panels.append(grafana_panel)
return grafana_panels
def generate_documentation(self, dashboard_spec: Dict[str, Any]) -> str:
"""Generate documentation for the dashboard."""
metadata = dashboard_spec['metadata']
service = metadata['service']
doc_content = f"""# {metadata['title']} Documentation
## Overview
This dashboard provides comprehensive monitoring for {service['name']}, a {service['type']} service with {service['criticality']} criticality.
**Target Audience:** {metadata['target_role'].upper()} teams
**Generated:** {metadata['generated_at']}
## Dashboard Sections
### Service Overview
- **Service Status**: Real-time availability status
- **SLO Achievement**: 30-day SLO compliance metrics
- **Error Budget**: Remaining error budget visualization
### Golden Signals Monitoring
- **Latency**: P50, P95, P99 response times
- **Traffic**: Request rate by status code
- **Errors**: Error rates for 4xx and 5xx responses
- **Saturation**: CPU and memory utilization
### Resource Utilization
- **CPU Usage**: Process CPU consumption
- **Memory Usage**: Memory utilization tracking
- **Network I/O**: Network throughput metrics
- **Disk I/O**: Disk read/write operations
## Key Metrics
### SLIs Tracked
"""
# Add service-type specific metrics
service_type = service.get('type', 'api')
if service_type in self.SERVICE_METRICS:
metrics = self.SERVICE_METRICS[service_type]['key_metrics']
for metric in metrics:
doc_content += f"- `{metric}`: Core service metric\n"
doc_content += f"""
## Alert Integration
- Active alerts are displayed in context with relevant panels
- Alert annotations show on time series charts
- Click-through to alert management system available
## Drill-Down Paths
"""
drill_downs = dashboard_spec.get('drill_down_paths', {})
for path_name, path_config in drill_downs.items():
doc_content += f"- **{path_name}**: From {path_config['from']}{path_config['to']}\n"
doc_content += f"""
## Usage Guidelines
### Time Ranges
Use appropriate time ranges for different investigation types:
- **Real-time monitoring**: 15m - 1h
- **Recent incident investigation**: 1h - 6h
- **Trend analysis**: 1d - 7d
- **Capacity planning**: 7d - 30d
### Variables
- **environment**: Filter by deployment environment
- **instance**: Focus on specific service instances
- **handler**: Filter by API endpoint or handler
### Performance Optimization
- Use longer time ranges for capacity planning
- Refresh intervals are optimized per role:
- SRE: 30s for operational awareness
- Developer: 1m for troubleshooting
- Executive: 5m for high-level monitoring
## Maintenance
- Dashboard panels automatically adapt to service changes
- Template variables refresh based on actual metric labels
- Review and update business metrics quarterly
"""
return doc_content
def export_specification(self, dashboard_spec: Dict[str, Any], output_file: str,
format_type: str = 'json'):
"""Export dashboard specification."""
if format_type.lower() == 'json':
with open(output_file, 'w') as f:
json.dump(dashboard_spec, f, indent=2)
elif format_type.lower() == 'grafana':
grafana_json = self.generate_grafana_json(dashboard_spec)
with open(output_file, 'w') as f:
json.dump(grafana_json, f, indent=2)
else:
raise ValueError(f"Unsupported format: {format_type}")
    def print_summary(self, dashboard_spec: Dict[str, Any]):
        """Print a human-readable summary of a dashboard specification to stdout.

        Covers metadata, time/refresh configuration, a panel count broken
        down by panel type, template variables, and drill-down path count.
        Purely informational; returns None.
        """
        metadata = dashboard_spec['metadata']
        service = metadata['service']
        config = dashboard_spec['configuration']
        panels = dashboard_spec['panels']
        print(f"\n{'='*60}")
        print(f"DASHBOARD SPECIFICATION SUMMARY")
        print(f"{'='*60}")
        print(f"\nDashboard Details:")
        print(f" Title: {metadata['title']}")
        print(f" Target Role: {metadata['target_role'].upper()}")
        print(f" Service: {service['name']} ({service['type']})")
        print(f" Criticality: {service['criticality']}")
        print(f" Generated: {metadata['generated_at']}")
        print(f"\nConfiguration:")
        print(f" Default Time Range: {config['default_time_range']}")
        print(f" Refresh Interval: {config['refresh_interval']}")
        print(f" Available Time Ranges: {', '.join(config['time_ranges'])}")
        print(f"\nPanels ({len(panels)}):")
        # Count panels per visualization type for the breakdown below.
        panel_types = {}
        for panel in panels:
            panel_type = panel['type']
            panel_types[panel_type] = panel_types.get(panel_type, 0) + 1
        for panel_type, count in panel_types.items():
            print(f" {panel_type}: {count}")
        variables = dashboard_spec.get('variables', [])
        print(f"\nTemplate Variables ({len(variables)}):")
        for var in variables:
            print(f" {var['name']} ({var['type']})")
        drill_downs = dashboard_spec.get('drill_down_paths', {})
        print(f"\nDrill-down Paths: {len(drill_downs)}")
        print(f"\nKey Features:")
        print(f" • Golden Signals monitoring")
        print(f" • Resource utilization tracking")
        print(f" • Alert integration")
        print(f" • Role-optimized layout")
        print(f" • Service-type specific panels")
        print(f"\n{'='*60}\n")
def main():
    """Main function for CLI usage.

    Parses arguments, builds or loads a service definition, generates the
    dashboard spec, and writes the spec (and optional documentation) to
    disk unless --summary-only is given. Exits with status 1 on any error.
    """
    parser = argparse.ArgumentParser(
        description='Generate comprehensive dashboard specifications',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate from service definition file
python dashboard_generator.py --input service.json --output dashboard.json
# Generate from command line parameters
python dashboard_generator.py --service-type api --name "Payment Service" --output payment_dashboard.json
# Generate Grafana-compatible JSON
python dashboard_generator.py --input service.json --output dashboard.json --format grafana
# Generate with specific role focus
python dashboard_generator.py --service-type web --name "Frontend" --role developer --output frontend_dev.json
"""
    )
    parser.add_argument('--input', '-i',
                        help='Input service definition JSON file')
    parser.add_argument('--output', '-o',
                        help='Output dashboard specification file')
    # NOTE: 'batch' and 'ml' are accepted here but have no entry in
    # SERVICE_METRICS or type-specific panel builders; they receive only
    # the common panel sections.
    parser.add_argument('--service-type',
                        choices=['api', 'web', 'database', 'queue', 'batch', 'ml'],
                        help='Service type')
    parser.add_argument('--name',
                        help='Service name')
    parser.add_argument('--criticality',
                        choices=['critical', 'high', 'medium', 'low'],
                        default='medium',
                        help='Service criticality level')
    parser.add_argument('--role',
                        choices=['sre', 'developer', 'executive', 'ops'],
                        default='sre',
                        help='Target role for dashboard optimization')
    parser.add_argument('--format',
                        choices=['json', 'grafana'],
                        default='json',
                        help='Output format (json specification or grafana compatible)')
    parser.add_argument('--doc-output',
                        help='Generate documentation file')
    parser.add_argument('--summary-only', action='store_true',
                        help='Only display summary, do not save files')
    args = parser.parse_args()
    # Either an input file or enough parameters to synthesize a definition.
    if not args.input and not (args.service_type and args.name):
        parser.error("Must provide either --input file or --service-type and --name")
    generator = DashboardGenerator()
    try:
        # Load or create service definition
        if args.input:
            service_def = generator.load_service_definition(args.input)
        else:
            service_def = generator.create_service_definition(
                args.service_type, args.name, args.criticality
            )
        # Generate dashboard specification
        dashboard_spec = generator.generate_dashboard_specification(service_def, args.role)
        # Output results
        if not args.summary_only:
            # Default output name is derived from the service name.
            output_file = args.output or f"{service_def['name'].replace(' ', '_').lower()}_dashboard.json"
            generator.export_specification(dashboard_spec, output_file, args.format)
            print(f"Dashboard specification saved to: {output_file}")
        # Generate documentation if requested
        if args.doc_output:
            documentation = generator.generate_documentation(dashboard_spec)
            with open(args.doc_output, 'w') as f:
                f.write(documentation)
            print(f"Documentation saved to: {args.doc_output}")
        # Always show summary
        generator.print_summary(dashboard_spec)
    except Exception as e:
        # Top-level CLI boundary: report and exit non-zero.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()