1219 lines
48 KiB
Python
1219 lines
48 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Dashboard Generator - Generate comprehensive dashboard specifications
|
|
|
|
This script generates dashboard specifications based on service/system descriptions:
|
|
- Panel layout optimized for different screen sizes and roles
|
|
- Metric queries (Prometheus-style) for comprehensive monitoring
|
|
- Visualization types appropriate for different metric types
|
|
- Drill-down paths for effective troubleshooting workflows
|
|
- Golden signals coverage (latency, traffic, errors, saturation)
|
|
- RED/USE method implementation
|
|
- Business metrics integration
|
|
|
|
Usage:
|
|
python dashboard_generator.py --input service_definition.json --output dashboard_spec.json
|
|
python dashboard_generator.py --service-type api --name "Payment Service" --output payment_dashboard.json
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
import sys
|
|
import math
|
|
from typing import Dict, List, Any, Tuple
|
|
from datetime import datetime, timedelta
|
|
|
|
|
|
class DashboardGenerator:
|
|
"""Generate comprehensive dashboard specifications."""
|
|
|
|
# Dashboard layout templates by role
|
|
ROLE_LAYOUTS = {
|
|
'sre': {
|
|
'primary_focus': ['availability', 'latency', 'errors', 'resource_utilization'],
|
|
'secondary_focus': ['throughput', 'capacity', 'dependencies'],
|
|
'time_ranges': ['1h', '6h', '1d', '7d'],
|
|
'default_refresh': '30s'
|
|
},
|
|
'developer': {
|
|
'primary_focus': ['latency', 'errors', 'throughput', 'business_metrics'],
|
|
'secondary_focus': ['resource_utilization', 'dependencies'],
|
|
'time_ranges': ['15m', '1h', '6h', '1d'],
|
|
'default_refresh': '1m'
|
|
},
|
|
'executive': {
|
|
'primary_focus': ['availability', 'business_metrics', 'user_experience'],
|
|
'secondary_focus': ['cost', 'capacity_trends'],
|
|
'time_ranges': ['1d', '7d', '30d'],
|
|
'default_refresh': '5m'
|
|
},
|
|
'ops': {
|
|
'primary_focus': ['resource_utilization', 'capacity', 'alerts', 'deployments'],
|
|
'secondary_focus': ['throughput', 'latency'],
|
|
'time_ranges': ['5m', '30m', '2h', '1d'],
|
|
'default_refresh': '15s'
|
|
}
|
|
}
|
|
|
|
# Service type specific metric configurations
|
|
SERVICE_METRICS = {
|
|
'api': {
|
|
'golden_signals': ['latency', 'traffic', 'errors', 'saturation'],
|
|
'key_metrics': [
|
|
'http_requests_total',
|
|
'http_request_duration_seconds',
|
|
'http_request_size_bytes',
|
|
'http_response_size_bytes'
|
|
],
|
|
'resource_metrics': ['cpu_usage', 'memory_usage', 'goroutines']
|
|
},
|
|
'web': {
|
|
'golden_signals': ['latency', 'traffic', 'errors', 'saturation'],
|
|
'key_metrics': [
|
|
'http_requests_total',
|
|
'http_request_duration_seconds',
|
|
'page_load_time',
|
|
'user_sessions'
|
|
],
|
|
'resource_metrics': ['cpu_usage', 'memory_usage', 'connections']
|
|
},
|
|
'database': {
|
|
'golden_signals': ['latency', 'traffic', 'errors', 'saturation'],
|
|
'key_metrics': [
|
|
'db_connections_active',
|
|
'db_query_duration_seconds',
|
|
'db_queries_total',
|
|
'db_slow_queries_total'
|
|
],
|
|
'resource_metrics': ['cpu_usage', 'memory_usage', 'disk_io', 'connections']
|
|
},
|
|
'queue': {
|
|
'golden_signals': ['latency', 'traffic', 'errors', 'saturation'],
|
|
'key_metrics': [
|
|
'queue_depth',
|
|
'message_processing_duration',
|
|
'messages_published_total',
|
|
'messages_consumed_total'
|
|
],
|
|
'resource_metrics': ['cpu_usage', 'memory_usage', 'disk_usage']
|
|
}
|
|
}
|
|
|
|
# Visualization type recommendations
|
|
VISUALIZATION_TYPES = {
|
|
'latency': 'line_chart',
|
|
'throughput': 'line_chart',
|
|
'error_rate': 'line_chart',
|
|
'success_rate': 'stat',
|
|
'resource_utilization': 'gauge',
|
|
'queue_depth': 'bar_chart',
|
|
'status': 'stat',
|
|
'distribution': 'heatmap',
|
|
'alerts': 'table',
|
|
'logs': 'logs_panel'
|
|
}
|
|
|
|
def __init__(self):
|
|
"""Initialize the Dashboard Generator."""
|
|
self.service_config = {}
|
|
self.dashboard_spec = {}
|
|
|
|
def load_service_definition(self, file_path: str) -> Dict[str, Any]:
|
|
"""Load service definition from JSON file."""
|
|
try:
|
|
with open(file_path, 'r') as f:
|
|
return json.load(f)
|
|
except FileNotFoundError:
|
|
raise ValueError(f"Service definition file not found: {file_path}")
|
|
except json.JSONDecodeError as e:
|
|
raise ValueError(f"Invalid JSON in service definition: {e}")
|
|
|
|
def create_service_definition(self, service_type: str, name: str,
|
|
criticality: str = 'medium') -> Dict[str, Any]:
|
|
"""Create a service definition from parameters."""
|
|
return {
|
|
'name': name,
|
|
'type': service_type,
|
|
'criticality': criticality,
|
|
'description': f'{name} - A {criticality} criticality {service_type} service',
|
|
'team': 'platform',
|
|
'environment': 'production',
|
|
'dependencies': [],
|
|
'tags': []
|
|
}
|
|
|
|
def generate_dashboard_specification(self, service_def: Dict[str, Any],
|
|
target_role: str = 'sre') -> Dict[str, Any]:
|
|
"""Generate comprehensive dashboard specification."""
|
|
service_name = service_def.get('name', 'Service')
|
|
service_type = service_def.get('type', 'api')
|
|
|
|
# Get role-specific configuration
|
|
role_config = self.ROLE_LAYOUTS.get(target_role, self.ROLE_LAYOUTS['sre'])
|
|
|
|
dashboard_spec = {
|
|
'metadata': {
|
|
'title': f"{service_name} - {target_role.upper()} Dashboard",
|
|
'service': service_def,
|
|
'target_role': target_role,
|
|
'generated_at': datetime.utcnow().isoformat() + 'Z',
|
|
'version': '1.0'
|
|
},
|
|
'configuration': {
|
|
'time_ranges': role_config['time_ranges'],
|
|
'default_time_range': role_config['time_ranges'][1], # Second option as default
|
|
'refresh_interval': role_config['default_refresh'],
|
|
'timezone': 'UTC',
|
|
'theme': 'dark'
|
|
},
|
|
'layout': self._generate_dashboard_layout(service_def, role_config),
|
|
'panels': self._generate_panels(service_def, role_config),
|
|
'variables': self._generate_template_variables(service_def),
|
|
'alerts_integration': self._generate_alerts_integration(service_def),
|
|
'drill_down_paths': self._generate_drill_down_paths(service_def)
|
|
}
|
|
|
|
return dashboard_spec
|
|
|
|
def _generate_dashboard_layout(self, service_def: Dict[str, Any],
|
|
role_config: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Generate dashboard layout configuration."""
|
|
return {
|
|
'grid_settings': {
|
|
'width': 24, # Grafana-style 24-column grid
|
|
'height_unit': 'px',
|
|
'cell_height': 30
|
|
},
|
|
'sections': [
|
|
{
|
|
'title': 'Service Overview',
|
|
'collapsed': False,
|
|
'y_position': 0,
|
|
'panels': ['service_status', 'slo_summary', 'error_budget']
|
|
},
|
|
{
|
|
'title': 'Golden Signals',
|
|
'collapsed': False,
|
|
'y_position': 8,
|
|
'panels': ['latency', 'traffic', 'errors', 'saturation']
|
|
},
|
|
{
|
|
'title': 'Resource Utilization',
|
|
'collapsed': False,
|
|
'y_position': 16,
|
|
'panels': ['cpu_usage', 'memory_usage', 'network_io', 'disk_io']
|
|
},
|
|
{
|
|
'title': 'Dependencies & Downstream',
|
|
'collapsed': True,
|
|
'y_position': 24,
|
|
'panels': ['dependency_status', 'downstream_latency', 'circuit_breakers']
|
|
}
|
|
]
|
|
}
|
|
|
|
def _generate_panels(self, service_def: Dict[str, Any],
|
|
role_config: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Generate dashboard panels based on service and role."""
|
|
service_name = service_def.get('name', 'service')
|
|
service_type = service_def.get('type', 'api')
|
|
panels = []
|
|
|
|
# Service Overview Panels
|
|
panels.extend(self._create_overview_panels(service_def))
|
|
|
|
# Golden Signals Panels
|
|
panels.extend(self._create_golden_signals_panels(service_def))
|
|
|
|
# Resource Utilization Panels
|
|
panels.extend(self._create_resource_panels(service_def))
|
|
|
|
# Service-specific panels
|
|
if service_type == 'api':
|
|
panels.extend(self._create_api_specific_panels(service_def))
|
|
elif service_type == 'database':
|
|
panels.extend(self._create_database_specific_panels(service_def))
|
|
elif service_type == 'queue':
|
|
panels.extend(self._create_queue_specific_panels(service_def))
|
|
|
|
# Role-specific additional panels
|
|
if 'business_metrics' in role_config['primary_focus']:
|
|
panels.extend(self._create_business_metrics_panels(service_def))
|
|
|
|
if 'capacity' in role_config['primary_focus']:
|
|
panels.extend(self._create_capacity_panels(service_def))
|
|
|
|
return panels
|
|
|
|
def _create_overview_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Create service overview panels."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return [
|
|
{
|
|
'id': 'service_status',
|
|
'title': 'Service Status',
|
|
'type': 'stat',
|
|
'grid_pos': {'x': 0, 'y': 0, 'w': 6, 'h': 4},
|
|
'targets': [
|
|
{
|
|
'expr': f'up{{service="{service_name}"}}',
|
|
'legendFormat': 'Status'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'overrides': [
|
|
{
|
|
'matcher': {'id': 'byName', 'options': 'Status'},
|
|
'properties': [
|
|
{'id': 'color', 'value': {'mode': 'thresholds'}},
|
|
{'id': 'thresholds', 'value': {
|
|
'steps': [
|
|
{'color': 'red', 'value': 0},
|
|
{'color': 'green', 'value': 1}
|
|
]
|
|
}},
|
|
{'id': 'mappings', 'value': [
|
|
{'options': {'0': {'text': 'DOWN'}}, 'type': 'value'},
|
|
{'options': {'1': {'text': 'UP'}}, 'type': 'value'}
|
|
]}
|
|
]
|
|
}
|
|
]
|
|
},
|
|
'options': {
|
|
'orientation': 'horizontal',
|
|
'textMode': 'value_and_name'
|
|
}
|
|
},
|
|
{
|
|
'id': 'slo_summary',
|
|
'title': 'SLO Achievement (30d)',
|
|
'type': 'stat',
|
|
'grid_pos': {'x': 6, 'y': 0, 'w': 9, 'h': 4},
|
|
'targets': [
|
|
{
|
|
'expr': f'(1 - (increase(http_requests_total{{service="{service_name}",code=~"5.."}}[30d]) / increase(http_requests_total{{service="{service_name}"}}[30d]))) * 100',
|
|
'legendFormat': 'Availability'
|
|
},
|
|
{
|
|
'expr': f'histogram_quantile(0.95, increase(http_request_duration_seconds_bucket{{service="{service_name}"}}[30d])) * 1000',
|
|
'legendFormat': 'P95 Latency (ms)'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'thresholds'},
|
|
'thresholds': {
|
|
'steps': [
|
|
{'color': 'red', 'value': 0},
|
|
{'color': 'yellow', 'value': 99.0},
|
|
{'color': 'green', 'value': 99.9}
|
|
]
|
|
}
|
|
}
|
|
},
|
|
'options': {
|
|
'orientation': 'horizontal',
|
|
'textMode': 'value_and_name'
|
|
}
|
|
},
|
|
{
|
|
'id': 'error_budget',
|
|
'title': 'Error Budget Remaining',
|
|
'type': 'gauge',
|
|
'grid_pos': {'x': 15, 'y': 0, 'w': 9, 'h': 4},
|
|
'targets': [
|
|
{
|
|
'expr': f'(1 - (increase(http_requests_total{{service="{service_name}",code=~"5.."}}[30d]) / increase(http_requests_total{{service="{service_name}"}}[30d])) - 0.999) / 0.001 * 100',
|
|
'legendFormat': 'Error Budget %'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'thresholds'},
|
|
'min': 0,
|
|
'max': 100,
|
|
'thresholds': {
|
|
'steps': [
|
|
{'color': 'red', 'value': 0},
|
|
{'color': 'yellow', 'value': 25},
|
|
{'color': 'green', 'value': 50}
|
|
]
|
|
},
|
|
'unit': 'percent'
|
|
}
|
|
},
|
|
'options': {
|
|
'showThresholdLabels': True,
|
|
'showThresholdMarkers': True
|
|
}
|
|
}
|
|
]
|
|
|
|
def _create_golden_signals_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Create golden signals monitoring panels."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return [
|
|
{
|
|
'id': 'latency',
|
|
'title': 'Request Latency',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 0, 'y': 8, 'w': 12, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])) * 1000',
|
|
'legendFormat': 'P50 Latency'
|
|
},
|
|
{
|
|
'expr': f'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])) * 1000',
|
|
'legendFormat': 'P95 Latency'
|
|
},
|
|
{
|
|
'expr': f'histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])) * 1000',
|
|
'legendFormat': 'P99 Latency'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'palette-classic'},
|
|
'unit': 'ms',
|
|
'custom': {
|
|
'drawStyle': 'line',
|
|
'lineInterpolation': 'linear',
|
|
'lineWidth': 1,
|
|
'fillOpacity': 10
|
|
}
|
|
}
|
|
},
|
|
'options': {
|
|
'tooltip': {'mode': 'multi', 'sort': 'desc'},
|
|
'legend': {'displayMode': 'table', 'placement': 'bottom'}
|
|
}
|
|
},
|
|
{
|
|
'id': 'traffic',
|
|
'title': 'Request Rate',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 12, 'y': 8, 'w': 12, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))',
|
|
'legendFormat': 'Total RPS'
|
|
},
|
|
{
|
|
'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"2.."}}[5m]))',
|
|
'legendFormat': '2xx RPS'
|
|
},
|
|
{
|
|
'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"4.."}}[5m]))',
|
|
'legendFormat': '4xx RPS'
|
|
},
|
|
{
|
|
'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"5.."}}[5m]))',
|
|
'legendFormat': '5xx RPS'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'palette-classic'},
|
|
'unit': 'reqps',
|
|
'custom': {
|
|
'drawStyle': 'line',
|
|
'lineInterpolation': 'linear',
|
|
'lineWidth': 1,
|
|
'fillOpacity': 0
|
|
}
|
|
}
|
|
},
|
|
'options': {
|
|
'tooltip': {'mode': 'multi', 'sort': 'desc'},
|
|
'legend': {'displayMode': 'table', 'placement': 'bottom'}
|
|
}
|
|
},
|
|
{
|
|
'id': 'errors',
|
|
'title': 'Error Rate',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 0, 'y': 14, 'w': 12, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"5.."}}[5m])) / sum(rate(http_requests_total{{service="{service_name}"}}[5m])) * 100',
|
|
'legendFormat': '5xx Error Rate'
|
|
},
|
|
{
|
|
'expr': f'sum(rate(http_requests_total{{service="{service_name}",code=~"4.."}}[5m])) / sum(rate(http_requests_total{{service="{service_name}"}}[5m])) * 100',
|
|
'legendFormat': '4xx Error Rate'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'palette-classic'},
|
|
'unit': 'percent',
|
|
'custom': {
|
|
'drawStyle': 'line',
|
|
'lineInterpolation': 'linear',
|
|
'lineWidth': 2,
|
|
'fillOpacity': 20
|
|
}
|
|
},
|
|
'overrides': [
|
|
{
|
|
'matcher': {'id': 'byName', 'options': '5xx Error Rate'},
|
|
'properties': [{'id': 'color', 'value': {'fixedColor': 'red'}}]
|
|
}
|
|
]
|
|
},
|
|
'options': {
|
|
'tooltip': {'mode': 'multi', 'sort': 'desc'},
|
|
'legend': {'displayMode': 'table', 'placement': 'bottom'}
|
|
}
|
|
},
|
|
{
|
|
'id': 'saturation',
|
|
'title': 'Saturation Metrics',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 12, 'y': 14, 'w': 12, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'rate(process_cpu_seconds_total{{service="{service_name}"}}[5m]) * 100',
|
|
'legendFormat': 'CPU Usage %'
|
|
},
|
|
{
|
|
'expr': f'process_resident_memory_bytes{{service="{service_name}"}} / process_virtual_memory_max_bytes{{service="{service_name}"}} * 100',
|
|
'legendFormat': 'Memory Usage %'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'palette-classic'},
|
|
'unit': 'percent',
|
|
'max': 100,
|
|
'custom': {
|
|
'drawStyle': 'line',
|
|
'lineInterpolation': 'linear',
|
|
'lineWidth': 1,
|
|
'fillOpacity': 10
|
|
}
|
|
}
|
|
},
|
|
'options': {
|
|
'tooltip': {'mode': 'multi', 'sort': 'desc'},
|
|
'legend': {'displayMode': 'table', 'placement': 'bottom'}
|
|
}
|
|
}
|
|
]
|
|
|
|
def _create_resource_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Create resource utilization panels."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return [
|
|
{
|
|
'id': 'cpu_usage',
|
|
'title': 'CPU Usage',
|
|
'type': 'gauge',
|
|
'grid_pos': {'x': 0, 'y': 20, 'w': 6, 'h': 4},
|
|
'targets': [
|
|
{
|
|
'expr': f'rate(process_cpu_seconds_total{{service="{service_name}"}}[5m]) * 100',
|
|
'legendFormat': 'CPU %'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'thresholds'},
|
|
'unit': 'percent',
|
|
'min': 0,
|
|
'max': 100,
|
|
'thresholds': {
|
|
'steps': [
|
|
{'color': 'green', 'value': 0},
|
|
{'color': 'yellow', 'value': 70},
|
|
{'color': 'red', 'value': 90}
|
|
]
|
|
}
|
|
}
|
|
},
|
|
'options': {
|
|
'showThresholdLabels': True,
|
|
'showThresholdMarkers': True
|
|
}
|
|
},
|
|
{
|
|
'id': 'memory_usage',
|
|
'title': 'Memory Usage',
|
|
'type': 'gauge',
|
|
'grid_pos': {'x': 6, 'y': 20, 'w': 6, 'h': 4},
|
|
'targets': [
|
|
{
|
|
'expr': f'process_resident_memory_bytes{{service="{service_name}"}} / 1024 / 1024',
|
|
'legendFormat': 'Memory MB'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'thresholds'},
|
|
'unit': 'decbytes',
|
|
'thresholds': {
|
|
'steps': [
|
|
{'color': 'green', 'value': 0},
|
|
{'color': 'yellow', 'value': 512000000}, # 512MB
|
|
{'color': 'red', 'value': 1024000000} # 1GB
|
|
]
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{
|
|
'id': 'network_io',
|
|
'title': 'Network I/O',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 12, 'y': 20, 'w': 6, 'h': 4},
|
|
'targets': [
|
|
{
|
|
'expr': f'rate(process_network_receive_bytes_total{{service="{service_name}"}}[5m])',
|
|
'legendFormat': 'RX Bytes/s'
|
|
},
|
|
{
|
|
'expr': f'rate(process_network_transmit_bytes_total{{service="{service_name}"}}[5m])',
|
|
'legendFormat': 'TX Bytes/s'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'palette-classic'},
|
|
'unit': 'binBps'
|
|
}
|
|
}
|
|
},
|
|
{
|
|
'id': 'disk_io',
|
|
'title': 'Disk I/O',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 18, 'y': 20, 'w': 6, 'h': 4},
|
|
'targets': [
|
|
{
|
|
'expr': f'rate(process_disk_read_bytes_total{{service="{service_name}"}}[5m])',
|
|
'legendFormat': 'Read Bytes/s'
|
|
},
|
|
{
|
|
'expr': f'rate(process_disk_write_bytes_total{{service="{service_name}"}}[5m])',
|
|
'legendFormat': 'Write Bytes/s'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'palette-classic'},
|
|
'unit': 'binBps'
|
|
}
|
|
}
|
|
}
|
|
]
|
|
|
|
def _create_api_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Create API-specific panels."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return [
|
|
{
|
|
'id': 'endpoint_latency',
|
|
'title': 'Top Slowest Endpoints',
|
|
'type': 'table',
|
|
'grid_pos': {'x': 0, 'y': 24, 'w': 12, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'topk(10, histogram_quantile(0.95, sum by (handler) (rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])))) * 1000',
|
|
'legendFormat': '{{handler}}',
|
|
'format': 'table',
|
|
'instant': True
|
|
}
|
|
],
|
|
'transformations': [
|
|
{
|
|
'id': 'organize',
|
|
'options': {
|
|
'excludeByName': {'Time': True},
|
|
'renameByName': {'Value': 'P95 Latency (ms)'}
|
|
}
|
|
}
|
|
],
|
|
'field_config': {
|
|
'overrides': [
|
|
{
|
|
'matcher': {'id': 'byName', 'options': 'P95 Latency (ms)'},
|
|
'properties': [
|
|
{'id': 'color', 'value': {'mode': 'thresholds'}},
|
|
{'id': 'thresholds', 'value': {
|
|
'steps': [
|
|
{'color': 'green', 'value': 0},
|
|
{'color': 'yellow', 'value': 100},
|
|
{'color': 'red', 'value': 500}
|
|
]
|
|
}}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
},
|
|
{
|
|
'id': 'request_size_distribution',
|
|
'title': 'Request Size Distribution',
|
|
'type': 'heatmap',
|
|
'grid_pos': {'x': 12, 'y': 24, 'w': 12, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'sum by (le) (rate(http_request_size_bytes_bucket{{service="{service_name}"}}[5m]))',
|
|
'legendFormat': '{{le}}'
|
|
}
|
|
],
|
|
'options': {
|
|
'calculate': True,
|
|
'yAxis': {'unit': 'bytes'},
|
|
'color': {'scheme': 'Spectral'}
|
|
}
|
|
}
|
|
]
|
|
|
|
def _create_database_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Create database-specific panels."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return [
|
|
{
|
|
'id': 'db_connections',
|
|
'title': 'Database Connections',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 0, 'y': 24, 'w': 8, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'db_connections_active{{service="{service_name}"}}',
|
|
'legendFormat': 'Active Connections'
|
|
},
|
|
{
|
|
'expr': f'db_connections_idle{{service="{service_name}"}}',
|
|
'legendFormat': 'Idle Connections'
|
|
},
|
|
{
|
|
'expr': f'db_connections_max{{service="{service_name}"}}',
|
|
'legendFormat': 'Max Connections'
|
|
}
|
|
]
|
|
},
|
|
{
|
|
'id': 'query_performance',
|
|
'title': 'Query Performance',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 8, 'y': 24, 'w': 8, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'rate(db_queries_total{{service="{service_name}"}}[5m])',
|
|
'legendFormat': 'Queries/sec'
|
|
},
|
|
{
|
|
'expr': f'rate(db_slow_queries_total{{service="{service_name}"}}[5m])',
|
|
'legendFormat': 'Slow Queries/sec'
|
|
}
|
|
]
|
|
},
|
|
{
|
|
'id': 'db_locks',
|
|
'title': 'Database Locks',
|
|
'type': 'stat',
|
|
'grid_pos': {'x': 16, 'y': 24, 'w': 8, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'db_locks_waiting{{service="{service_name}"}}',
|
|
'legendFormat': 'Waiting Locks'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'thresholds'},
|
|
'thresholds': {
|
|
'steps': [
|
|
{'color': 'green', 'value': 0},
|
|
{'color': 'yellow', 'value': 1},
|
|
{'color': 'red', 'value': 5}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
]
|
|
|
|
def _create_queue_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Create queue-specific panels."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return [
|
|
{
|
|
'id': 'queue_depth',
|
|
'title': 'Queue Depth',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 0, 'y': 24, 'w': 12, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'queue_depth{{service="{service_name}"}}',
|
|
'legendFormat': 'Messages in Queue'
|
|
}
|
|
]
|
|
},
|
|
{
|
|
'id': 'message_throughput',
|
|
'title': 'Message Throughput',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 12, 'y': 24, 'w': 12, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'rate(messages_published_total{{service="{service_name}"}}[5m])',
|
|
'legendFormat': 'Published/sec'
|
|
},
|
|
{
|
|
'expr': f'rate(messages_consumed_total{{service="{service_name}"}}[5m])',
|
|
'legendFormat': 'Consumed/sec'
|
|
}
|
|
]
|
|
}
|
|
]
|
|
|
|
def _create_business_metrics_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Create business metrics panels."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return [
|
|
{
|
|
'id': 'business_kpis',
|
|
'title': 'Business KPIs',
|
|
'type': 'stat',
|
|
'grid_pos': {'x': 0, 'y': 30, 'w': 24, 'h': 4},
|
|
'targets': [
|
|
{
|
|
'expr': f'rate(business_transactions_total{{service="{service_name}"}}[1h])',
|
|
'legendFormat': 'Transactions/hour'
|
|
},
|
|
{
|
|
'expr': f'avg(business_transaction_value{{service="{service_name}"}}) * rate(business_transactions_total{{service="{service_name}"}}[1h])',
|
|
'legendFormat': 'Revenue/hour'
|
|
},
|
|
{
|
|
'expr': f'rate(user_registrations_total{{service="{service_name}"}}[1h])',
|
|
'legendFormat': 'New Users/hour'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'palette-classic'},
|
|
'custom': {
|
|
'displayMode': 'basic'
|
|
}
|
|
}
|
|
},
|
|
'options': {
|
|
'orientation': 'horizontal',
|
|
'textMode': 'value_and_name'
|
|
}
|
|
}
|
|
]
|
|
|
|
def _create_capacity_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Create capacity planning panels."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return [
|
|
{
|
|
'id': 'capacity_trends',
|
|
'title': 'Capacity Trends (7d)',
|
|
'type': 'timeseries',
|
|
'grid_pos': {'x': 0, 'y': 34, 'w': 24, 'h': 6},
|
|
'targets': [
|
|
{
|
|
'expr': f'predict_linear(avg_over_time(rate(http_requests_total{{service="{service_name}"}}[5m])[7d:1h]), 7*24*3600)',
|
|
'legendFormat': 'Predicted Traffic (7d)'
|
|
},
|
|
{
|
|
'expr': f'predict_linear(avg_over_time(process_resident_memory_bytes{{service="{service_name}"}}[7d:1h]), 7*24*3600)',
|
|
'legendFormat': 'Predicted Memory Usage (7d)'
|
|
}
|
|
],
|
|
'field_config': {
|
|
'defaults': {
|
|
'color': {'mode': 'palette-classic'},
|
|
'custom': {
|
|
'drawStyle': 'line',
|
|
'lineStyle': {'dash': [10, 10]}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
]
|
|
|
|
def _generate_template_variables(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Generate template variables for dynamic dashboard filtering."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return [
|
|
{
|
|
'name': 'environment',
|
|
'type': 'query',
|
|
'query': 'label_values(environment)',
|
|
'current': {'text': 'production', 'value': 'production'},
|
|
'includeAll': False,
|
|
'multi': False,
|
|
'refresh': 'on_dashboard_load'
|
|
},
|
|
{
|
|
'name': 'instance',
|
|
'type': 'query',
|
|
'query': f'label_values(up{{service="{service_name}"}}, instance)',
|
|
'current': {'text': 'All', 'value': '$__all'},
|
|
'includeAll': True,
|
|
'multi': True,
|
|
'refresh': 'on_time_range_change'
|
|
},
|
|
{
|
|
'name': 'handler',
|
|
'type': 'query',
|
|
'query': f'label_values(http_requests_total{{service="{service_name}"}}, handler)',
|
|
'current': {'text': 'All', 'value': '$__all'},
|
|
'includeAll': True,
|
|
'multi': True,
|
|
'refresh': 'on_time_range_change'
|
|
}
|
|
]
|
|
|
|
def _generate_alerts_integration(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Generate alerts integration configuration."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return {
|
|
'alert_annotations': True,
|
|
'alert_rules_query': f'ALERTS{{service="{service_name}"}}',
|
|
'alert_panels': [
|
|
{
|
|
'title': 'Active Alerts',
|
|
'type': 'table',
|
|
'query': f'ALERTS{{service="{service_name}",alertstate="firing"}}',
|
|
'columns': ['alertname', 'severity', 'instance', 'description']
|
|
}
|
|
]
|
|
}
|
|
|
|
def _generate_drill_down_paths(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Generate drill-down navigation paths."""
|
|
service_name = service_def.get('name', 'service')
|
|
|
|
return {
|
|
'service_overview': {
|
|
'from': 'service_status',
|
|
'to': 'detailed_health_dashboard',
|
|
'url': f'/d/service-health/{service_name}-health',
|
|
'params': ['var-service', 'var-environment']
|
|
},
|
|
'error_investigation': {
|
|
'from': 'errors',
|
|
'to': 'error_details_dashboard',
|
|
'url': f'/d/errors/{service_name}-errors',
|
|
'params': ['var-service', 'var-time_range']
|
|
},
|
|
'latency_analysis': {
|
|
'from': 'latency',
|
|
'to': 'trace_analysis_dashboard',
|
|
'url': f'/d/traces/{service_name}-traces',
|
|
'params': ['var-service', 'var-handler']
|
|
},
|
|
'capacity_planning': {
|
|
'from': 'saturation',
|
|
'to': 'capacity_dashboard',
|
|
'url': f'/d/capacity/{service_name}-capacity',
|
|
'params': ['var-service', 'var-time_range']
|
|
}
|
|
}
|
|
|
|
def generate_grafana_json(self, dashboard_spec: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Convert dashboard specification to Grafana JSON format."""
|
|
metadata = dashboard_spec['metadata']
|
|
config = dashboard_spec['configuration']
|
|
|
|
grafana_json = {
|
|
'dashboard': {
|
|
'id': None,
|
|
'title': metadata['title'],
|
|
'tags': [metadata['service']['type'], metadata['target_role'], 'generated'],
|
|
'timezone': config['timezone'],
|
|
'refresh': config['refresh_interval'],
|
|
'time': {
|
|
'from': 'now-1h',
|
|
'to': 'now'
|
|
},
|
|
'templating': {
|
|
'list': dashboard_spec['variables']
|
|
},
|
|
'panels': self._convert_panels_to_grafana_format(dashboard_spec['panels']),
|
|
'version': 1,
|
|
'schemaVersion': 30
|
|
},
|
|
'overwrite': True
|
|
}
|
|
|
|
return grafana_json
|
|
|
|
def _convert_panels_to_grafana_format(self, panels: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""Convert panel specifications to Grafana format."""
|
|
grafana_panels = []
|
|
|
|
for panel in panels:
|
|
grafana_panel = {
|
|
'id': hash(panel['id']) % 1000, # Generate numeric ID
|
|
'title': panel['title'],
|
|
'type': panel['type'],
|
|
'gridPos': panel['grid_pos'],
|
|
'targets': panel['targets'],
|
|
'fieldConfig': panel.get('field_config', {}),
|
|
'options': panel.get('options', {}),
|
|
'transformations': panel.get('transformations', [])
|
|
}
|
|
grafana_panels.append(grafana_panel)
|
|
|
|
return grafana_panels
|
|
|
|
def generate_documentation(self, dashboard_spec: Dict[str, Any]) -> str:
|
|
"""Generate documentation for the dashboard."""
|
|
metadata = dashboard_spec['metadata']
|
|
service = metadata['service']
|
|
|
|
doc_content = f"""# {metadata['title']} Documentation
|
|
|
|
## Overview
|
|
This dashboard provides comprehensive monitoring for {service['name']}, a {service['type']} service with {service['criticality']} criticality.
|
|
|
|
**Target Audience:** {metadata['target_role'].upper()} teams
|
|
**Generated:** {metadata['generated_at']}
|
|
|
|
## Dashboard Sections
|
|
|
|
### Service Overview
|
|
- **Service Status**: Real-time availability status
|
|
- **SLO Achievement**: 30-day SLO compliance metrics
|
|
- **Error Budget**: Remaining error budget visualization
|
|
|
|
### Golden Signals Monitoring
|
|
- **Latency**: P50, P95, P99 response times
|
|
- **Traffic**: Request rate by status code
|
|
- **Errors**: Error rates for 4xx and 5xx responses
|
|
- **Saturation**: CPU and memory utilization
|
|
|
|
### Resource Utilization
|
|
- **CPU Usage**: Process CPU consumption
|
|
- **Memory Usage**: Memory utilization tracking
|
|
- **Network I/O**: Network throughput metrics
|
|
- **Disk I/O**: Disk read/write operations
|
|
|
|
## Key Metrics
|
|
|
|
### SLIs Tracked
|
|
"""
|
|
|
|
# Add service-type specific metrics
|
|
service_type = service.get('type', 'api')
|
|
if service_type in self.SERVICE_METRICS:
|
|
metrics = self.SERVICE_METRICS[service_type]['key_metrics']
|
|
for metric in metrics:
|
|
doc_content += f"- `{metric}`: Core service metric\n"
|
|
|
|
doc_content += f"""
|
|
## Alert Integration
|
|
- Active alerts are displayed in context with relevant panels
|
|
- Alert annotations show on time series charts
|
|
- Click-through to alert management system available
|
|
|
|
## Drill-Down Paths
|
|
"""
|
|
|
|
drill_downs = dashboard_spec.get('drill_down_paths', {})
|
|
for path_name, path_config in drill_downs.items():
|
|
doc_content += f"- **{path_name}**: From {path_config['from']} → {path_config['to']}\n"
|
|
|
|
doc_content += f"""
|
|
## Usage Guidelines
|
|
|
|
### Time Ranges
|
|
Use appropriate time ranges for different investigation types:
|
|
- **Real-time monitoring**: 15m - 1h
|
|
- **Recent incident investigation**: 1h - 6h
|
|
- **Trend analysis**: 1d - 7d
|
|
- **Capacity planning**: 7d - 30d
|
|
|
|
### Variables
|
|
- **environment**: Filter by deployment environment
|
|
- **instance**: Focus on specific service instances
|
|
- **handler**: Filter by API endpoint or handler
|
|
|
|
### Performance Optimization
|
|
- Use longer time ranges for capacity planning
|
|
- Refresh intervals are optimized per role:
|
|
- SRE: 30s for operational awareness
|
|
- Developer: 1m for troubleshooting
|
|
- Executive: 5m for high-level monitoring
|
|
|
|
## Maintenance
|
|
- Dashboard panels automatically adapt to service changes
|
|
- Template variables refresh based on actual metric labels
|
|
- Review and update business metrics quarterly
|
|
"""
|
|
|
|
return doc_content
|
|
|
|
def export_specification(self, dashboard_spec: Dict[str, Any], output_file: str,
|
|
format_type: str = 'json'):
|
|
"""Export dashboard specification."""
|
|
if format_type.lower() == 'json':
|
|
with open(output_file, 'w') as f:
|
|
json.dump(dashboard_spec, f, indent=2)
|
|
elif format_type.lower() == 'grafana':
|
|
grafana_json = self.generate_grafana_json(dashboard_spec)
|
|
with open(output_file, 'w') as f:
|
|
json.dump(grafana_json, f, indent=2)
|
|
else:
|
|
raise ValueError(f"Unsupported format: {format_type}")
|
|
|
|
def print_summary(self, dashboard_spec: Dict[str, Any]):
    """Write a human-readable overview of ``dashboard_spec`` to stdout.

    The report lists the dashboard metadata, its time/refresh configuration,
    a per-type panel count (in order of first appearance), the template
    variables, the number of drill-down paths, and a fixed feature list.

    ``dashboard_spec`` must contain the ``metadata``, ``configuration`` and
    ``panels`` keys; ``variables`` and ``drill_down_paths`` are optional.
    """
    meta = dashboard_spec['metadata']
    svc = meta['service']
    settings = dashboard_spec['configuration']
    panel_list = dashboard_spec['panels']

    banner = '=' * 60

    print("\n" + banner)
    print("DASHBOARD SPECIFICATION SUMMARY")
    print(banner)

    print("\nDashboard Details:")
    print(" Title: {}".format(meta['title']))
    print(" Target Role: {}".format(meta['target_role'].upper()))
    print(" Service: {} ({})".format(svc['name'], svc['type']))
    print(" Criticality: {}".format(svc['criticality']))
    print(" Generated: {}".format(meta['generated_at']))

    print("\nConfiguration:")
    print(" Default Time Range: {}".format(settings['default_time_range']))
    print(" Refresh Interval: {}".format(settings['refresh_interval']))
    available_ranges = ', '.join(settings['time_ranges'])
    print(" Available Time Ranges: {}".format(available_ranges))

    print("\nPanels ({}):".format(len(panel_list)))
    # Tally panels per type; dict insertion order keeps first-seen ordering.
    tally = {}
    for entry in panel_list:
        kind = entry['type']
        tally.setdefault(kind, 0)
        tally[kind] += 1

    for kind in tally:
        print(" {}: {}".format(kind, tally[kind]))

    template_vars = dashboard_spec.get('variables', [])
    print("\nTemplate Variables ({}):".format(len(template_vars)))
    for item in template_vars:
        print(" {} ({})".format(item['name'], item['type']))

    paths = dashboard_spec.get('drill_down_paths', {})
    print("\nDrill-down Paths: {}".format(len(paths)))

    print("\nKey Features:")
    for feature in (
        "Golden Signals monitoring",
        "Resource utilization tracking",
        "Alert integration",
        "Role-optimized layout",
        "Service-type specific panels",
    ):
        print(" • " + feature)

    print("\n" + banner + "\n")
|
|
|
|
|
|
def main():
    """Command-line entry point for the dashboard generator.

    Parses the CLI arguments, obtains a service definition (loaded from
    ``--input`` or built from ``--service-type``/``--name``/``--criticality``),
    generates a dashboard specification for the requested ``--role``, and
    prints a summary. Unless ``--summary-only`` is given, the specification is
    also written to ``--output`` (or a file name derived from the service
    name) and, when ``--doc-output`` is set, the documentation is written too.

    Argument errors are reported through ``parser.error``. Any exception
    raised while generating or writing output is printed to stderr and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Generate comprehensive dashboard specifications',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate from service definition file
python dashboard_generator.py --input service.json --output dashboard.json

# Generate from command line parameters
python dashboard_generator.py --service-type api --name "Payment Service" --output payment_dashboard.json

# Generate Grafana-compatible JSON
python dashboard_generator.py --input service.json --output dashboard.json --format grafana

# Generate with specific role focus
python dashboard_generator.py --service-type web --name "Frontend" --role developer --output frontend_dev.json
"""
    )

    parser.add_argument('--input', '-i',
                        help='Input service definition JSON file')
    parser.add_argument('--output', '-o',
                        help='Output dashboard specification file')
    parser.add_argument('--service-type',
                        choices=['api', 'web', 'database', 'queue', 'batch', 'ml'],
                        help='Service type')
    parser.add_argument('--name',
                        help='Service name')
    parser.add_argument('--criticality',
                        choices=['critical', 'high', 'medium', 'low'],
                        default='medium',
                        help='Service criticality level')
    parser.add_argument('--role',
                        choices=['sre', 'developer', 'executive', 'ops'],
                        default='sre',
                        help='Target role for dashboard optimization')
    parser.add_argument('--format',
                        choices=['json', 'grafana'],
                        default='json',
                        help='Output format (json specification or grafana compatible)')
    parser.add_argument('--doc-output',
                        help='Generate documentation file')
    parser.add_argument('--summary-only', action='store_true',
                        help='Only display summary, do not save files')

    args = parser.parse_args()

    # Either a definition file or an explicit type + name pair is required.
    if not args.input and not (args.service_type and args.name):
        parser.error("Must provide either --input file or --service-type and --name")

    generator = DashboardGenerator()

    try:
        # Load or create service definition
        if args.input:
            service_def = generator.load_service_definition(args.input)
        else:
            service_def = generator.create_service_definition(
                args.service_type, args.name, args.criticality
            )

        # Generate dashboard specification
        dashboard_spec = generator.generate_dashboard_specification(service_def, args.role)

        # Output results; --summary-only means no files are written at all.
        if not args.summary_only:
            output_file = args.output or f"{service_def['name'].replace(' ', '_').lower()}_dashboard.json"
            generator.export_specification(dashboard_spec, output_file, args.format)
            print(f"Dashboard specification saved to: {output_file}")

            # Generate documentation if requested
            if args.doc_output:
                documentation = generator.generate_documentation(dashboard_spec)
                # The generated text contains non-ASCII characters (e.g. the
                # arrow in the drill-down list), so pin UTF-8 rather than
                # relying on the platform default encoding.
                with open(args.doc_output, 'w', encoding='utf-8') as f:
                    f.write(documentation)
                print(f"Documentation saved to: {args.doc_output}")

        # Always show summary
        generator.print_summary(dashboard_spec)

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()