add brain
This commit is contained in:
@@ -0,0 +1,811 @@
|
||||
{
|
||||
"metadata": {
|
||||
"title": "customer-portal - SRE Dashboard",
|
||||
"service": {
|
||||
"name": "customer-portal",
|
||||
"type": "web",
|
||||
"criticality": "high",
|
||||
"user_facing": true,
|
||||
"description": "Customer-facing web application for account management and billing",
|
||||
"team": "frontend",
|
||||
"environment": "production",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "billing-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "notification-service",
|
||||
"type": "api",
|
||||
"criticality": "medium"
|
||||
},
|
||||
{
|
||||
"name": "cdn",
|
||||
"type": "external",
|
||||
"criticality": "medium"
|
||||
}
|
||||
],
|
||||
"pages": [
|
||||
{
|
||||
"path": "/dashboard",
|
||||
"sla_load_time_ms": 2000,
|
||||
"expected_concurrent_users": 1000
|
||||
},
|
||||
{
|
||||
"path": "/billing",
|
||||
"sla_load_time_ms": 3000,
|
||||
"expected_concurrent_users": 200
|
||||
},
|
||||
{
|
||||
"path": "/settings",
|
||||
"sla_load_time_ms": 1500,
|
||||
"expected_concurrent_users": 100
|
||||
}
|
||||
],
|
||||
"business_metrics": {
|
||||
"daily_active_users": {
|
||||
"metric": "count(user_sessions_started_total[1d])",
|
||||
"target": 10000,
|
||||
"unit": "users"
|
||||
},
|
||||
"session_duration": {
|
||||
"metric": "avg(user_session_duration_seconds)",
|
||||
"target": 300,
|
||||
"unit": "seconds"
|
||||
},
|
||||
"bounce_rate": {
|
||||
"metric": "sum(rate(page_views_bounced_total[1h])) / sum(rate(page_views_total[1h]))",
|
||||
"target": 0.3,
|
||||
"unit": "percentage"
|
||||
}
|
||||
},
|
||||
"infrastructure": {
|
||||
"container_orchestrator": "kubernetes",
|
||||
"replicas": 4,
|
||||
"cpu_limit": "1000m",
|
||||
"memory_limit": "2Gi",
|
||||
"storage": {
|
||||
"type": "nfs",
|
||||
"size": "50Gi"
|
||||
},
|
||||
"ingress": {
|
||||
"type": "nginx",
|
||||
"ssl_termination": true,
|
||||
"rate_limiting": {
|
||||
"requests_per_second": 100,
|
||||
"burst": 200
|
||||
}
|
||||
}
|
||||
},
|
||||
"monitoring": {
|
||||
"synthetic_checks": [
|
||||
{
|
||||
"name": "login_flow",
|
||||
"url": "/auth/login",
|
||||
"frequency": "1m",
|
||||
"locations": [
|
||||
"us-east",
|
||||
"eu-west",
|
||||
"ap-south"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "checkout_flow",
|
||||
"url": "/billing/checkout",
|
||||
"frequency": "5m",
|
||||
"locations": [
|
||||
"us-east",
|
||||
"eu-west"
|
||||
]
|
||||
}
|
||||
],
|
||||
"rum": {
|
||||
"enabled": true,
|
||||
"sampling_rate": 0.1
|
||||
}
|
||||
},
|
||||
"compliance_requirements": [
|
||||
"GDPR",
|
||||
"CCPA"
|
||||
],
|
||||
"tags": [
|
||||
"frontend",
|
||||
"customer-facing",
|
||||
"billing",
|
||||
"high-traffic"
|
||||
]
|
||||
},
|
||||
"target_role": "sre",
|
||||
"generated_at": "2026-02-16T14:02:03.421248Z",
|
||||
"version": "1.0"
|
||||
},
|
||||
"configuration": {
|
||||
"time_ranges": [
|
||||
"1h",
|
||||
"6h",
|
||||
"1d",
|
||||
"7d"
|
||||
],
|
||||
"default_time_range": "6h",
|
||||
"refresh_interval": "30s",
|
||||
"timezone": "UTC",
|
||||
"theme": "dark"
|
||||
},
|
||||
"layout": {
|
||||
"grid_settings": {
|
||||
"width": 24,
|
||||
"height_unit": "px",
|
||||
"cell_height": 30
|
||||
},
|
||||
"sections": [
|
||||
{
|
||||
"title": "Service Overview",
|
||||
"collapsed": false,
|
||||
"y_position": 0,
|
||||
"panels": [
|
||||
"service_status",
|
||||
"slo_summary",
|
||||
"error_budget"
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Golden Signals",
|
||||
"collapsed": false,
|
||||
"y_position": 8,
|
||||
"panels": [
|
||||
"latency",
|
||||
"traffic",
|
||||
"errors",
|
||||
"saturation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Resource Utilization",
|
||||
"collapsed": false,
|
||||
"y_position": 16,
|
||||
"panels": [
|
||||
"cpu_usage",
|
||||
"memory_usage",
|
||||
"network_io",
|
||||
"disk_io"
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Dependencies & Downstream",
|
||||
"collapsed": true,
|
||||
"y_position": 24,
|
||||
"panels": [
|
||||
"dependency_status",
|
||||
"downstream_latency",
|
||||
"circuit_breakers"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": "service_status",
|
||||
"title": "Service Status",
|
||||
"type": "stat",
|
||||
"grid_pos": {
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{service=\"customer-portal\"}",
|
||||
"legendFormat": "Status"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Status"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "thresholds"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
},
|
||||
{
|
||||
"options": {
|
||||
"1": {
|
||||
"text": "UP"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"textMode": "value_and_name"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "slo_summary",
|
||||
"title": "SLO Achievement (30d)",
|
||||
"type": "stat",
|
||||
"grid_pos": {
|
||||
"x": 6,
|
||||
"y": 0,
|
||||
"w": 9,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (increase(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[30d]) / increase(http_requests_total{service=\"customer-portal\"}[30d]))) * 100",
|
||||
"legendFormat": "Availability"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, increase(http_request_duration_seconds_bucket{service=\"customer-portal\"}[30d])) * 1000",
|
||||
"legendFormat": "P95 Latency (ms)"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 99.0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 99.9
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"textMode": "value_and_name"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "error_budget",
|
||||
"title": "Error Budget Remaining",
|
||||
"type": "gauge",
|
||||
"grid_pos": {
|
||||
"x": 15,
|
||||
"y": 0,
|
||||
"w": 9,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (increase(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[30d]) / increase(http_requests_total{service=\"customer-portal\"}[30d])) - 0.999) / 0.001 * 100",
|
||||
"legendFormat": "Error Budget %"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 25
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 50
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "latency",
|
||||
"title": "Request Latency",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 0,
|
||||
"y": 8,
|
||||
"w": 12,
|
||||
"h": 6
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000",
|
||||
"legendFormat": "P50 Latency"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000",
|
||||
"legendFormat": "P95 Latency"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000",
|
||||
"legendFormat": "P99 Latency"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "traffic",
|
||||
"title": "Request Rate",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 12,
|
||||
"y": 8,
|
||||
"w": 12,
|
||||
"h": 6
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\"}[5m]))",
|
||||
"legendFormat": "Total RPS"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"2..\"}[5m]))",
|
||||
"legendFormat": "2xx RPS"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"4..\"}[5m]))",
|
||||
"legendFormat": "4xx RPS"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[5m]))",
|
||||
"legendFormat": "5xx RPS"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "errors",
|
||||
"title": "Error Rate",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 0,
|
||||
"y": 14,
|
||||
"w": 12,
|
||||
"h": 6
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"customer-portal\"}[5m])) * 100",
|
||||
"legendFormat": "5xx Error Rate"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"4..\"}[5m])) / sum(rate(http_requests_total{service=\"customer-portal\"}[5m])) * 100",
|
||||
"legendFormat": "4xx Error Rate"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 20
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "5xx Error Rate"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "saturation",
|
||||
"title": "Saturation Metrics",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 12,
|
||||
"y": 14,
|
||||
"w": 12,
|
||||
"h": 6
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_cpu_seconds_total{service=\"customer-portal\"}[5m]) * 100",
|
||||
"legendFormat": "CPU Usage %"
|
||||
},
|
||||
{
|
||||
"expr": "process_resident_memory_bytes{service=\"customer-portal\"} / process_virtual_memory_max_bytes{service=\"customer-portal\"} * 100",
|
||||
"legendFormat": "Memory Usage %"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "cpu_usage",
|
||||
"title": "CPU Usage",
|
||||
"type": "gauge",
|
||||
"grid_pos": {
|
||||
"x": 0,
|
||||
"y": 20,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_cpu_seconds_total{service=\"customer-portal\"}[5m]) * 100",
|
||||
"legendFormat": "CPU %"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "memory_usage",
|
||||
"title": "Memory Usage",
|
||||
"type": "gauge",
|
||||
"grid_pos": {
|
||||
"x": 6,
|
||||
"y": 20,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_resident_memory_bytes{service=\"customer-portal\"} / 1024 / 1024",
|
||||
"legendFormat": "Memory MB"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"unit": "decbytes",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 512000000
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 1024000000
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "network_io",
|
||||
"title": "Network I/O",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 12,
|
||||
"y": 20,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_network_receive_bytes_total{service=\"customer-portal\"}[5m])",
|
||||
"legendFormat": "RX Bytes/s"
|
||||
},
|
||||
{
|
||||
"expr": "rate(process_network_transmit_bytes_total{service=\"customer-portal\"}[5m])",
|
||||
"legendFormat": "TX Bytes/s"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "binBps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "disk_io",
|
||||
"title": "Disk I/O",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 18,
|
||||
"y": 20,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_disk_read_bytes_total{service=\"customer-portal\"}[5m])",
|
||||
"legendFormat": "Read Bytes/s"
|
||||
},
|
||||
{
|
||||
"expr": "rate(process_disk_write_bytes_total{service=\"customer-portal\"}[5m])",
|
||||
"legendFormat": "Write Bytes/s"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "binBps"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"variables": [
|
||||
{
|
||||
"name": "environment",
|
||||
"type": "query",
|
||||
"query": "label_values(environment)",
|
||||
"current": {
|
||||
"text": "production",
|
||||
"value": "production"
|
||||
},
|
||||
"includeAll": false,
|
||||
"multi": false,
|
||||
"refresh": "on_dashboard_load"
|
||||
},
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"query": "label_values(up{service=\"customer-portal\"}, instance)",
|
||||
"current": {
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"refresh": "on_time_range_change"
|
||||
},
|
||||
{
|
||||
"name": "handler",
|
||||
"type": "query",
|
||||
"query": "label_values(http_requests_total{service=\"customer-portal\"}, handler)",
|
||||
"current": {
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"refresh": "on_time_range_change"
|
||||
}
|
||||
],
|
||||
"alerts_integration": {
|
||||
"alert_annotations": true,
|
||||
"alert_rules_query": "ALERTS{service=\"customer-portal\"}",
|
||||
"alert_panels": [
|
||||
{
|
||||
"title": "Active Alerts",
|
||||
"type": "table",
|
||||
"query": "ALERTS{service=\"customer-portal\",alertstate=\"firing\"}",
|
||||
"columns": [
|
||||
"alertname",
|
||||
"severity",
|
||||
"instance",
|
||||
"description"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"drill_down_paths": {
|
||||
"service_overview": {
|
||||
"from": "service_status",
|
||||
"to": "detailed_health_dashboard",
|
||||
"url": "/d/service-health/customer-portal-health",
|
||||
"params": [
|
||||
"var-service",
|
||||
"var-environment"
|
||||
]
|
||||
},
|
||||
"error_investigation": {
|
||||
"from": "errors",
|
||||
"to": "error_details_dashboard",
|
||||
"url": "/d/errors/customer-portal-errors",
|
||||
"params": [
|
||||
"var-service",
|
||||
"var-time_range"
|
||||
]
|
||||
},
|
||||
"latency_analysis": {
|
||||
"from": "latency",
|
||||
"to": "trace_analysis_dashboard",
|
||||
"url": "/d/traces/customer-portal-traces",
|
||||
"params": [
|
||||
"var-service",
|
||||
"var-handler"
|
||||
]
|
||||
},
|
||||
"capacity_planning": {
|
||||
"from": "saturation",
|
||||
"to": "capacity_dashboard",
|
||||
"url": "/d/capacity/customer-portal-capacity",
|
||||
"params": [
|
||||
"var-service",
|
||||
"var-time_range"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,545 @@
|
||||
{
|
||||
"metadata": {
|
||||
"service": {
|
||||
"name": "payment-service",
|
||||
"type": "api",
|
||||
"criticality": "critical",
|
||||
"user_facing": true,
|
||||
"description": "Handles payment processing and transaction management",
|
||||
"team": "payments",
|
||||
"environment": "production",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "payment-gateway",
|
||||
"type": "external",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "fraud-detection",
|
||||
"type": "ml",
|
||||
"criticality": "high"
|
||||
}
|
||||
],
|
||||
"endpoints": [
|
||||
{
|
||||
"path": "/api/v1/payments",
|
||||
"method": "POST",
|
||||
"sla_latency_ms": 500,
|
||||
"expected_tps": 100
|
||||
},
|
||||
{
|
||||
"path": "/api/v1/payments/{id}",
|
||||
"method": "GET",
|
||||
"sla_latency_ms": 200,
|
||||
"expected_tps": 500
|
||||
},
|
||||
{
|
||||
"path": "/api/v1/payments/{id}/refund",
|
||||
"method": "POST",
|
||||
"sla_latency_ms": 1000,
|
||||
"expected_tps": 10
|
||||
}
|
||||
],
|
||||
"business_metrics": {
|
||||
"revenue_per_hour": {
|
||||
"metric": "sum(payment_amount * rate(payments_successful_total[1h]))",
|
||||
"target": 50000,
|
||||
"unit": "USD"
|
||||
},
|
||||
"conversion_rate": {
|
||||
"metric": "sum(rate(payments_successful_total[5m])) / sum(rate(payment_attempts_total[5m]))",
|
||||
"target": 0.95,
|
||||
"unit": "percentage"
|
||||
}
|
||||
},
|
||||
"infrastructure": {
|
||||
"container_orchestrator": "kubernetes",
|
||||
"replicas": 6,
|
||||
"cpu_limit": "2000m",
|
||||
"memory_limit": "4Gi",
|
||||
"database": {
|
||||
"type": "postgresql",
|
||||
"connection_pool_size": 20
|
||||
},
|
||||
"cache": {
|
||||
"type": "redis",
|
||||
"cluster_size": 3
|
||||
}
|
||||
},
|
||||
"compliance_requirements": [
|
||||
"PCI-DSS",
|
||||
"SOX",
|
||||
"GDPR"
|
||||
],
|
||||
"tags": [
|
||||
"payment",
|
||||
"transaction",
|
||||
"critical-path",
|
||||
"revenue-generating"
|
||||
]
|
||||
},
|
||||
"generated_at": "2026-02-16T14:01:57.572080Z",
|
||||
"framework_version": "1.0"
|
||||
},
|
||||
"slis": [
|
||||
{
|
||||
"name": "Availability",
|
||||
"description": "Percentage of successful requests",
|
||||
"type": "ratio",
|
||||
"good_events": "sum(rate(http_requests_total{service=\"payment-service\",code!~\"5..\"}))",
|
||||
"total_events": "sum(rate(http_requests_total{service=\"payment-service\"}))",
|
||||
"unit": "percentage"
|
||||
},
|
||||
{
|
||||
"name": "Request Latency P95",
|
||||
"description": "95th percentile of request latency",
|
||||
"type": "threshold",
|
||||
"query": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m]))",
|
||||
"unit": "seconds"
|
||||
},
|
||||
{
|
||||
"name": "Error Rate",
|
||||
"description": "Rate of 5xx errors",
|
||||
"type": "ratio",
|
||||
"good_events": "sum(rate(http_requests_total{service=\"payment-service\",code!~\"5..\"}))",
|
||||
"total_events": "sum(rate(http_requests_total{service=\"payment-service\"}))",
|
||||
"unit": "percentage"
|
||||
},
|
||||
{
|
||||
"name": "Request Throughput",
|
||||
"description": "Requests per second",
|
||||
"type": "gauge",
|
||||
"query": "sum(rate(http_requests_total{service=\"payment-service\"}[5m]))",
|
||||
"unit": "requests/sec"
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate",
|
||||
"description": "Percentage of successful complete user journeys",
|
||||
"type": "ratio",
|
||||
"good_events": "sum(rate(user_journey_total{service=\"payment-service\",status=\"success\"}[5m]))",
|
||||
"total_events": "sum(rate(user_journey_total{service=\"payment-service\"}[5m]))",
|
||||
"unit": "percentage"
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability",
|
||||
"description": "Percentage of time key features are available",
|
||||
"type": "ratio",
|
||||
"good_events": "sum(rate(feature_checks_total{service=\"payment-service\",status=\"available\"}[5m]))",
|
||||
"total_events": "sum(rate(feature_checks_total{service=\"payment-service\"}[5m]))",
|
||||
"unit": "percentage"
|
||||
}
|
||||
],
|
||||
"slos": [
|
||||
{
|
||||
"name": "Availability SLO",
|
||||
"description": "Service level objective for percentage of successful requests",
|
||||
"sli_name": "Availability",
|
||||
"target_value": 0.9999,
|
||||
"target_display": "99.99%",
|
||||
"operator": ">=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "Request Latency P95 SLO",
|
||||
"description": "Service level objective for 95th percentile of request latency",
|
||||
"sli_name": "Request Latency P95",
|
||||
"target_value": 100,
|
||||
"target_display": "0.1s",
|
||||
"operator": "<=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "Error Rate SLO",
|
||||
"description": "Service level objective for rate of 5xx errors",
|
||||
"sli_name": "Error Rate",
|
||||
"target_value": 0.001,
|
||||
"target_display": "0.1%",
|
||||
"operator": "<=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate SLO",
|
||||
"description": "Service level objective for percentage of successful complete user journeys",
|
||||
"sli_name": "User Journey Success Rate",
|
||||
"target_value": 0.9999,
|
||||
"target_display": "99.99%",
|
||||
"operator": ">=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability SLO",
|
||||
"description": "Service level objective for percentage of time key features are available",
|
||||
"sli_name": "Feature Availability",
|
||||
"target_value": 0.9999,
|
||||
"target_display": "99.99%",
|
||||
"operator": ">=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
}
|
||||
],
|
||||
"error_budgets": [
|
||||
{
|
||||
"slo_name": "Availability SLO",
|
||||
"error_budget_rate": 9.999999999998899e-05,
|
||||
"error_budget_percentage": "0.010%",
|
||||
"budgets_by_window": {
|
||||
"1h": "0.4 seconds",
|
||||
"1d": "8.6 seconds",
|
||||
"7d": "1.0 minutes",
|
||||
"30d": "4.3 minutes"
|
||||
},
|
||||
"burn_rate_alerts": [
|
||||
{
|
||||
"name": "Availability Burn Rate 2% Alert",
|
||||
"description": "Alert when Availability is consuming error budget at 14.4x rate",
|
||||
"severity": "critical",
|
||||
"short_window": "5m",
|
||||
"long_window": "1h",
|
||||
"burn_rate_threshold": 14.4,
|
||||
"budget_consumed": "2%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Availability",
|
||||
"description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Availability Burn Rate 5% Alert",
|
||||
"description": "Alert when Availability is consuming error budget at 6x rate",
|
||||
"severity": "warning",
|
||||
"short_window": "30m",
|
||||
"long_window": "6h",
|
||||
"burn_rate_threshold": 6,
|
||||
"budget_consumed": "5%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Availability",
|
||||
"description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Availability Burn Rate 10% Alert",
|
||||
"description": "Alert when Availability is consuming error budget at 3x rate",
|
||||
"severity": "info",
|
||||
"short_window": "2h",
|
||||
"long_window": "1d",
|
||||
"burn_rate_threshold": 3,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Availability",
|
||||
"description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Availability Burn Rate 10% Alert",
|
||||
"description": "Alert when Availability is consuming error budget at 1x rate",
|
||||
"severity": "info",
|
||||
"short_window": "6h",
|
||||
"long_window": "3d",
|
||||
"burn_rate_threshold": 1,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Availability",
|
||||
"description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"slo_name": "User Journey Success Rate SLO",
|
||||
"error_budget_rate": 9.999999999998899e-05,
|
||||
"error_budget_percentage": "0.010%",
|
||||
"budgets_by_window": {
|
||||
"1h": "0.4 seconds",
|
||||
"1d": "8.6 seconds",
|
||||
"7d": "1.0 minutes",
|
||||
"30d": "4.3 minutes"
|
||||
},
|
||||
"burn_rate_alerts": [
|
||||
{
|
||||
"name": "User Journey Success Rate Burn Rate 2% Alert",
|
||||
"description": "Alert when User Journey Success Rate is consuming error budget at 14.4x rate",
|
||||
"severity": "critical",
|
||||
"short_window": "5m",
|
||||
"long_window": "1h",
|
||||
"burn_rate_threshold": 14.4,
|
||||
"budget_consumed": "2%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for User Journey Success Rate",
|
||||
"description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate Burn Rate 5% Alert",
|
||||
"description": "Alert when User Journey Success Rate is consuming error budget at 6x rate",
|
||||
"severity": "warning",
|
||||
"short_window": "30m",
|
||||
"long_window": "6h",
|
||||
"burn_rate_threshold": 6,
|
||||
"budget_consumed": "5%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for User Journey Success Rate",
|
||||
"description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate Burn Rate 10% Alert",
|
||||
"description": "Alert when User Journey Success Rate is consuming error budget at 3x rate",
|
||||
"severity": "info",
|
||||
"short_window": "2h",
|
||||
"long_window": "1d",
|
||||
"burn_rate_threshold": 3,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for User Journey Success Rate",
|
||||
"description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate Burn Rate 10% Alert",
|
||||
"description": "Alert when User Journey Success Rate is consuming error budget at 1x rate",
|
||||
"severity": "info",
|
||||
"short_window": "6h",
|
||||
"long_window": "3d",
|
||||
"burn_rate_threshold": 1,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for User Journey Success Rate",
|
||||
"description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"slo_name": "Feature Availability SLO",
|
||||
"error_budget_rate": 9.999999999998899e-05,
|
||||
"error_budget_percentage": "0.010%",
|
||||
"budgets_by_window": {
|
||||
"1h": "0.4 seconds",
|
||||
"1d": "8.6 seconds",
|
||||
"7d": "1.0 minutes",
|
||||
"30d": "4.3 minutes"
|
||||
},
|
||||
"burn_rate_alerts": [
|
||||
{
|
||||
"name": "Feature Availability Burn Rate 2% Alert",
|
||||
"description": "Alert when Feature Availability is consuming error budget at 14.4x rate",
|
||||
"severity": "critical",
|
||||
"short_window": "5m",
|
||||
"long_window": "1h",
|
||||
"burn_rate_threshold": 14.4,
|
||||
"budget_consumed": "2%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'}[5m])) / sum(rate(http_requests_total{service='payment-service'}[5m])))) > (14.4 * 0.0001)) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'}[1h])) / sum(rate(http_requests_total{service='payment-service'}[1h])))) > (14.4 * 0.0001))",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Feature Availability",
|
||||
"description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability Burn Rate 5% Alert",
|
||||
"description": "Alert when Feature Availability is consuming error budget at 6x rate",
|
||||
"severity": "warning",
|
||||
"short_window": "30m",
|
||||
"long_window": "6h",
|
||||
"burn_rate_threshold": 6,
|
||||
"budget_consumed": "5%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'}[30m])) / sum(rate(http_requests_total{service='payment-service'}[30m])))) > (6 * 0.0001)) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'}[6h])) / sum(rate(http_requests_total{service='payment-service'}[6h])))) > (6 * 0.0001))",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Feature Availability",
|
||||
"description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability Burn Rate 10% Alert",
|
||||
"description": "Alert when Feature Availability is consuming error budget at 3x rate",
|
||||
"severity": "info",
|
||||
"short_window": "2h",
|
||||
"long_window": "1d",
|
||||
"burn_rate_threshold": 3,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'}[2h])) / sum(rate(http_requests_total{service='payment-service'}[2h])))) > (3 * 0.0001)) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'}[1d])) / sum(rate(http_requests_total{service='payment-service'}[1d])))) > (3 * 0.0001))",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Feature Availability",
|
||||
"description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability Burn Rate 10% Alert (1x over 3d)",
|
||||
"description": "Alert when Feature Availability is consuming error budget at 1x rate",
|
||||
"severity": "info",
|
||||
"short_window": "6h",
|
||||
"long_window": "3d",
|
||||
"burn_rate_threshold": 1,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'}[6h])) / sum(rate(http_requests_total{service='payment-service'}[6h])))) > (1 * 0.0001)) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'}[3d])) / sum(rate(http_requests_total{service='payment-service'}[3d])))) > (1 * 0.0001))",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Feature Availability",
|
||||
"description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"sla_recommendations": {
|
||||
"applicable": true,
|
||||
"service": "payment-service",
|
||||
"commitments": [
|
||||
{
|
||||
"metric": "Availability",
|
||||
"target": 0.9989,
|
||||
"target_display": "99.89%",
|
||||
"measurement_window": "monthly",
|
||||
"measurement_method": "Uptime monitoring with 1-minute granularity"
|
||||
},
|
||||
{
|
||||
"metric": "Feature Availability",
|
||||
"target": 0.9989,
|
||||
"target_display": "99.89%",
|
||||
"measurement_window": "monthly",
|
||||
"measurement_method": "Uptime monitoring with 1-minute granularity"
|
||||
}
|
||||
],
|
||||
"penalties": [
|
||||
{
|
||||
"breach_threshold": "< 99.99%",
|
||||
"credit_percentage": 10
|
||||
},
|
||||
{
|
||||
"breach_threshold": "< 99.9%",
|
||||
"credit_percentage": 25
|
||||
},
|
||||
{
|
||||
"breach_threshold": "< 99%",
|
||||
"credit_percentage": 50
|
||||
}
|
||||
],
|
||||
"measurement_methodology": "External synthetic monitoring from multiple geographic locations",
|
||||
"exclusions": [
|
||||
"Planned maintenance windows (with 72h advance notice)",
|
||||
"Customer-side network or infrastructure issues",
|
||||
"Force majeure events",
|
||||
"Third-party service dependencies beyond our control"
|
||||
]
|
||||
},
|
||||
"monitoring_recommendations": {
|
||||
"metrics": {
|
||||
"collection": "Prometheus with service discovery",
|
||||
"retention": "90 days for raw metrics, 1 year for aggregated",
|
||||
"alerting": "Prometheus Alertmanager with multi-window burn rate alerts"
|
||||
},
|
||||
"logging": {
|
||||
"format": "Structured JSON logs with correlation IDs",
|
||||
"aggregation": "ELK stack or equivalent with proper indexing",
|
||||
"retention": "30 days for debug logs, 90 days for error logs"
|
||||
},
|
||||
"tracing": {
|
||||
"sampling": "Adaptive sampling with 1% base rate",
|
||||
"storage": "Jaeger or Zipkin with 7-day retention",
|
||||
"integration": "OpenTelemetry instrumentation"
|
||||
}
|
||||
},
|
||||
"implementation_guide": {
|
||||
"prerequisites": [
|
||||
"Service instrumented with metrics collection (Prometheus format)",
|
||||
"Structured logging with correlation IDs",
|
||||
"Monitoring infrastructure (Prometheus, Grafana, Alertmanager)",
|
||||
"Incident response processes and escalation policies"
|
||||
],
|
||||
"implementation_steps": [
|
||||
{
|
||||
"step": 1,
|
||||
"title": "Instrument Service",
|
||||
"description": "Add metrics collection for all defined SLIs",
|
||||
"estimated_effort": "1-2 days"
|
||||
},
|
||||
{
|
||||
"step": 2,
|
||||
"title": "Configure Recording Rules",
|
||||
"description": "Set up Prometheus recording rules for SLI calculations",
|
||||
"estimated_effort": "4-8 hours"
|
||||
},
|
||||
{
|
||||
"step": 3,
|
||||
"title": "Implement Burn Rate Alerts",
|
||||
"description": "Configure multi-window burn rate alerting rules",
|
||||
"estimated_effort": "1 day"
|
||||
},
|
||||
{
|
||||
"step": 4,
|
||||
"title": "Create SLO Dashboard",
|
||||
"description": "Build Grafana dashboard for SLO tracking and error budget monitoring",
|
||||
"estimated_effort": "4-6 hours"
|
||||
},
|
||||
{
|
||||
"step": 5,
|
||||
"title": "Test and Validate",
|
||||
"description": "Test alerting and validate SLI measurements against expectations",
|
||||
"estimated_effort": "1-2 days"
|
||||
},
|
||||
{
|
||||
"step": 6,
|
||||
"title": "Documentation and Training",
|
||||
"description": "Document runbooks and train team on SLO monitoring",
|
||||
"estimated_effort": "1 day"
|
||||
}
|
||||
],
|
||||
"validation_checklist": [
|
||||
"All SLIs produce expected metric values",
|
||||
"Burn rate alerts fire correctly during simulated outages",
|
||||
"Error budget calculations match manual verification",
|
||||
"Dashboard displays accurate SLO achievement rates",
|
||||
"Alert routing reaches correct escalation paths",
|
||||
"Runbooks are complete and tested"
|
||||
]
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user