add brain
This commit is contained in:
@@ -0,0 +1,384 @@
|
||||
# Observability Designer
|
||||
|
||||
A comprehensive toolkit for designing production-ready observability strategies including SLI/SLO frameworks, alert optimization, and dashboard generation.
|
||||
|
||||
## Overview
|
||||
|
||||
The Observability Designer skill provides three powerful Python scripts that help you create, optimize, and maintain observability systems:
|
||||
|
||||
- **SLO Designer**: Generate complete SLI/SLO frameworks with error budgets and burn rate alerts
|
||||
- **Alert Optimizer**: Analyze and optimize existing alert configurations to reduce noise and improve effectiveness
|
||||
- **Dashboard Generator**: Create comprehensive dashboard specifications with role-based layouts and drill-down paths
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Python 3.7+
|
||||
- No external runtime dependencies required (scripts use the Python standard library only; `pytest` and `flake8` are needed only for development)
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```bash
|
||||
# Generate SLO framework for a service
|
||||
python3 scripts/slo_designer.py --service-type api --criticality critical --user-facing true --service-name payment-service
|
||||
|
||||
# Optimize existing alerts
|
||||
python3 scripts/alert_optimizer.py --input assets/sample_alerts.json --analyze-only
|
||||
|
||||
# Generate a dashboard specification
|
||||
python3 scripts/dashboard_generator.py --service-type web --name "Customer Portal" --role sre
|
||||
```
|
||||
|
||||
## Scripts Documentation
|
||||
|
||||
### SLO Designer (`slo_designer.py`)
|
||||
|
||||
Generates comprehensive SLO frameworks based on service characteristics.
|
||||
|
||||
#### Features
|
||||
- **Automatic SLI Selection**: Recommends appropriate SLIs based on service type
|
||||
- **Target Setting**: Suggests SLO targets based on service criticality
|
||||
- **Error Budget Calculation**: Computes error budgets and burn rate thresholds
|
||||
- **Multi-Window Burn Rate Alerts**: Generates 4-window burn rate alerting rules
|
||||
- **SLA Recommendations**: Provides customer-facing SLA guidance
|
||||
|
||||
#### Usage Examples
|
||||
|
||||
```bash
|
||||
# From service definition file
|
||||
python3 scripts/slo_designer.py --input assets/sample_service_api.json --output slo_framework.json
|
||||
|
||||
# From command line parameters
|
||||
python3 scripts/slo_designer.py \
|
||||
--service-type api \
|
||||
--criticality critical \
|
||||
--user-facing true \
|
||||
--service-name payment-service \
|
||||
--output payment_slos.json
|
||||
|
||||
# Generate and display summary only
|
||||
python3 scripts/slo_designer.py --input assets/sample_service_web.json --summary-only
|
||||
```
|
||||
|
||||
#### Service Definition Format
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "payment-service",
|
||||
"type": "api",
|
||||
"criticality": "critical",
|
||||
"user_facing": true,
|
||||
"description": "Handles payment processing",
|
||||
"team": "payments",
|
||||
"environment": "production",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### Supported Service Types
|
||||
- **api**: REST APIs, GraphQL services
|
||||
- **web**: Web applications, SPAs
|
||||
- **database**: Database services, data stores
|
||||
- **queue**: Message queues, event streams
|
||||
- **batch**: Batch processing jobs
|
||||
- **ml**: Machine learning services
|
||||
|
||||
#### Criticality Levels
|
||||
- **critical**: 99.99% availability, <100ms P95 latency, <0.1% error rate
|
||||
- **high**: 99.9% availability, <200ms P95 latency, <0.5% error rate
|
||||
- **medium**: 99.5% availability, <500ms P95 latency, <1% error rate
|
||||
- **low**: 99% availability, <1s P95 latency, <2% error rate
|
||||
|
||||
### Alert Optimizer (`alert_optimizer.py`)
|
||||
|
||||
Analyzes existing alert configurations and provides optimization recommendations.
|
||||
|
||||
#### Features
|
||||
- **Noise Detection**: Identifies alerts with high false positive rates
|
||||
- **Coverage Analysis**: Finds gaps in monitoring coverage
|
||||
- **Duplicate Detection**: Locates redundant or overlapping alerts
|
||||
- **Threshold Analysis**: Reviews alert thresholds for appropriateness
|
||||
- **Fatigue Assessment**: Evaluates alert volume and routing
|
||||
|
||||
#### Usage Examples
|
||||
|
||||
```bash
|
||||
# Analyze existing alerts
|
||||
python3 scripts/alert_optimizer.py --input assets/sample_alerts.json --analyze-only
|
||||
|
||||
# Generate optimized configuration
|
||||
python3 scripts/alert_optimizer.py \
|
||||
--input assets/sample_alerts.json \
|
||||
--output optimized_alerts.json
|
||||
|
||||
# Generate HTML report
|
||||
python3 scripts/alert_optimizer.py \
|
||||
--input assets/sample_alerts.json \
|
||||
--report alert_analysis.html \
|
||||
--format html
|
||||
```
|
||||
|
||||
#### Alert Configuration Format
|
||||
|
||||
```json
|
||||
{
|
||||
"alerts": [
|
||||
{
|
||||
"alert": "HighLatency",
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High request latency detected",
|
||||
"runbook_url": "https://runbooks.company.com/high-latency"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 2.5,
|
||||
"false_positive_rate": 0.15
|
||||
}
|
||||
}
|
||||
],
|
||||
"services": [
|
||||
{
|
||||
"name": "payment-service",
|
||||
"criticality": "critical"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### Analysis Categories
|
||||
- **Golden Signals**: Latency, traffic, errors, saturation
|
||||
- **Resource Utilization**: CPU, memory, disk, network
|
||||
- **Business Metrics**: Revenue, conversion, user engagement
|
||||
- **Security**: Auth failures, suspicious activity
|
||||
- **Availability**: Uptime, health checks
|
||||
|
||||
### Dashboard Generator (`dashboard_generator.py`)
|
||||
|
||||
Creates comprehensive dashboard specifications with role-based optimization.
|
||||
|
||||
#### Features
|
||||
- **Role-Based Layouts**: Optimized for SRE, Developer, Executive, and Ops personas
|
||||
- **Golden Signals Coverage**: Automatic inclusion of key monitoring metrics
|
||||
- **Service-Type Specific Panels**: Tailored panels based on service characteristics
|
||||
- **Interactive Elements**: Template variables, drill-down paths, time range controls
|
||||
- **Grafana Compatibility**: Generates Grafana-compatible JSON
|
||||
|
||||
#### Usage Examples
|
||||
|
||||
```bash
|
||||
# From service definition
|
||||
python3 scripts/dashboard_generator.py \
|
||||
--input assets/sample_service_web.json \
|
||||
--output dashboard.json
|
||||
|
||||
# With specific role optimization
|
||||
python3 scripts/dashboard_generator.py \
|
||||
--service-type api \
|
||||
--name "Payment Service" \
|
||||
--role developer \
|
||||
--output payment_dev_dashboard.json
|
||||
|
||||
# Generate Grafana-compatible JSON
|
||||
python3 scripts/dashboard_generator.py \
|
||||
--input assets/sample_service_api.json \
|
||||
--output dashboard.json \
|
||||
--format grafana
|
||||
|
||||
# With documentation
|
||||
python3 scripts/dashboard_generator.py \
|
||||
--service-type web \
|
||||
--name "Customer Portal" \
|
||||
--output portal_dashboard.json \
|
||||
--doc-output portal_docs.md
|
||||
```
|
||||
|
||||
#### Target Roles
|
||||
|
||||
- **sre**: Focus on availability, latency, errors, resource utilization
|
||||
- **developer**: Emphasize latency, errors, throughput, business metrics
|
||||
- **executive**: Highlight availability, business metrics, user experience
|
||||
- **ops**: Priority on resource utilization, capacity, alerts, deployments
|
||||
|
||||
#### Panel Types
|
||||
- **Stat**: Single value displays with thresholds
|
||||
- **Gauge**: Resource utilization and capacity metrics
|
||||
- **Timeseries**: Trend analysis and historical data
|
||||
- **Table**: Top N lists and detailed breakdowns
|
||||
- **Heatmap**: Distribution and correlation analysis
|
||||
|
||||
## Sample Data
|
||||
|
||||
The `assets/` directory contains sample configurations for testing:
|
||||
|
||||
- `sample_service_api.json`: Critical API service definition
|
||||
- `sample_service_web.json`: High-priority web application definition
|
||||
- `sample_alerts.json`: Alert configuration with optimization opportunities
|
||||
|
||||
The `expected_outputs/` directory shows example outputs from each script:
|
||||
|
||||
- `sample_slo_framework.json`: Complete SLO framework for API service
|
||||
- `optimized_alerts.json`: Optimized alert configuration
|
||||
- `sample_dashboard.json`: SRE dashboard specification
|
||||
|
||||
## Best Practices
|
||||
|
||||
### SLO Design
|
||||
- Start with 1-2 SLOs per service and iterate
|
||||
- Choose SLIs that directly impact user experience
|
||||
- Set targets based on user needs, not technical capabilities
|
||||
- Use error budgets to balance reliability and velocity
|
||||
|
||||
### Alert Optimization
|
||||
- Every alert must be actionable
|
||||
- Alert on symptoms, not causes
|
||||
- Use multi-window burn rate alerts for SLO protection
|
||||
- Implement proper escalation and routing policies
|
||||
|
||||
### Dashboard Design
|
||||
- Follow the F-pattern for visual hierarchy
|
||||
- Use consistent color semantics across dashboards
|
||||
- Include drill-down paths for effective troubleshooting
|
||||
- Optimize for the target role's specific needs
|
||||
|
||||
## Integration Patterns
|
||||
|
||||
### CI/CD Integration
|
||||
```bash
|
||||
# Generate SLOs during service onboarding
|
||||
python3 scripts/slo_designer.py --input service-config.json --output slos.json
|
||||
|
||||
# Validate alert configurations in pipeline
|
||||
python3 scripts/alert_optimizer.py --input alerts.json --analyze-only --report validation.html
|
||||
|
||||
# Auto-generate dashboards for new services
|
||||
python3 scripts/dashboard_generator.py --input service-config.json --format grafana --output dashboard.json
|
||||
```
|
||||
|
||||
### Monitoring Stack Integration
|
||||
- **Prometheus**: Generated alert rules and recording rules
|
||||
- **Grafana**: Dashboard JSON for direct import
|
||||
- **Alertmanager**: Routing and escalation policies
|
||||
- **PagerDuty**: Escalation configuration
|
||||
|
||||
### GitOps Workflow
|
||||
1. Store service definitions in version control
|
||||
2. Generate observability configurations in CI/CD
|
||||
3. Deploy configurations via GitOps
|
||||
4. Monitor effectiveness and iterate
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Custom SLO Targets
|
||||
Override default targets by including them in service definitions:
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "special-service",
|
||||
"type": "api",
|
||||
"criticality": "high",
|
||||
"custom_slos": {
|
||||
"availability_target": 0.9995,
|
||||
"latency_p95_target_ms": 150,
|
||||
"error_rate_target": 0.002
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Alert Rule Templates
|
||||
Use template variables for reusable alert rules:
|
||||
|
||||
```yaml
|
||||
# Generated Prometheus alert rule
|
||||
- alert: {{ service_name }}_HighLatency
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service="{{ service_name }}"}[5m])) > {{ latency_threshold }}
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: "{{ service_name }}"
|
||||
```
|
||||
|
||||
### Dashboard Variants
|
||||
Generate multiple dashboard variants for different use cases:
|
||||
|
||||
```bash
|
||||
# SRE operational dashboard
|
||||
python3 scripts/dashboard_generator.py --input service.json --role sre --output sre-dashboard.json
|
||||
|
||||
# Developer debugging dashboard
|
||||
python3 scripts/dashboard_generator.py --input service.json --role developer --output dev-dashboard.json
|
||||
|
||||
# Executive business dashboard
|
||||
python3 scripts/dashboard_generator.py --input service.json --role executive --output exec-dashboard.json
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
#### Script Execution Errors
|
||||
- Ensure Python 3.7+ is installed
|
||||
- Check file paths and permissions
|
||||
- Validate JSON syntax in input files
|
||||
|
||||
#### Invalid Service Definitions
|
||||
- Required fields: `name`, `type`, `criticality`
|
||||
- Valid service types: `api`, `web`, `database`, `queue`, `batch`, `ml`
|
||||
- Valid criticality levels: `critical`, `high`, `medium`, `low`
|
||||
|
||||
#### Missing Historical Data
|
||||
- Alert historical data is optional but improves analysis
|
||||
- Include `fires_per_day` and `false_positive_rate` when available
|
||||
- Use monitoring system APIs to populate historical metrics
|
||||
|
||||
### Debug Mode
|
||||
Enable verbose logging by setting environment variable:
|
||||
|
||||
```bash
|
||||
export DEBUG=1
|
||||
python3 scripts/slo_designer.py --input service.json
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
### Development Setup
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone <repository-url>
|
||||
cd engineering/observability-designer
|
||||
|
||||
# Run tests
|
||||
python3 -m pytest tests/
|
||||
|
||||
# Lint code
|
||||
python3 -m flake8 scripts/
|
||||
```
|
||||
|
||||
### Adding New Features
|
||||
1. Follow existing code patterns and error handling
|
||||
2. Include comprehensive docstrings and type hints
|
||||
3. Add test cases for new functionality
|
||||
4. Update documentation and examples
|
||||
|
||||
## Support
|
||||
|
||||
For questions, issues, or feature requests:
|
||||
- Check existing documentation and examples
|
||||
- Review the reference materials in `references/`
|
||||
- Open an issue with detailed reproduction steps
|
||||
- Include sample configurations when reporting bugs
|
||||
|
||||
---
|
||||
|
||||
*This skill is part of the Claude Skills marketplace. For more information about observability best practices, see the reference documentation in the `references/` directory.*
|
||||
@@ -0,0 +1,269 @@
|
||||
---
|
||||
name: "observability-designer"
|
||||
description: "Design production-ready observability strategies including SLI/SLO frameworks, alert optimization, and dashboard generation"
|
||||
---
|
||||
|
||||
# Observability Designer (POWERFUL)
|
||||
|
||||
**Category:** Engineering
|
||||
**Tier:** POWERFUL
|
||||
**Description:** Design comprehensive observability strategies for production systems including SLI/SLO frameworks, alerting optimization, and dashboard generation.
|
||||
|
||||
## Overview
|
||||
|
||||
Observability Designer enables you to create production-ready observability strategies that provide deep insights into system behavior, performance, and reliability. This skill combines the three pillars of observability (metrics, logs, traces) with proven frameworks like SLI/SLO design, golden signals monitoring, and alert optimization to create comprehensive observability solutions.
|
||||
|
||||
## Core Competencies
|
||||
|
||||
### SLI/SLO/SLA Framework Design
|
||||
- **Service Level Indicators (SLI):** Define measurable signals that indicate service health
|
||||
- **Service Level Objectives (SLO):** Set reliability targets based on user experience
|
||||
- **Service Level Agreements (SLA):** Establish customer-facing commitments with consequences
|
||||
- **Error Budget Management:** Calculate and track error budget consumption
|
||||
- **Burn Rate Alerting:** Multi-window burn rate alerts for proactive SLO protection
|
||||
|
||||
### Three Pillars of Observability
|
||||
|
||||
#### Metrics
|
||||
- **Golden Signals:** Latency, traffic, errors, and saturation monitoring
|
||||
- **RED Method:** Rate, Errors, and Duration for request-driven services
|
||||
- **USE Method:** Utilization, Saturation, and Errors for resource monitoring
|
||||
- **Business Metrics:** Revenue, user engagement, and feature adoption tracking
|
||||
- **Infrastructure Metrics:** CPU, memory, disk, network, and custom resource metrics
|
||||
|
||||
#### Logs
|
||||
- **Structured Logging:** JSON-based log formats with consistent fields
|
||||
- **Log Aggregation:** Centralized log collection and indexing strategies
|
||||
- **Log Levels:** Appropriate use of DEBUG, INFO, WARN, ERROR, FATAL levels
|
||||
- **Correlation IDs:** Request tracing through distributed systems
|
||||
- **Log Sampling:** Volume management for high-throughput systems
|
||||
|
||||
#### Traces
|
||||
- **Distributed Tracing:** End-to-end request flow visualization
|
||||
- **Span Design:** Meaningful span boundaries and metadata
|
||||
- **Trace Sampling:** Intelligent sampling strategies for performance and cost
|
||||
- **Service Maps:** Automatic dependency discovery through traces
|
||||
- **Root Cause Analysis:** Trace-driven debugging workflows
|
||||
|
||||
### Dashboard Design Principles
|
||||
|
||||
#### Information Architecture
|
||||
- **Hierarchy:** Overview → Service → Component → Instance drill-down paths
|
||||
- **Pareto Split (80/20):** 80% operational metrics, 20% exploratory metrics
|
||||
- **Cognitive Load:** Maximum 7±2 panels per dashboard screen
|
||||
- **User Journey:** Role-based dashboard personas (SRE, Developer, Executive)
|
||||
|
||||
#### Visualization Best Practices
|
||||
- **Chart Selection:** Time series for trends, heatmaps for distributions, gauges for status
|
||||
- **Color Theory:** Red for critical, amber for warning, green for healthy states
|
||||
- **Reference Lines:** SLO targets, capacity thresholds, and historical baselines
|
||||
- **Time Ranges:** Default to meaningful windows (4h for incidents, 7d for trends)
|
||||
|
||||
#### Panel Design
|
||||
- **Metric Queries:** Efficient Prometheus/InfluxDB queries with proper aggregation
|
||||
- **Alerting Integration:** Visual alert state indicators on relevant panels
|
||||
- **Interactive Elements:** Template variables, drill-down links, and annotation overlays
|
||||
- **Performance:** Sub-second render times through query optimization
|
||||
|
||||
### Alert Design and Optimization
|
||||
|
||||
#### Alert Classification
|
||||
- **Severity Levels:**
|
||||
- **Critical:** Service down, SLO burn rate high
|
||||
- **Warning:** Approaching thresholds, non-user-facing issues
|
||||
- **Info:** Deployment notifications, capacity planning alerts
|
||||
- **Actionability:** Every alert must have a clear response action
|
||||
- **Alert Routing:** Escalation policies based on severity and team ownership
|
||||
|
||||
#### Alert Fatigue Prevention
|
||||
- **Signal vs Noise:** High precision (few false positives) over high recall
|
||||
- **Hysteresis:** Different thresholds for firing and resolving alerts
|
||||
- **Suppression:** Dependent alert suppression during known outages
|
||||
- **Grouping:** Related alerts grouped into single notifications
|
||||
|
||||
#### Alert Rule Design
|
||||
- **Threshold Selection:** Statistical methods for threshold determination
|
||||
- **Window Functions:** Appropriate averaging windows and percentile calculations
|
||||
- **Alert Lifecycle:** Clear firing conditions and automatic resolution criteria
|
||||
- **Testing:** Alert rule validation against historical data
|
||||
|
||||
### Runbook Generation and Incident Response
|
||||
|
||||
#### Runbook Structure
|
||||
- **Alert Context:** What the alert means and why it fired
|
||||
- **Impact Assessment:** User-facing vs internal impact evaluation
|
||||
- **Investigation Steps:** Ordered troubleshooting procedures with time estimates
|
||||
- **Resolution Actions:** Common fixes and escalation procedures
|
||||
- **Post-Incident:** Follow-up tasks and prevention measures
|
||||
|
||||
#### Incident Detection Patterns
|
||||
- **Anomaly Detection:** Statistical methods for detecting unusual patterns
|
||||
- **Composite Alerts:** Multi-signal alerts for complex failure modes
|
||||
- **Predictive Alerts:** Capacity and trend-based forward-looking alerts
|
||||
- **Canary Monitoring:** Early detection through progressive deployment monitoring
|
||||
|
||||
### Golden Signals Framework
|
||||
|
||||
#### Latency Monitoring
|
||||
- **Request Latency:** P50, P95, P99 response time tracking
|
||||
- **Queue Latency:** Time spent waiting in processing queues
|
||||
- **Network Latency:** Inter-service communication delays
|
||||
- **Database Latency:** Query execution and connection pool metrics
|
||||
|
||||
#### Traffic Monitoring
|
||||
- **Request Rate:** Requests per second with burst detection
|
||||
- **Bandwidth Usage:** Network throughput and capacity utilization
|
||||
- **User Sessions:** Active user tracking and session duration
|
||||
- **Feature Usage:** API endpoint and feature adoption metrics
|
||||
|
||||
#### Error Monitoring
|
||||
- **Error Rate:** 4xx and 5xx HTTP response code tracking
|
||||
- **Error Budget:** SLO-based error rate targets and consumption
|
||||
- **Error Distribution:** Error type classification and trending
|
||||
- **Silent Failures:** Detection of processing failures without HTTP errors
|
||||
|
||||
#### Saturation Monitoring
|
||||
- **Resource Utilization:** CPU, memory, disk, and network usage
|
||||
- **Queue Depth:** Processing queue length and wait times
|
||||
- **Connection Pools:** Database and service connection saturation
|
||||
- **Rate Limiting:** API throttling and quota exhaustion tracking
|
||||
|
||||
### Distributed Tracing Strategies
|
||||
|
||||
#### Trace Architecture
|
||||
- **Sampling Strategy:** Head-based, tail-based, and adaptive sampling
|
||||
- **Trace Propagation:** Context propagation across service boundaries
|
||||
- **Span Correlation:** Parent-child relationship modeling
|
||||
- **Trace Storage:** Retention policies and storage optimization
|
||||
|
||||
#### Service Instrumentation
|
||||
- **Auto-Instrumentation:** Framework-based automatic trace generation
|
||||
- **Manual Instrumentation:** Custom span creation for business logic
|
||||
- **Baggage Handling:** Cross-cutting concern propagation
|
||||
- **Performance Impact:** Instrumentation overhead measurement and optimization
|
||||
|
||||
### Log Aggregation Patterns
|
||||
|
||||
#### Collection Architecture
|
||||
- **Agent Deployment:** Log shipping agent strategies (push vs pull)
|
||||
- **Log Routing:** Topic-based routing and filtering
|
||||
- **Parsing Strategies:** Structured vs unstructured log handling
|
||||
- **Schema Evolution:** Log format versioning and migration
|
||||
|
||||
#### Storage and Indexing
|
||||
- **Index Design:** Optimized field indexing for common query patterns
|
||||
- **Retention Policies:** Time and volume-based log retention
|
||||
- **Compression:** Log data compression and archival strategies
|
||||
- **Search Performance:** Query optimization and result caching
|
||||
|
||||
### Cost Optimization for Observability
|
||||
|
||||
#### Data Management
|
||||
- **Metric Retention:** Tiered retention based on metric importance
|
||||
- **Log Sampling:** Intelligent sampling to reduce ingestion costs
|
||||
- **Trace Sampling:** Cost-effective trace collection strategies
|
||||
- **Data Archival:** Cold storage for historical observability data
|
||||
|
||||
#### Resource Optimization
|
||||
- **Query Efficiency:** Optimized metric and log queries
|
||||
- **Storage Costs:** Appropriate storage tiers for different data types
|
||||
- **Ingestion Rate Limiting:** Controlled data ingestion to manage costs
|
||||
- **Cardinality Management:** High-cardinality metric detection and mitigation
|
||||
|
||||
## Scripts Overview
|
||||
|
||||
This skill includes three powerful Python scripts for comprehensive observability design:
|
||||
|
||||
### 1. SLO Designer (`slo_designer.py`)
|
||||
Generates complete SLI/SLO frameworks based on service characteristics:
|
||||
- **Input:** Service description JSON (type, criticality, dependencies)
|
||||
- **Output:** SLI definitions, SLO targets, error budgets, burn rate alerts, SLA recommendations
|
||||
- **Features:** Multi-window burn rate calculations, error budget policies, alert rule generation
|
||||
|
||||
### 2. Alert Optimizer (`alert_optimizer.py`)
|
||||
Analyzes and optimizes existing alert configurations:
|
||||
- **Input:** Alert configuration JSON with rules, thresholds, and routing
|
||||
- **Output:** Optimization report and improved alert configuration
|
||||
- **Features:** Noise detection, coverage gaps, duplicate identification, threshold optimization
|
||||
|
||||
### 3. Dashboard Generator (`dashboard_generator.py`)
|
||||
Creates comprehensive dashboard specifications:
|
||||
- **Input:** Service/system description JSON
|
||||
- **Output:** Grafana-compatible dashboard JSON and documentation
|
||||
- **Features:** Golden signals coverage, RED/USE methods, drill-down paths, role-based views
|
||||
|
||||
## Integration Patterns
|
||||
|
||||
### Monitoring Stack Integration
|
||||
- **Prometheus:** Metric collection and alerting rule generation
|
||||
- **Grafana:** Dashboard creation and visualization configuration
|
||||
- **Elasticsearch/Kibana:** Log analysis and dashboard integration
|
||||
- **Jaeger/Zipkin:** Distributed tracing configuration and analysis
|
||||
|
||||
### CI/CD Integration
|
||||
- **Pipeline Monitoring:** Build, test, and deployment observability
|
||||
- **Deployment Correlation:** Release impact tracking and rollback triggers
|
||||
- **Feature Flag Monitoring:** A/B test and feature rollout observability
|
||||
- **Performance Regression:** Automated performance monitoring in pipelines
|
||||
|
||||
### Incident Management Integration
|
||||
- **PagerDuty/VictorOps:** Alert routing and escalation policies
|
||||
- **Slack/Teams:** Notification and collaboration integration
|
||||
- **JIRA/ServiceNow:** Incident tracking and resolution workflows
|
||||
- **Post-Mortem:** Automated incident analysis and improvement tracking
|
||||
|
||||
## Advanced Patterns
|
||||
|
||||
### Multi-Cloud Observability
|
||||
- **Cross-Cloud Metrics:** Unified metrics across AWS, GCP, Azure
|
||||
- **Network Observability:** Inter-cloud connectivity monitoring
|
||||
- **Cost Attribution:** Cloud resource cost tracking and optimization
|
||||
- **Compliance Monitoring:** Security and compliance posture tracking
|
||||
|
||||
### Microservices Observability
|
||||
- **Service Mesh Integration:** Istio/Linkerd observability configuration
|
||||
- **API Gateway Monitoring:** Request routing and rate limiting observability
|
||||
- **Container Orchestration:** Kubernetes cluster and workload monitoring
|
||||
- **Service Discovery:** Dynamic service monitoring and health checks
|
||||
|
||||
### Machine Learning Observability
|
||||
- **Model Performance:** Accuracy, drift, and bias monitoring
|
||||
- **Feature Store Monitoring:** Feature quality and freshness tracking
|
||||
- **Pipeline Observability:** ML pipeline execution and performance monitoring
|
||||
- **A/B Test Analysis:** Statistical significance and business impact measurement
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Organizational Alignment
|
||||
- **SLO Setting:** Collaborative target setting between product and engineering
|
||||
- **Alert Ownership:** Clear escalation paths and team responsibilities
|
||||
- **Dashboard Governance:** Centralized dashboard management and standards
|
||||
- **Training Programs:** Team education on observability tools and practices
|
||||
|
||||
### Technical Excellence
|
||||
- **Infrastructure as Code:** Observability configuration version control
|
||||
- **Testing Strategy:** Alert rule testing and dashboard validation
|
||||
- **Performance Monitoring:** Observability system performance tracking
|
||||
- **Security Considerations:** Access control and data privacy in observability
|
||||
|
||||
### Continuous Improvement
|
||||
- **Metrics Review:** Regular SLI/SLO effectiveness assessment
|
||||
- **Alert Tuning:** Ongoing alert threshold and routing optimization
|
||||
- **Dashboard Evolution:** User feedback-driven dashboard improvements
|
||||
- **Tool Evaluation:** Regular assessment of observability tool effectiveness
|
||||
|
||||
## Success Metrics
|
||||
|
||||
### Operational Metrics
|
||||
- **Mean Time to Detection (MTTD):** How quickly issues are identified
|
||||
- **Mean Time to Resolution (MTTR):** Time from detection to resolution
|
||||
- **Alert Precision:** Percentage of actionable alerts
|
||||
- **SLO Achievement:** Percentage of SLO targets met consistently
|
||||
|
||||
### Business Metrics
|
||||
- **System Reliability:** Overall uptime and user experience quality
|
||||
- **Engineering Velocity:** Development team productivity and deployment frequency
|
||||
- **Cost Efficiency:** Observability cost as percentage of infrastructure spend
|
||||
- **Customer Satisfaction:** User-reported reliability and performance satisfaction
|
||||
|
||||
This comprehensive observability design skill enables organizations to build robust, scalable monitoring and alerting systems that provide actionable insights while maintaining cost efficiency and operational excellence.
|
||||
@@ -0,0 +1,276 @@
|
||||
{
|
||||
"alerts": [
|
||||
{
|
||||
"alert": "HighLatency",
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m])) > 0.5",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High request latency detected",
|
||||
"description": "95th percentile latency is {{ $value }}s for payment-service",
|
||||
"runbook_url": "https://runbooks.company.com/high-latency"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 2.5,
|
||||
"false_positive_rate": 0.15,
|
||||
"average_duration_minutes": 12
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "ServiceDown",
|
||||
"expr": "up{service=\"payment-service\"} == 0",
|
||||
"labels": {
|
||||
"severity": "critical",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Payment service is down",
|
||||
"description": "Payment service has been down for more than 1 minute",
|
||||
"runbook_url": "https://runbooks.company.com/service-down"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 0.1,
|
||||
"false_positive_rate": 0.05,
|
||||
"average_duration_minutes": 3
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "HighErrorRate",
|
||||
"expr": "sum(rate(http_requests_total{service=\"payment-service\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"payment-service\"}[5m])) > 0.01",
|
||||
"for": "2m",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High error rate detected",
|
||||
"description": "Error rate is {{ $value | humanizePercentage }} for payment-service",
|
||||
"runbook_url": "https://runbooks.company.com/high-error-rate"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 1.8,
|
||||
"false_positive_rate": 0.25,
|
||||
"average_duration_minutes": 8
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "HighCPUUsage",
|
||||
"expr": "rate(process_cpu_seconds_total{service=\"payment-service\"}[5m]) * 100 > 80",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High CPU usage",
|
||||
"description": "CPU usage is {{ $value }}% for payment-service"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 15.2,
|
||||
"false_positive_rate": 0.8,
|
||||
"average_duration_minutes": 45
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "HighMemoryUsage",
|
||||
"expr": "process_resident_memory_bytes{service=\"payment-service\"} / process_virtual_memory_max_bytes{service=\"payment-service\"} * 100 > 85",
|
||||
"labels": {
|
||||
"severity": "info",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High memory usage",
|
||||
"description": "Memory usage is {{ $value }}% for payment-service"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 8.5,
|
||||
"false_positive_rate": 0.6,
|
||||
"average_duration_minutes": 30
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "DatabaseConnectionPoolExhaustion",
|
||||
"expr": "db_connections_active{service=\"payment-service\"} / db_connections_max{service=\"payment-service\"} > 0.9",
|
||||
"for": "1m",
|
||||
"labels": {
|
||||
"severity": "critical",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Database connection pool near exhaustion",
|
||||
"description": "Connection pool utilization is {{ $value | humanizePercentage }}",
|
||||
"runbook_url": "https://runbooks.company.com/db-connections"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 0.3,
|
||||
"false_positive_rate": 0.1,
|
||||
"average_duration_minutes": 5
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "LowTraffic",
|
||||
"expr": "sum(rate(http_requests_total{service=\"payment-service\"}[5m])) < 10",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Unusually low traffic",
|
||||
"description": "Request rate is {{ $value }} RPS, which is unusually low"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 12.0,
|
||||
"false_positive_rate": 0.9,
|
||||
"average_duration_minutes": 120
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "HighLatencyDuplicate",
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m])) > 0.5",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High request latency detected (duplicate)",
|
||||
"description": "95th percentile latency is {{ $value }}s for payment-service"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 2.5,
|
||||
"false_positive_rate": 0.15,
|
||||
"average_duration_minutes": 12
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "VeryLowErrorRate",
|
||||
"expr": "sum(rate(http_requests_total{service=\"payment-service\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"payment-service\"}[5m])) > 0.001",
|
||||
"labels": {
|
||||
"severity": "info",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Error rate above 0.1%",
|
||||
"description": "Error rate is {{ $value | humanizePercentage }}"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 25.0,
|
||||
"false_positive_rate": 0.95,
|
||||
"average_duration_minutes": 5
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "DiskUsageHigh",
|
||||
"expr": "disk_usage_percent{service=\"payment-service\"} > 85",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Disk usage high",
|
||||
"description": "Disk usage is {{ $value }}%"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 3.2,
|
||||
"false_positive_rate": 0.4,
|
||||
"average_duration_minutes": 240
|
||||
}
|
||||
}
|
||||
],
|
||||
"services": [
|
||||
{
|
||||
"name": "payment-service",
|
||||
"type": "api",
|
||||
"criticality": "critical",
|
||||
"team": "payments"
|
||||
},
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high",
|
||||
"team": "identity"
|
||||
},
|
||||
{
|
||||
"name": "notification-service",
|
||||
"type": "api",
|
||||
"criticality": "medium",
|
||||
"team": "communications"
|
||||
}
|
||||
],
|
||||
"alert_routing": {
|
||||
"routes": [
|
||||
{
|
||||
"match": {
|
||||
"severity": "critical"
|
||||
},
|
||||
"receiver": "pager-critical",
|
||||
"group_wait": "10s",
|
||||
"group_interval": "1m",
|
||||
"repeat_interval": "5m"
|
||||
},
|
||||
{
|
||||
"match": {
|
||||
"severity": "warning"
|
||||
},
|
||||
"receiver": "slack-warnings",
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "1h"
|
||||
},
|
||||
{
|
||||
"match": {
|
||||
"severity": "info"
|
||||
},
|
||||
"receiver": "email-info",
|
||||
"group_wait": "2m",
|
||||
"group_interval": "10m",
|
||||
"repeat_interval": "24h"
|
||||
}
|
||||
]
|
||||
},
|
||||
"receivers": [
|
||||
{
|
||||
"name": "pager-critical",
|
||||
"pagerduty_configs": [
|
||||
{
|
||||
"routing_key": "pager-key-critical",
|
||||
"description": "Critical alert: {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "slack-warnings",
|
||||
"slack_configs": [
|
||||
{
|
||||
"api_url": "https://hooks.slack.com/services/warnings",
|
||||
"channel": "#alerts-warnings",
|
||||
"title": "Warning Alert",
|
||||
"text": "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "email-info",
|
||||
"email_configs": [
|
||||
{
|
||||
"to": "team-notifications@company.com",
|
||||
"subject": "Info Alert: {{ .GroupLabels.alertname }}",
|
||||
"body": "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
{
|
||||
"name": "payment-service",
|
||||
"type": "api",
|
||||
"criticality": "critical",
|
||||
"user_facing": true,
|
||||
"description": "Handles payment processing and transaction management",
|
||||
"team": "payments",
|
||||
"environment": "production",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "payment-gateway",
|
||||
"type": "external",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "fraud-detection",
|
||||
"type": "ml",
|
||||
"criticality": "high"
|
||||
}
|
||||
],
|
||||
"endpoints": [
|
||||
{
|
||||
"path": "/api/v1/payments",
|
||||
"method": "POST",
|
||||
"sla_latency_ms": 500,
|
||||
"expected_tps": 100
|
||||
},
|
||||
{
|
||||
"path": "/api/v1/payments/{id}",
|
||||
"method": "GET",
|
||||
"sla_latency_ms": 200,
|
||||
"expected_tps": 500
|
||||
},
|
||||
{
|
||||
"path": "/api/v1/payments/{id}/refund",
|
||||
"method": "POST",
|
||||
"sla_latency_ms": 1000,
|
||||
"expected_tps": 10
|
||||
}
|
||||
],
|
||||
"business_metrics": {
|
||||
"revenue_per_hour": {
|
||||
"metric": "sum(payment_amount * rate(payments_successful_total[1h]))",
|
||||
"target": 50000,
|
||||
"unit": "USD"
|
||||
},
|
||||
"conversion_rate": {
|
||||
"metric": "sum(rate(payments_successful_total[5m])) / sum(rate(payment_attempts_total[5m]))",
|
||||
"target": 0.95,
|
||||
"unit": "percentage"
|
||||
}
|
||||
},
|
||||
"infrastructure": {
|
||||
"container_orchestrator": "kubernetes",
|
||||
"replicas": 6,
|
||||
"cpu_limit": "2000m",
|
||||
"memory_limit": "4Gi",
|
||||
"database": {
|
||||
"type": "postgresql",
|
||||
"connection_pool_size": 20
|
||||
},
|
||||
"cache": {
|
||||
"type": "redis",
|
||||
"cluster_size": 3
|
||||
}
|
||||
},
|
||||
"compliance_requirements": [
|
||||
"PCI-DSS",
|
||||
"SOX",
|
||||
"GDPR"
|
||||
],
|
||||
"tags": [
|
||||
"payment",
|
||||
"transaction",
|
||||
"critical-path",
|
||||
"revenue-generating"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,113 @@
|
||||
{
|
||||
"name": "customer-portal",
|
||||
"type": "web",
|
||||
"criticality": "high",
|
||||
"user_facing": true,
|
||||
"description": "Customer-facing web application for account management and billing",
|
||||
"team": "frontend",
|
||||
"environment": "production",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "billing-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "notification-service",
|
||||
"type": "api",
|
||||
"criticality": "medium"
|
||||
},
|
||||
{
|
||||
"name": "cdn",
|
||||
"type": "external",
|
||||
"criticality": "medium"
|
||||
}
|
||||
],
|
||||
"pages": [
|
||||
{
|
||||
"path": "/dashboard",
|
||||
"sla_load_time_ms": 2000,
|
||||
"expected_concurrent_users": 1000
|
||||
},
|
||||
{
|
||||
"path": "/billing",
|
||||
"sla_load_time_ms": 3000,
|
||||
"expected_concurrent_users": 200
|
||||
},
|
||||
{
|
||||
"path": "/settings",
|
||||
"sla_load_time_ms": 1500,
|
||||
"expected_concurrent_users": 100
|
||||
}
|
||||
],
|
||||
"business_metrics": {
|
||||
"daily_active_users": {
|
||||
"metric": "count(user_sessions_started_total[1d])",
|
||||
"target": 10000,
|
||||
"unit": "users"
|
||||
},
|
||||
"session_duration": {
|
||||
"metric": "avg(user_session_duration_seconds)",
|
||||
"target": 300,
|
||||
"unit": "seconds"
|
||||
},
|
||||
"bounce_rate": {
|
||||
"metric": "sum(rate(page_views_bounced_total[1h])) / sum(rate(page_views_total[1h]))",
|
||||
"target": 0.3,
|
||||
"unit": "percentage"
|
||||
}
|
||||
},
|
||||
"infrastructure": {
|
||||
"container_orchestrator": "kubernetes",
|
||||
"replicas": 4,
|
||||
"cpu_limit": "1000m",
|
||||
"memory_limit": "2Gi",
|
||||
"storage": {
|
||||
"type": "nfs",
|
||||
"size": "50Gi"
|
||||
},
|
||||
"ingress": {
|
||||
"type": "nginx",
|
||||
"ssl_termination": true,
|
||||
"rate_limiting": {
|
||||
"requests_per_second": 100,
|
||||
"burst": 200
|
||||
}
|
||||
}
|
||||
},
|
||||
"monitoring": {
|
||||
"synthetic_checks": [
|
||||
{
|
||||
"name": "login_flow",
|
||||
"url": "/auth/login",
|
||||
"frequency": "1m",
|
||||
"locations": ["us-east", "eu-west", "ap-south"]
|
||||
},
|
||||
{
|
||||
"name": "checkout_flow",
|
||||
"url": "/billing/checkout",
|
||||
"frequency": "5m",
|
||||
"locations": ["us-east", "eu-west"]
|
||||
}
|
||||
],
|
||||
"rum": {
|
||||
"enabled": true,
|
||||
"sampling_rate": 0.1
|
||||
}
|
||||
},
|
||||
"compliance_requirements": [
|
||||
"GDPR",
|
||||
"CCPA"
|
||||
],
|
||||
"tags": [
|
||||
"frontend",
|
||||
"customer-facing",
|
||||
"billing",
|
||||
"high-traffic"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,811 @@
|
||||
{
|
||||
"metadata": {
|
||||
"title": "customer-portal - SRE Dashboard",
|
||||
"service": {
|
||||
"name": "customer-portal",
|
||||
"type": "web",
|
||||
"criticality": "high",
|
||||
"user_facing": true,
|
||||
"description": "Customer-facing web application for account management and billing",
|
||||
"team": "frontend",
|
||||
"environment": "production",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "billing-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "notification-service",
|
||||
"type": "api",
|
||||
"criticality": "medium"
|
||||
},
|
||||
{
|
||||
"name": "cdn",
|
||||
"type": "external",
|
||||
"criticality": "medium"
|
||||
}
|
||||
],
|
||||
"pages": [
|
||||
{
|
||||
"path": "/dashboard",
|
||||
"sla_load_time_ms": 2000,
|
||||
"expected_concurrent_users": 1000
|
||||
},
|
||||
{
|
||||
"path": "/billing",
|
||||
"sla_load_time_ms": 3000,
|
||||
"expected_concurrent_users": 200
|
||||
},
|
||||
{
|
||||
"path": "/settings",
|
||||
"sla_load_time_ms": 1500,
|
||||
"expected_concurrent_users": 100
|
||||
}
|
||||
],
|
||||
"business_metrics": {
|
||||
"daily_active_users": {
|
||||
"metric": "count(user_sessions_started_total[1d])",
|
||||
"target": 10000,
|
||||
"unit": "users"
|
||||
},
|
||||
"session_duration": {
|
||||
"metric": "avg(user_session_duration_seconds)",
|
||||
"target": 300,
|
||||
"unit": "seconds"
|
||||
},
|
||||
"bounce_rate": {
|
||||
"metric": "sum(rate(page_views_bounced_total[1h])) / sum(rate(page_views_total[1h]))",
|
||||
"target": 0.3,
|
||||
"unit": "percentage"
|
||||
}
|
||||
},
|
||||
"infrastructure": {
|
||||
"container_orchestrator": "kubernetes",
|
||||
"replicas": 4,
|
||||
"cpu_limit": "1000m",
|
||||
"memory_limit": "2Gi",
|
||||
"storage": {
|
||||
"type": "nfs",
|
||||
"size": "50Gi"
|
||||
},
|
||||
"ingress": {
|
||||
"type": "nginx",
|
||||
"ssl_termination": true,
|
||||
"rate_limiting": {
|
||||
"requests_per_second": 100,
|
||||
"burst": 200
|
||||
}
|
||||
}
|
||||
},
|
||||
"monitoring": {
|
||||
"synthetic_checks": [
|
||||
{
|
||||
"name": "login_flow",
|
||||
"url": "/auth/login",
|
||||
"frequency": "1m",
|
||||
"locations": [
|
||||
"us-east",
|
||||
"eu-west",
|
||||
"ap-south"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "checkout_flow",
|
||||
"url": "/billing/checkout",
|
||||
"frequency": "5m",
|
||||
"locations": [
|
||||
"us-east",
|
||||
"eu-west"
|
||||
]
|
||||
}
|
||||
],
|
||||
"rum": {
|
||||
"enabled": true,
|
||||
"sampling_rate": 0.1
|
||||
}
|
||||
},
|
||||
"compliance_requirements": [
|
||||
"GDPR",
|
||||
"CCPA"
|
||||
],
|
||||
"tags": [
|
||||
"frontend",
|
||||
"customer-facing",
|
||||
"billing",
|
||||
"high-traffic"
|
||||
]
|
||||
},
|
||||
"target_role": "sre",
|
||||
"generated_at": "2026-02-16T14:02:03.421248Z",
|
||||
"version": "1.0"
|
||||
},
|
||||
"configuration": {
|
||||
"time_ranges": [
|
||||
"1h",
|
||||
"6h",
|
||||
"1d",
|
||||
"7d"
|
||||
],
|
||||
"default_time_range": "6h",
|
||||
"refresh_interval": "30s",
|
||||
"timezone": "UTC",
|
||||
"theme": "dark"
|
||||
},
|
||||
"layout": {
|
||||
"grid_settings": {
|
||||
"width": 24,
|
||||
"height_unit": "px",
|
||||
"cell_height": 30
|
||||
},
|
||||
"sections": [
|
||||
{
|
||||
"title": "Service Overview",
|
||||
"collapsed": false,
|
||||
"y_position": 0,
|
||||
"panels": [
|
||||
"service_status",
|
||||
"slo_summary",
|
||||
"error_budget"
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Golden Signals",
|
||||
"collapsed": false,
|
||||
"y_position": 8,
|
||||
"panels": [
|
||||
"latency",
|
||||
"traffic",
|
||||
"errors",
|
||||
"saturation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Resource Utilization",
|
||||
"collapsed": false,
|
||||
"y_position": 16,
|
||||
"panels": [
|
||||
"cpu_usage",
|
||||
"memory_usage",
|
||||
"network_io",
|
||||
"disk_io"
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Dependencies & Downstream",
|
||||
"collapsed": true,
|
||||
"y_position": 24,
|
||||
"panels": [
|
||||
"dependency_status",
|
||||
"downstream_latency",
|
||||
"circuit_breakers"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": "service_status",
|
||||
"title": "Service Status",
|
||||
"type": "stat",
|
||||
"grid_pos": {
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{service=\"customer-portal\"}",
|
||||
"legendFormat": "Status"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Status"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "thresholds"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
},
|
||||
{
|
||||
"options": {
|
||||
"1": {
|
||||
"text": "UP"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"textMode": "value_and_name"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "slo_summary",
|
||||
"title": "SLO Achievement (30d)",
|
||||
"type": "stat",
|
||||
"grid_pos": {
|
||||
"x": 6,
|
||||
"y": 0,
|
||||
"w": 9,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (increase(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[30d]) / increase(http_requests_total{service=\"customer-portal\"}[30d]))) * 100",
|
||||
"legendFormat": "Availability"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, increase(http_request_duration_seconds_bucket{service=\"customer-portal\"}[30d])) * 1000",
|
||||
"legendFormat": "P95 Latency (ms)"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 99.0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 99.9
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"textMode": "value_and_name"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "error_budget",
|
||||
"title": "Error Budget Remaining",
|
||||
"type": "gauge",
|
||||
"grid_pos": {
|
||||
"x": 15,
|
||||
"y": 0,
|
||||
"w": 9,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (increase(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[30d]) / increase(http_requests_total{service=\"customer-portal\"}[30d])) - 0.999) / 0.001 * 100",
|
||||
"legendFormat": "Error Budget %"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 25
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 50
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "latency",
|
||||
"title": "Request Latency",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 0,
|
||||
"y": 8,
|
||||
"w": 12,
|
||||
"h": 6
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000",
|
||||
"legendFormat": "P50 Latency"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000",
|
||||
"legendFormat": "P95 Latency"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000",
|
||||
"legendFormat": "P99 Latency"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "traffic",
|
||||
"title": "Request Rate",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 12,
|
||||
"y": 8,
|
||||
"w": 12,
|
||||
"h": 6
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\"}[5m]))",
|
||||
"legendFormat": "Total RPS"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"2..\"}[5m]))",
|
||||
"legendFormat": "2xx RPS"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"4..\"}[5m]))",
|
||||
"legendFormat": "4xx RPS"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[5m]))",
|
||||
"legendFormat": "5xx RPS"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "errors",
|
||||
"title": "Error Rate",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 0,
|
||||
"y": 14,
|
||||
"w": 12,
|
||||
"h": 6
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"customer-portal\"}[5m])) * 100",
|
||||
"legendFormat": "5xx Error Rate"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"4..\"}[5m])) / sum(rate(http_requests_total{service=\"customer-portal\"}[5m])) * 100",
|
||||
"legendFormat": "4xx Error Rate"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 20
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "5xx Error Rate"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "saturation",
|
||||
"title": "Saturation Metrics",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 12,
|
||||
"y": 14,
|
||||
"w": 12,
|
||||
"h": 6
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_cpu_seconds_total{service=\"customer-portal\"}[5m]) * 100",
|
||||
"legendFormat": "CPU Usage %"
|
||||
},
|
||||
{
|
||||
"expr": "process_resident_memory_bytes{service=\"customer-portal\"} / process_virtual_memory_max_bytes{service=\"customer-portal\"} * 100",
|
||||
"legendFormat": "Memory Usage %"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "cpu_usage",
|
||||
"title": "CPU Usage",
|
||||
"type": "gauge",
|
||||
"grid_pos": {
|
||||
"x": 0,
|
||||
"y": 20,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_cpu_seconds_total{service=\"customer-portal\"}[5m]) * 100",
|
||||
"legendFormat": "CPU %"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "memory_usage",
|
||||
"title": "Memory Usage",
|
||||
"type": "gauge",
|
||||
"grid_pos": {
|
||||
"x": 6,
|
||||
"y": 20,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_resident_memory_bytes{service=\"customer-portal\"} / 1024 / 1024",
|
||||
"legendFormat": "Memory MB"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"unit": "decbytes",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 512000000
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 1024000000
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "network_io",
|
||||
"title": "Network I/O",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 12,
|
||||
"y": 20,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_network_receive_bytes_total{service=\"customer-portal\"}[5m])",
|
||||
"legendFormat": "RX Bytes/s"
|
||||
},
|
||||
{
|
||||
"expr": "rate(process_network_transmit_bytes_total{service=\"customer-portal\"}[5m])",
|
||||
"legendFormat": "TX Bytes/s"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "binBps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "disk_io",
|
||||
"title": "Disk I/O",
|
||||
"type": "timeseries",
|
||||
"grid_pos": {
|
||||
"x": 18,
|
||||
"y": 20,
|
||||
"w": 6,
|
||||
"h": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_disk_read_bytes_total{service=\"customer-portal\"}[5m])",
|
||||
"legendFormat": "Read Bytes/s"
|
||||
},
|
||||
{
|
||||
"expr": "rate(process_disk_write_bytes_total{service=\"customer-portal\"}[5m])",
|
||||
"legendFormat": "Write Bytes/s"
|
||||
}
|
||||
],
|
||||
"field_config": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "binBps"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"variables": [
|
||||
{
|
||||
"name": "environment",
|
||||
"type": "query",
|
||||
"query": "label_values(environment)",
|
||||
"current": {
|
||||
"text": "production",
|
||||
"value": "production"
|
||||
},
|
||||
"includeAll": false,
|
||||
"multi": false,
|
||||
"refresh": "on_dashboard_load"
|
||||
},
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"query": "label_values(up{service=\"customer-portal\"}, instance)",
|
||||
"current": {
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"refresh": "on_time_range_change"
|
||||
},
|
||||
{
|
||||
"name": "handler",
|
||||
"type": "query",
|
||||
"query": "label_values(http_requests_total{service=\"customer-portal\"}, handler)",
|
||||
"current": {
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"refresh": "on_time_range_change"
|
||||
}
|
||||
],
|
||||
"alerts_integration": {
|
||||
"alert_annotations": true,
|
||||
"alert_rules_query": "ALERTS{service=\"customer-portal\"}",
|
||||
"alert_panels": [
|
||||
{
|
||||
"title": "Active Alerts",
|
||||
"type": "table",
|
||||
"query": "ALERTS{service=\"customer-portal\",alertstate=\"firing\"}",
|
||||
"columns": [
|
||||
"alertname",
|
||||
"severity",
|
||||
"instance",
|
||||
"description"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"drill_down_paths": {
|
||||
"service_overview": {
|
||||
"from": "service_status",
|
||||
"to": "detailed_health_dashboard",
|
||||
"url": "/d/service-health/customer-portal-health",
|
||||
"params": [
|
||||
"var-service",
|
||||
"var-environment"
|
||||
]
|
||||
},
|
||||
"error_investigation": {
|
||||
"from": "errors",
|
||||
"to": "error_details_dashboard",
|
||||
"url": "/d/errors/customer-portal-errors",
|
||||
"params": [
|
||||
"var-service",
|
||||
"var-time_range"
|
||||
]
|
||||
},
|
||||
"latency_analysis": {
|
||||
"from": "latency",
|
||||
"to": "trace_analysis_dashboard",
|
||||
"url": "/d/traces/customer-portal-traces",
|
||||
"params": [
|
||||
"var-service",
|
||||
"var-handler"
|
||||
]
|
||||
},
|
||||
"capacity_planning": {
|
||||
"from": "saturation",
|
||||
"to": "capacity_dashboard",
|
||||
"url": "/d/capacity/customer-portal-capacity",
|
||||
"params": [
|
||||
"var-service",
|
||||
"var-time_range"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,545 @@
|
||||
{
|
||||
"metadata": {
|
||||
"service": {
|
||||
"name": "payment-service",
|
||||
"type": "api",
|
||||
"criticality": "critical",
|
||||
"user_facing": true,
|
||||
"description": "Handles payment processing and transaction management",
|
||||
"team": "payments",
|
||||
"environment": "production",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "payment-gateway",
|
||||
"type": "external",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "fraud-detection",
|
||||
"type": "ml",
|
||||
"criticality": "high"
|
||||
}
|
||||
],
|
||||
"endpoints": [
|
||||
{
|
||||
"path": "/api/v1/payments",
|
||||
"method": "POST",
|
||||
"sla_latency_ms": 500,
|
||||
"expected_tps": 100
|
||||
},
|
||||
{
|
||||
"path": "/api/v1/payments/{id}",
|
||||
"method": "GET",
|
||||
"sla_latency_ms": 200,
|
||||
"expected_tps": 500
|
||||
},
|
||||
{
|
||||
"path": "/api/v1/payments/{id}/refund",
|
||||
"method": "POST",
|
||||
"sla_latency_ms": 1000,
|
||||
"expected_tps": 10
|
||||
}
|
||||
],
|
||||
"business_metrics": {
|
||||
"revenue_per_hour": {
|
||||
"metric": "sum(payment_amount * rate(payments_successful_total[1h]))",
|
||||
"target": 50000,
|
||||
"unit": "USD"
|
||||
},
|
||||
"conversion_rate": {
|
||||
"metric": "sum(rate(payments_successful_total[5m])) / sum(rate(payment_attempts_total[5m]))",
|
||||
"target": 0.95,
|
||||
"unit": "percentage"
|
||||
}
|
||||
},
|
||||
"infrastructure": {
|
||||
"container_orchestrator": "kubernetes",
|
||||
"replicas": 6,
|
||||
"cpu_limit": "2000m",
|
||||
"memory_limit": "4Gi",
|
||||
"database": {
|
||||
"type": "postgresql",
|
||||
"connection_pool_size": 20
|
||||
},
|
||||
"cache": {
|
||||
"type": "redis",
|
||||
"cluster_size": 3
|
||||
}
|
||||
},
|
||||
"compliance_requirements": [
|
||||
"PCI-DSS",
|
||||
"SOX",
|
||||
"GDPR"
|
||||
],
|
||||
"tags": [
|
||||
"payment",
|
||||
"transaction",
|
||||
"critical-path",
|
||||
"revenue-generating"
|
||||
]
|
||||
},
|
||||
"generated_at": "2026-02-16T14:01:57.572080Z",
|
||||
"framework_version": "1.0"
|
||||
},
|
||||
"slis": [
|
||||
{
|
||||
"name": "Availability",
|
||||
"description": "Percentage of successful requests",
|
||||
"type": "ratio",
|
||||
"good_events": "sum(rate(http_requests_total{service=\"payment-service\",code!~\"5..\"}))",
|
||||
"total_events": "sum(rate(http_requests_total{service=\"payment-service\"}))",
|
||||
"unit": "percentage"
|
||||
},
|
||||
{
|
||||
"name": "Request Latency P95",
|
||||
"description": "95th percentile of request latency",
|
||||
"type": "threshold",
|
||||
"query": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m]))",
|
||||
"unit": "seconds"
|
||||
},
|
||||
{
|
||||
"name": "Error Rate",
|
||||
"description": "Rate of 5xx errors",
|
||||
"type": "ratio",
|
||||
"good_events": "sum(rate(http_requests_total{service=\"payment-service\",code!~\"5..\"}))",
|
||||
"total_events": "sum(rate(http_requests_total{service=\"payment-service\"}))",
|
||||
"unit": "percentage"
|
||||
},
|
||||
{
|
||||
"name": "Request Throughput",
|
||||
"description": "Requests per second",
|
||||
"type": "gauge",
|
||||
"query": "sum(rate(http_requests_total{service=\"payment-service\"}[5m]))",
|
||||
"unit": "requests/sec"
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate",
|
||||
"description": "Percentage of successful complete user journeys",
|
||||
"type": "ratio",
|
||||
"good_events": "sum(rate(user_journey_total{service=\"payment-service\",status=\"success\"}[5m]))",
|
||||
"total_events": "sum(rate(user_journey_total{service=\"payment-service\"}[5m]))",
|
||||
"unit": "percentage"
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability",
|
||||
"description": "Percentage of time key features are available",
|
||||
"type": "ratio",
|
||||
"good_events": "sum(rate(feature_checks_total{service=\"payment-service\",status=\"available\"}[5m]))",
|
||||
"total_events": "sum(rate(feature_checks_total{service=\"payment-service\"}[5m]))",
|
||||
"unit": "percentage"
|
||||
}
|
||||
],
|
||||
"slos": [
|
||||
{
|
||||
"name": "Availability SLO",
|
||||
"description": "Service level objective for percentage of successful requests",
|
||||
"sli_name": "Availability",
|
||||
"target_value": 0.9999,
|
||||
"target_display": "99.99%",
|
||||
"operator": ">=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "Request Latency P95 SLO",
|
||||
"description": "Service level objective for 95th percentile of request latency",
|
||||
"sli_name": "Request Latency P95",
|
||||
"target_value": 0.1,
|
||||
"target_display": "0.1s",
|
||||
"operator": "<=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "Error Rate SLO",
|
||||
"description": "Service level objective for rate of 5xx errors",
|
||||
"sli_name": "Error Rate",
|
||||
"target_value": 0.001,
|
||||
"target_display": "0.1%",
|
||||
"operator": "<=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate SLO",
|
||||
"description": "Service level objective for percentage of successful complete user journeys",
|
||||
"sli_name": "User Journey Success Rate",
|
||||
"target_value": 0.9999,
|
||||
"target_display": "99.99%",
|
||||
"operator": ">=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability SLO",
|
||||
"description": "Service level objective for percentage of time key features are available",
|
||||
"sli_name": "Feature Availability",
|
||||
"target_value": 0.9999,
|
||||
"target_display": "99.99%",
|
||||
"operator": ">=",
|
||||
"time_windows": [
|
||||
"1h",
|
||||
"1d",
|
||||
"7d",
|
||||
"30d"
|
||||
],
|
||||
"measurement_window": "30d",
|
||||
"service": "payment-service",
|
||||
"criticality": "critical"
|
||||
}
|
||||
],
|
||||
"error_budgets": [
|
||||
{
|
||||
"slo_name": "Availability SLO",
|
||||
"error_budget_rate": 9.999999999998899e-05,
|
||||
"error_budget_percentage": "0.010%",
|
||||
"budgets_by_window": {
|
||||
"1h": "0.4 seconds",
|
||||
"1d": "8.6 seconds",
|
||||
"7d": "1.0 minutes",
|
||||
"30d": "4.3 minutes"
|
||||
},
|
||||
"burn_rate_alerts": [
|
||||
{
|
||||
"name": "Availability Burn Rate 2% Alert",
|
||||
"description": "Alert when Availability is consuming error budget at 14.4x rate",
|
||||
"severity": "critical",
|
||||
"short_window": "5m",
|
||||
"long_window": "1h",
|
||||
"burn_rate_threshold": 14.4,
|
||||
"budget_consumed": "2%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Availability",
|
||||
"description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Availability Burn Rate 5% Alert",
|
||||
"description": "Alert when Availability is consuming error budget at 6x rate",
|
||||
"severity": "warning",
|
||||
"short_window": "30m",
|
||||
"long_window": "6h",
|
||||
"burn_rate_threshold": 6,
|
||||
"budget_consumed": "5%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Availability",
|
||||
"description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Availability Burn Rate 10% Alert",
|
||||
"description": "Alert when Availability is consuming error budget at 3x rate",
|
||||
"severity": "info",
|
||||
"short_window": "2h",
|
||||
"long_window": "1d",
|
||||
"burn_rate_threshold": 3,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Availability",
|
||||
"description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Availability Burn Rate 10% (Slow Burn) Alert",
|
||||
"description": "Alert when Availability is consuming error budget at 1x rate",
|
||||
"severity": "info",
|
||||
"short_window": "6h",
|
||||
"long_window": "3d",
|
||||
"burn_rate_threshold": 1,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Availability",
|
||||
"description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"slo_name": "User Journey Success Rate SLO",
|
||||
"error_budget_rate": 9.999999999998899e-05,
|
||||
"error_budget_percentage": "0.010%",
|
||||
"budgets_by_window": {
|
||||
"1h": "0.4 seconds",
|
||||
"1d": "8.6 seconds",
|
||||
"7d": "1.0 minutes",
|
||||
"30d": "4.3 minutes"
|
||||
},
|
||||
"burn_rate_alerts": [
|
||||
{
|
||||
"name": "User Journey Success Rate Burn Rate 2% Alert",
|
||||
"description": "Alert when User Journey Success Rate is consuming error budget at 14.4x rate",
|
||||
"severity": "critical",
|
||||
"short_window": "5m",
|
||||
"long_window": "1h",
|
||||
"burn_rate_threshold": 14.4,
|
||||
"budget_consumed": "2%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for User Journey Success Rate",
|
||||
"description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate Burn Rate 5% Alert",
|
||||
"description": "Alert when User Journey Success Rate is consuming error budget at 6x rate",
|
||||
"severity": "warning",
|
||||
"short_window": "30m",
|
||||
"long_window": "6h",
|
||||
"burn_rate_threshold": 6,
|
||||
"budget_consumed": "5%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for User Journey Success Rate",
|
||||
"description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate Burn Rate 10% Alert",
|
||||
"description": "Alert when User Journey Success Rate is consuming error budget at 3x rate",
|
||||
"severity": "info",
|
||||
"short_window": "2h",
|
||||
"long_window": "1d",
|
||||
"burn_rate_threshold": 3,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for User Journey Success Rate",
|
||||
"description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "User Journey Success Rate Burn Rate 10% (Slow Burn) Alert",
|
||||
"description": "Alert when User Journey Success Rate is consuming error budget at 1x rate",
|
||||
"severity": "info",
|
||||
"short_window": "6h",
|
||||
"long_window": "3d",
|
||||
"burn_rate_threshold": 1,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for User Journey Success Rate",
|
||||
"description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"slo_name": "Feature Availability SLO",
|
||||
"error_budget_rate": 9.999999999998899e-05,
|
||||
"error_budget_percentage": "0.010%",
|
||||
"budgets_by_window": {
|
||||
"1h": "0.4 seconds",
|
||||
"1d": "8.6 seconds",
|
||||
"7d": "1.0 minutes",
|
||||
"30d": "4.3 minutes"
|
||||
},
|
||||
"burn_rate_alerts": [
|
||||
{
|
||||
"name": "Feature Availability Burn Rate 2% Alert",
|
||||
"description": "Alert when Feature Availability is consuming error budget at 14.4x rate",
|
||||
"severity": "critical",
|
||||
"short_window": "5m",
|
||||
"long_window": "1h",
|
||||
"burn_rate_threshold": 14.4,
|
||||
"budget_consumed": "2%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Feature Availability",
|
||||
"description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability Burn Rate 5% Alert",
|
||||
"description": "Alert when Feature Availability is consuming error budget at 6x rate",
|
||||
"severity": "warning",
|
||||
"short_window": "30m",
|
||||
"long_window": "6h",
|
||||
"burn_rate_threshold": 6,
|
||||
"budget_consumed": "5%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Feature Availability",
|
||||
"description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability Burn Rate 10% Alert",
|
||||
"description": "Alert when Feature Availability is consuming error budget at 3x rate",
|
||||
"severity": "info",
|
||||
"short_window": "2h",
|
||||
"long_window": "1d",
|
||||
"burn_rate_threshold": 3,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Feature Availability",
|
||||
"description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Feature Availability Burn Rate 10% (Slow Burn) Alert",
|
||||
"description": "Alert when Feature Availability is consuming error budget at 1x rate",
|
||||
"severity": "info",
|
||||
"short_window": "6h",
|
||||
"long_window": "3d",
|
||||
"burn_rate_threshold": 1,
|
||||
"budget_consumed": "10%",
|
||||
"condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)",
|
||||
"annotations": {
|
||||
"summary": "High burn rate detected for Feature Availability",
|
||||
"description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"sla_recommendations": {
|
||||
"applicable": true,
|
||||
"service": "payment-service",
|
||||
"commitments": [
|
||||
{
|
||||
"metric": "Availability",
|
||||
"target": 0.9989,
|
||||
"target_display": "99.89%",
|
||||
"measurement_window": "monthly",
|
||||
"measurement_method": "Uptime monitoring with 1-minute granularity"
|
||||
},
|
||||
{
|
||||
"metric": "Feature Availability",
|
||||
"target": 0.9989,
|
||||
"target_display": "99.89%",
|
||||
"measurement_window": "monthly",
|
||||
"measurement_method": "Uptime monitoring with 1-minute granularity"
|
||||
}
|
||||
],
|
||||
"penalties": [
|
||||
{
|
||||
"breach_threshold": "< 99.89%",
|
||||
"credit_percentage": 10
|
||||
},
|
||||
{
|
||||
"breach_threshold": "< 99.9%",
|
||||
"credit_percentage": 25
|
||||
},
|
||||
{
|
||||
"breach_threshold": "< 99%",
|
||||
"credit_percentage": 50
|
||||
}
|
||||
],
|
||||
"measurement_methodology": "External synthetic monitoring from multiple geographic locations",
|
||||
"exclusions": [
|
||||
"Planned maintenance windows (with 72h advance notice)",
|
||||
"Customer-side network or infrastructure issues",
|
||||
"Force majeure events",
|
||||
"Third-party service dependencies beyond our control"
|
||||
]
|
||||
},
|
||||
"monitoring_recommendations": {
|
||||
"metrics": {
|
||||
"collection": "Prometheus with service discovery",
|
||||
"retention": "90 days for raw metrics, 1 year for aggregated",
|
||||
"alerting": "Prometheus Alertmanager with multi-window burn rate alerts"
|
||||
},
|
||||
"logging": {
|
||||
"format": "Structured JSON logs with correlation IDs",
|
||||
"aggregation": "ELK stack or equivalent with proper indexing",
|
||||
"retention": "30 days for debug logs, 90 days for error logs"
|
||||
},
|
||||
"tracing": {
|
||||
"sampling": "Adaptive sampling with 1% base rate",
|
||||
"storage": "Jaeger or Zipkin with 7-day retention",
|
||||
"integration": "OpenTelemetry instrumentation"
|
||||
}
|
||||
},
|
||||
"implementation_guide": {
|
||||
"prerequisites": [
|
||||
"Service instrumented with metrics collection (Prometheus format)",
|
||||
"Structured logging with correlation IDs",
|
||||
"Monitoring infrastructure (Prometheus, Grafana, Alertmanager)",
|
||||
"Incident response processes and escalation policies"
|
||||
],
|
||||
"implementation_steps": [
|
||||
{
|
||||
"step": 1,
|
||||
"title": "Instrument Service",
|
||||
"description": "Add metrics collection for all defined SLIs",
|
||||
"estimated_effort": "1-2 days"
|
||||
},
|
||||
{
|
||||
"step": 2,
|
||||
"title": "Configure Recording Rules",
|
||||
"description": "Set up Prometheus recording rules for SLI calculations",
|
||||
"estimated_effort": "4-8 hours"
|
||||
},
|
||||
{
|
||||
"step": 3,
|
||||
"title": "Implement Burn Rate Alerts",
|
||||
"description": "Configure multi-window burn rate alerting rules",
|
||||
"estimated_effort": "1 day"
|
||||
},
|
||||
{
|
||||
"step": 4,
|
||||
"title": "Create SLO Dashboard",
|
||||
"description": "Build Grafana dashboard for SLO tracking and error budget monitoring",
|
||||
"estimated_effort": "4-6 hours"
|
||||
},
|
||||
{
|
||||
"step": 5,
|
||||
"title": "Test and Validate",
|
||||
"description": "Test alerting and validate SLI measurements against expectations",
|
||||
"estimated_effort": "1-2 days"
|
||||
},
|
||||
{
|
||||
"step": 6,
|
||||
"title": "Documentation and Training",
|
||||
"description": "Document runbooks and train team on SLO monitoring",
|
||||
"estimated_effort": "1 day"
|
||||
}
|
||||
],
|
||||
"validation_checklist": [
|
||||
"All SLIs produce expected metric values",
|
||||
"Burn rate alerts fire correctly during simulated outages",
|
||||
"Error budget calculations match manual verification",
|
||||
"Dashboard displays accurate SLO achievement rates",
|
||||
"Alert routing reaches correct escalation paths",
|
||||
"Runbooks are complete and tested"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,469 @@
|
||||
# Alert Design Patterns: A Guide to Effective Alerting
|
||||
|
||||
## Introduction
|
||||
|
||||
Well-designed alerts are the difference between a reliable system and 3 AM pages about non-issues. This guide provides patterns and anti-patterns for creating alerts that provide value without causing fatigue.
|
||||
|
||||
## Fundamental Principles
|
||||
|
||||
### The Golden Rules of Alerting
|
||||
|
||||
1. **Every alert should be actionable** - If you can't do something about it, don't alert
|
||||
2. **Every alert should require human intelligence** - If a script can handle it, automate the response
|
||||
3. **Every alert should be novel** - Don't alert on known, ongoing issues
|
||||
4. **Every alert should represent a user-visible impact** - Internal metrics matter only if users are affected
|
||||
|
||||
### Alert Classification
|
||||
|
||||
#### Critical Alerts
|
||||
- Service is completely down
|
||||
- Data loss is occurring
|
||||
- Security breach detected
|
||||
- SLO burn rate indicates imminent SLO violation
|
||||
|
||||
#### Warning Alerts
|
||||
- Service degradation affecting some users
|
||||
- Approaching resource limits
|
||||
- Dependent service issues
|
||||
- Elevated error rates within SLO
|
||||
|
||||
#### Info Alerts
|
||||
- Deployment notifications
|
||||
- Capacity planning triggers
|
||||
- Configuration changes
|
||||
- Maintenance windows
|
||||
|
||||
## Alert Design Patterns
|
||||
|
||||
### Pattern 1: Symptoms, Not Causes
|
||||
|
||||
**Good**: Alert on user-visible symptoms
|
||||
```yaml
|
||||
- alert: HighLatency
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "API latency is high"
|
||||
description: "95th percentile latency is {{ $value }}s, above 500ms threshold"
|
||||
```
|
||||
|
||||
**Bad**: Alert on internal metrics that may not affect users
|
||||
```yaml
|
||||
- alert: HighCPU
|
||||
expr: cpu_usage > 80
|
||||
# This might not affect users at all!
|
||||
```
|
||||
|
||||
### Pattern 2: Multi-Window Alerting
|
||||
|
||||
Reduce false positives by requiring sustained problems:
|
||||
|
||||
```yaml
|
||||
- alert: ServiceDown
|
||||
expr: (
|
||||
avg_over_time(up[2m]) == 0 # Short window: immediate detection
|
||||
and
|
||||
avg_over_time(up[10m]) < 0.8 # Long window: avoid flapping
|
||||
)
|
||||
for: 1m
|
||||
```
|
||||
|
||||
### Pattern 3: Burn Rate Alerting
|
||||
|
||||
Alert based on error budget consumption rate:
|
||||
|
||||
```yaml
|
||||
# Fast burn: 2% of monthly budget in 1 hour
|
||||
- alert: ErrorBudgetFastBurn
|
||||
expr: (
|
||||
error_rate_5m > (14.4 * error_budget_slo)
|
||||
and
|
||||
error_rate_1h > (14.4 * error_budget_slo)
|
||||
)
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
# Slow burn: 10% of monthly budget in 3 days
|
||||
- alert: ErrorBudgetSlowBurn
|
||||
expr: (
|
||||
error_rate_6h > (1.0 * error_budget_slo)
|
||||
and
|
||||
error_rate_3d > (1.0 * error_budget_slo)
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
```
|
||||
|
||||
### Pattern 4: Hysteresis
|
||||
|
||||
Use different thresholds for firing and resolving to prevent flapping:
|
||||
|
||||
```yaml
|
||||
- alert: HighErrorRate
|
||||
expr: error_rate > 0.05 # Fire at 5%
|
||||
for: 5m
|
||||
|
||||
# Note: Prometheus has no built-in hysteresis — this rule resolves as soon as error_rate <= 0.05. To clear only below 3%, OR the expression with (ALERTS{alertname="HighErrorRate",alertstate="firing"} and error_rate > 0.03)
|
||||
# This prevents flapping around the 5% threshold
|
||||
```
|
||||
|
||||
### Pattern 5: Composite Alerts
|
||||
|
||||
Alert when multiple conditions indicate a problem:
|
||||
|
||||
```yaml
|
||||
- alert: ServiceDegraded
|
||||
expr: (
|
||||
(latency_p95 > latency_threshold)
|
||||
or
|
||||
(error_rate > error_threshold)
|
||||
or
|
||||
(availability < availability_threshold)
|
||||
) and (
|
||||
request_rate > min_request_rate # Only alert if we have traffic
|
||||
)
|
||||
```
|
||||
|
||||
### Pattern 6: Contextual Alerting
|
||||
|
||||
Include relevant context in alerts:
|
||||
|
||||
```yaml
|
||||
- alert: DatabaseConnections
|
||||
expr: db_connections_active / db_connections_max > 0.8
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "Database connection pool nearly exhausted"
|
||||
description: "{{ $labels.database }} has {{ $value | humanizePercentage }} connection utilization"
|
||||
runbook_url: "https://runbooks.company.com/database-connections"
|
||||
impact: "New requests may be rejected, causing 500 errors"
|
||||
suggested_action: "Check for connection leaks or increase pool size"
|
||||
```
|
||||
|
||||
## Alert Routing and Escalation
|
||||
|
||||
### Routing by Impact and Urgency
|
||||
|
||||
#### Critical Path Services
|
||||
```yaml
|
||||
route:
|
||||
group_by: ['service']
|
||||
routes:
|
||||
- match:
|
||||
service: 'payment-api'
|
||||
severity: 'critical'
|
||||
receiver: 'payment-team-pager'
|
||||
continue: true
|
||||
- match:
|
||||
service: 'payment-api'
|
||||
severity: 'warning'
|
||||
receiver: 'payment-team-slack'
|
||||
```
|
||||
|
||||
#### Time-Based Routing
|
||||
```yaml
|
||||
route:
|
||||
routes:
|
||||
- match:
|
||||
severity: 'critical'
|
||||
receiver: 'oncall-pager'
|
||||
- match:
|
||||
severity: 'warning'
|
||||
active_time_intervals: ['business_hours']  # 9 AM - 5 PM, defined under time_intervals
|
||||
receiver: 'team-slack'
|
||||
- match:
|
||||
severity: 'warning'
|
||||
active_time_intervals: ['after_hours']
|
||||
receiver: 'team-email' # Lower urgency outside business hours
|
||||
```
|
||||
|
||||
### Escalation Patterns
|
||||
|
||||
#### Linear Escalation
|
||||
```yaml
|
||||
receivers:
|
||||
- name: 'primary-oncall'
|
||||
pagerduty_configs:
|
||||
- escalation_policy: 'P1-Escalation'
|
||||
# 0 min: Primary on-call
|
||||
# 5 min: Secondary on-call
|
||||
# 15 min: Engineering manager
|
||||
# 30 min: Director of engineering
|
||||
```
|
||||
|
||||
#### Severity-Based Escalation
|
||||
```yaml
|
||||
# Critical: Immediate escalation
|
||||
- match:
|
||||
severity: 'critical'
|
||||
receiver: 'critical-escalation'
|
||||
|
||||
# Warning: Team-first escalation
|
||||
- match:
|
||||
severity: 'warning'
|
||||
receiver: 'team-escalation'
|
||||
```
|
||||
|
||||
## Alert Fatigue Prevention
|
||||
|
||||
### Grouping and Suppression
|
||||
|
||||
#### Time-Based Grouping
|
||||
```yaml
|
||||
route:
|
||||
group_wait: 30s # Wait 30s to group similar alerts
|
||||
group_interval: 2m # Send grouped alerts every 2 minutes
|
||||
repeat_interval: 1h # Re-send unresolved alerts every hour
|
||||
```
|
||||
|
||||
#### Dependent Service Suppression
|
||||
```yaml
|
||||
- alert: ServiceDown
|
||||
expr: up == 0
|
||||
|
||||
- alert: HighLatency
|
||||
expr: latency_p95 > 1
|
||||
# This alert is suppressed when ServiceDown is firing; note the inhibit_rules below belong in the Alertmanager config (alertmanager.yml), not in the Prometheus rule file
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
alertname: 'ServiceDown'
|
||||
target_match:
|
||||
alertname: 'HighLatency'
|
||||
equal: ['service']
|
||||
```
|
||||
|
||||
### Alert Throttling
|
||||
|
||||
```yaml
|
||||
# Limit to 1 alert per 10 minutes for noisy conditions
|
||||
- alert: HighMemoryUsage
|
||||
expr: memory_usage_percent > 85
|
||||
for: 10m # Longer 'for' duration reduces noise
|
||||
annotations:
|
||||
summary: "Memory usage has been high for 10+ minutes"
|
||||
```
|
||||
|
||||
### Smart Defaults
|
||||
|
||||
```yaml
|
||||
# Use business logic to set intelligent thresholds
|
||||
- alert: LowTraffic
|
||||
expr: request_rate < (
|
||||
avg_over_time(request_rate[7d]) * 0.1 # 10% of weekly average
|
||||
)
|
||||
# Only alert during business hours when low traffic is unusual
|
||||
for: 30m
|
||||
```
|
||||
|
||||
## Runbook Integration
|
||||
|
||||
### Runbook Structure Template
|
||||
|
||||
```markdown
|
||||
# Alert: {{ $labels.alertname }}
|
||||
|
||||
## Immediate Actions
|
||||
1. Check service status dashboard
|
||||
2. Verify if users are affected
|
||||
3. Look at recent deployments/changes
|
||||
|
||||
## Investigation Steps
|
||||
1. Check logs for errors in the last 30 minutes
|
||||
2. Verify dependent services are healthy
|
||||
3. Check resource utilization (CPU, memory, disk)
|
||||
4. Review recent alerts for patterns
|
||||
|
||||
## Resolution Actions
|
||||
- If deployment-related: Consider rollback
|
||||
- If resource-related: Scale up or optimize queries
|
||||
- If dependency-related: Engage appropriate team
|
||||
|
||||
## Escalation
|
||||
- Primary: @team-oncall
|
||||
- Secondary: @engineering-manager
|
||||
- Emergency: @site-reliability-team
|
||||
```
|
||||
|
||||
### Runbook Integration in Alerts
|
||||
|
||||
```yaml
|
||||
annotations:
|
||||
runbook_url: "https://runbooks.company.com/alerts/{{ $labels.alertname }}"
|
||||
quick_debug: |
|
||||
1. curl -s https://{{ $labels.instance }}/health
|
||||
2. kubectl logs {{ $labels.pod }} --tail=50
|
||||
3. Check dashboard: https://grafana.company.com/d/service-{{ $labels.service }}
|
||||
```
|
||||
|
||||
## Testing and Validation
|
||||
|
||||
### Alert Testing Strategies
|
||||
|
||||
#### Chaos Engineering Integration
|
||||
```python
|
||||
# Test that alerts fire during controlled failures
|
||||
def test_alert_during_cpu_spike():
|
||||
with chaos.cpu_spike(target='payment-api', duration='2m'):
|
||||
assert wait_for_alert('HighCPU', timeout=180)
|
||||
|
||||
def test_alert_during_network_partition():
|
||||
with chaos.network_partition(target='database'):
|
||||
assert wait_for_alert('DatabaseUnreachable', timeout=60)
|
||||
```
|
||||
|
||||
#### Historical Alert Analysis
|
||||
```prometheus
|
||||
# Query to find alerts that fired without incidents
|
||||
count by (alertname) (
  count_over_time(ALERTS{alertstate="firing"}[30d])
) unless on (alertname) (
  count by (alertname) (
    count_over_time(incident_created{source="alert"}[30d])
  )
)
|
||||
```
|
||||
|
||||
### Alert Quality Metrics
|
||||
|
||||
#### Alert Precision
|
||||
```
|
||||
Precision = True Positives / (True Positives + False Positives)
|
||||
```
|
||||
|
||||
Track alerts that resulted in actual incidents vs false alarms.
|
||||
|
||||
#### Time to Resolution
|
||||
```prometheus
|
||||
# Average time from alert firing to resolution
|
||||
avg by (alertname) (
  avg_over_time(
    (alert_resolved_timestamp - alert_fired_timestamp)[30d:1h]
  )
)
|
||||
```
|
||||
|
||||
#### Alert Fatigue Indicators
|
||||
```prometheus
|
||||
# Alerts per day by team
|
||||
sum by (team) (
|
||||
increase(alerts_fired_total[1d])
|
||||
)
|
||||
|
||||
# Percentage of alerts acknowledged within 15 minutes
|
||||
sum(alerts_acked_within_15m) / sum(alerts_fired) * 100
|
||||
```
|
||||
|
||||
## Advanced Patterns
|
||||
|
||||
### Machine Learning-Enhanced Alerting
|
||||
|
||||
#### Anomaly Detection
|
||||
```yaml
|
||||
- alert: AnomalousTraffic
|
||||
expr: |
|
||||
abs(request_rate - predict_linear(request_rate[1h], 300)) /
|
||||
stddev_over_time(request_rate[1h]) > 3
|
||||
for: 10m
|
||||
annotations:
|
||||
summary: "Traffic pattern is anomalous"
|
||||
description: "Current traffic deviates from predicted pattern by >3 standard deviations"
|
||||
```
|
||||
|
||||
#### Dynamic Thresholds
|
||||
```yaml
|
||||
- alert: DynamicHighLatency
|
||||
expr: |
|
||||
latency_p95 > (
|
||||
quantile_over_time(0.95, latency_p95[7d]) + # Historical 95th percentile
|
||||
2 * stddev_over_time(latency_p95[7d]) # Plus 2 standard deviations
|
||||
)
|
||||
```
|
||||
|
||||
### Business Hours Awareness
|
||||
|
||||
```yaml
|
||||
# Different thresholds for business vs off hours
|
||||
- alert: HighLatencyBusinessHours
|
||||
expr: latency_p95 > 0.2 # Stricter during business hours
|
||||
for: 2m
|
||||
# Active 9 AM - 5 PM weekdays
|
||||
|
||||
- alert: HighLatencyOffHours
|
||||
expr: latency_p95 > 0.5 # More lenient after hours
|
||||
for: 5m
|
||||
# Active nights and weekends
|
||||
```
|
||||
|
||||
### Progressive Alerting
|
||||
|
||||
```yaml
|
||||
# Escalating alert severity based on duration
|
||||
- alert: ServiceLatencyElevated
|
||||
expr: latency_p95 > 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
|
||||
- alert: ServiceLatencyHigh
|
||||
expr: latency_p95 > 0.5
|
||||
for: 15m # Same condition, longer duration
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
- alert: ServiceLatencyCritical
|
||||
expr: latency_p95 > 0.5
|
||||
for: 30m # Same condition, even longer duration
|
||||
labels:
|
||||
severity: critical
|
||||
```
|
||||
|
||||
## Anti-Patterns to Avoid
|
||||
|
||||
### Anti-Pattern 1: Alerting on Everything
|
||||
**Problem**: Too many alerts create noise and fatigue
|
||||
**Solution**: Be selective; only alert on user-impacting issues
|
||||
|
||||
### Anti-Pattern 2: Vague Alert Messages
|
||||
**Problem**: "Service X is down" - which instance? what's the impact?
|
||||
**Solution**: Include specific details and context
|
||||
|
||||
### Anti-Pattern 3: Alerts Without Runbooks
|
||||
**Problem**: Alerts that don't explain what to do
|
||||
**Solution**: Every alert must have an associated runbook
|
||||
|
||||
### Anti-Pattern 4: Static Thresholds
|
||||
**Problem**: 80% CPU might be normal during peak hours
|
||||
**Solution**: Use contextual, adaptive thresholds
|
||||
|
||||
### Anti-Pattern 5: Ignoring Alert Quality
|
||||
**Problem**: Accepting high false positive rates
|
||||
**Solution**: Regularly review and tune alert precision
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pre-Implementation
|
||||
- [ ] Define alert severity levels and escalation policies
|
||||
- [ ] Create runbook templates
|
||||
- [ ] Set up alert routing configuration
|
||||
- [ ] Define SLOs that alerts will protect
|
||||
|
||||
### Alert Development
|
||||
- [ ] Each alert has clear success criteria
|
||||
- [ ] Alert conditions tested against historical data
|
||||
- [ ] Runbook created and accessible
|
||||
- [ ] Severity and routing configured
|
||||
- [ ] Context and suggested actions included
|
||||
|
||||
### Post-Implementation
|
||||
- [ ] Monitor alert precision and recall
|
||||
- [ ] Regular review of alert fatigue metrics
|
||||
- [ ] Quarterly alert effectiveness review
|
||||
- [ ] Team training on alert response procedures
|
||||
|
||||
### Quality Assurance
|
||||
- [ ] Test alerts fire during controlled failures
|
||||
- [ ] Verify alerts resolve when conditions improve
|
||||
- [ ] Confirm runbooks are accurate and helpful
|
||||
- [ ] Validate escalation paths work correctly
|
||||
|
||||
Remember: Great alerts are invisible when things work and invaluable when things break. Focus on quality over quantity, and always optimize for the human who will respond to the alert at 3 AM.
|
||||
@@ -0,0 +1,571 @@
|
||||
# Dashboard Best Practices: Design for Insight and Action
|
||||
|
||||
## Introduction
|
||||
|
||||
A well-designed dashboard is like a good story - it guides you through the data with purpose and clarity. This guide provides practical patterns for creating dashboards that inform decisions and enable quick troubleshooting.
|
||||
|
||||
## Design Principles
|
||||
|
||||
### The Hierarchy of Information
|
||||
|
||||
#### Primary Information (Top Third)
|
||||
- Service health status
|
||||
- SLO achievement
|
||||
- Critical alerts
|
||||
- Business KPIs
|
||||
|
||||
#### Secondary Information (Middle Third)
|
||||
- Golden signals (latency, traffic, errors, saturation)
|
||||
- Resource utilization
|
||||
- Throughput and performance metrics
|
||||
|
||||
#### Tertiary Information (Bottom Third)
|
||||
- Detailed breakdowns
|
||||
- Historical trends
|
||||
- Dependency status
|
||||
- Debug information
|
||||
|
||||
### Visual Design Principles
|
||||
|
||||
#### Rule of 7±2
|
||||
- Maximum 7±2 panels per screen
|
||||
- Group related information together
|
||||
- Use sections to organize complexity
|
||||
|
||||
#### Color Psychology
|
||||
- **Red**: Critical issues, danger, immediate attention needed
|
||||
- **Yellow/Orange**: Warnings, caution, degraded state
|
||||
- **Green**: Healthy, normal operation, success
|
||||
- **Blue**: Information, neutral metrics, capacity
|
||||
- **Gray**: Disabled, unknown, or baseline states
|
||||
|
||||
#### Chart Selection Guide
|
||||
- **Line charts**: Time series, trends, comparisons over time
|
||||
- **Bar charts**: Categorical comparisons, top N lists
|
||||
- **Gauges**: Single value with defined good/bad ranges
|
||||
- **Stat panels**: Key metrics, percentages, counts
|
||||
- **Heatmaps**: Distribution data, correlation analysis
|
||||
- **Tables**: Detailed breakdowns, multi-dimensional data
|
||||
|
||||
## Dashboard Archetypes
|
||||
|
||||
### The Overview Dashboard
|
||||
|
||||
**Purpose**: High-level health check and business metrics
|
||||
**Audience**: Executives, managers, cross-team stakeholders
|
||||
**Update Frequency**: 5-15 minutes
|
||||
|
||||
```yaml
|
||||
sections:
|
||||
- title: "Business Health"
|
||||
panels:
|
||||
- service_availability_summary
|
||||
- revenue_per_hour
|
||||
- active_users
|
||||
- conversion_rate
|
||||
|
||||
- title: "System Health"
|
||||
panels:
|
||||
- critical_alerts_count
|
||||
- slo_achievement_summary
|
||||
- error_budget_remaining
|
||||
- deployment_status
|
||||
```
|
||||
|
||||
### The SRE Operational Dashboard
|
||||
|
||||
**Purpose**: Real-time monitoring and incident response
|
||||
**Audience**: SRE, on-call engineers
|
||||
**Update Frequency**: 15-30 seconds
|
||||
|
||||
```yaml
|
||||
sections:
|
||||
- title: "Service Status"
|
||||
panels:
|
||||
- service_up_status
|
||||
- active_incidents
|
||||
- recent_deployments
|
||||
|
||||
- title: "Golden Signals"
|
||||
panels:
|
||||
- latency_percentiles
|
||||
- request_rate
|
||||
- error_rate
|
||||
- resource_saturation
|
||||
|
||||
- title: "Infrastructure"
|
||||
panels:
|
||||
- cpu_memory_utilization
|
||||
- network_io
|
||||
- disk_space
|
||||
```
|
||||
|
||||
### The Developer Debug Dashboard
|
||||
|
||||
**Purpose**: Deep-dive troubleshooting and performance analysis
|
||||
**Audience**: Development teams
|
||||
**Update Frequency**: 30 seconds - 2 minutes
|
||||
|
||||
```yaml
|
||||
sections:
|
||||
- title: "Application Performance"
|
||||
panels:
|
||||
- endpoint_latency_breakdown
|
||||
- database_query_performance
|
||||
- cache_hit_rates
|
||||
- queue_depths
|
||||
|
||||
- title: "Errors and Logs"
|
||||
panels:
|
||||
- error_rate_by_endpoint
|
||||
- log_volume_by_level
|
||||
- exception_types
|
||||
- slow_queries
|
||||
```
|
||||
|
||||
## Layout Patterns
|
||||
|
||||
### The F-Pattern Layout
|
||||
|
||||
Based on eye-tracking studies, users scan in an F-pattern:
|
||||
|
||||
```
|
||||
[Critical Status] [SLO Summary ] [Error Budget ]
|
||||
[Latency ] [Traffic ] [Errors ]
|
||||
[Saturation ] [Resource Use ] [Detailed View]
|
||||
[Historical ] [Dependencies ] [Debug Info ]
|
||||
```
|
||||
|
||||
### The Z-Pattern Layout
|
||||
|
||||
For executive dashboards, follow the Z-pattern:
|
||||
|
||||
```
|
||||
[Business KPIs ] → [System Status]
|
||||
↓ ↓
|
||||
[Trend Analysis ] ← [Key Metrics ]
|
||||
```
|
||||
|
||||
### Responsive Design
|
||||
|
||||
#### Desktop (1920x1080)
|
||||
- 24-column grid
|
||||
- Panels can be 6, 8, 12, or 24 units wide
|
||||
- 4-6 rows visible without scrolling
|
||||
|
||||
#### Laptop (1366x768)
|
||||
- Stack wider panels vertically
|
||||
- Reduce panel heights
|
||||
- Prioritize most critical information
|
||||
|
||||
#### Mobile (768px width)
|
||||
- Single column layout
|
||||
- Simplified panels
|
||||
- Touch-friendly controls
|
||||
|
||||
## Effective Panel Design
|
||||
|
||||
### Stat Panels
|
||||
|
||||
```yaml
|
||||
# Good: Clear value with context
|
||||
- title: "API Availability"
|
||||
type: stat
|
||||
targets:
|
||||
- expr: avg(up{service="api"}) * 100
|
||||
field_config:
|
||||
unit: percent
|
||||
thresholds:
|
||||
steps:
|
||||
- color: red
|
||||
value: 0
|
||||
- color: yellow
|
||||
value: 99
|
||||
- color: green
|
||||
value: 99.9
|
||||
options:
|
||||
color_mode: background
|
||||
text_mode: value_and_name
|
||||
```
|
||||
|
||||
### Time Series Panels
|
||||
|
||||
```yaml
|
||||
# Good: Multiple related metrics with clear legend
|
||||
- title: "Request Latency"
|
||||
type: timeseries
|
||||
targets:
|
||||
- expr: histogram_quantile(0.50, rate(http_duration_bucket[5m]))
|
||||
legend: "P50"
|
||||
- expr: histogram_quantile(0.95, rate(http_duration_bucket[5m]))
|
||||
legend: "P95"
|
||||
- expr: histogram_quantile(0.99, rate(http_duration_bucket[5m]))
|
||||
legend: "P99"
|
||||
field_config:
|
||||
unit: ms
|
||||
custom:
|
||||
draw_style: line
|
||||
fill_opacity: 10
|
||||
options:
|
||||
legend:
|
||||
display_mode: table
|
||||
placement: bottom
|
||||
values: [min, max, mean, last]
|
||||
```
|
||||
|
||||
### Table Panels
|
||||
|
||||
```yaml
|
||||
# Good: Top N with relevant columns
|
||||
- title: "Slowest Endpoints"
|
||||
type: table
|
||||
targets:
|
||||
- expr: topk(10, histogram_quantile(0.95, sum by (handler)(rate(http_duration_bucket[5m]))))
|
||||
format: table
|
||||
instant: true
|
||||
transformations:
|
||||
- id: organize
|
||||
options:
|
||||
exclude_by_name:
|
||||
Time: true
|
||||
rename_by_name:
|
||||
Value: "P95 Latency (ms)"
|
||||
handler: "Endpoint"
|
||||
```
|
||||
|
||||
## Color and Visualization Best Practices
|
||||
|
||||
### Threshold Configuration
|
||||
|
||||
```yaml
|
||||
# Traffic light system with meaningful boundaries
|
||||
thresholds:
|
||||
steps:
|
||||
- color: green # Good performance
|
||||
value: null # Default
|
||||
- color: yellow # Degraded performance
|
||||
value: 95 # 95th percentile of historical normal
|
||||
- color: orange # Poor performance
|
||||
value: 99 # 99th percentile of historical normal
|
||||
- color: red # Critical performance
|
||||
value: 99.9 # Worst case scenario
|
||||
```
|
||||
|
||||
### Color Blind Friendly Palettes
|
||||
|
||||
```yaml
|
||||
# Use patterns and shapes in addition to color
|
||||
field_config:
|
||||
overrides:
|
||||
- matcher:
|
||||
id: byName
|
||||
options: "Critical"
|
||||
properties:
|
||||
- id: color
|
||||
value:
|
||||
mode: fixed
|
||||
fixed_color: "#d73027" # Red-orange for protanopia
|
||||
- id: custom.draw_style
|
||||
value: "points" # Different shape
|
||||
```
|
||||
|
||||
### Consistent Color Semantics
|
||||
|
||||
- **Success/Health**: Green (#28a745)
|
||||
- **Warning/Degraded**: Yellow (#ffc107)
|
||||
- **Error/Critical**: Red (#dc3545)
|
||||
- **Information**: Blue (#007bff)
|
||||
- **Neutral**: Gray (#6c757d)
|
||||
|
||||
## Time Range Strategy
|
||||
|
||||
### Default Time Ranges by Dashboard Type
|
||||
|
||||
#### Real-time Operational
|
||||
- **Default**: Last 15 minutes
|
||||
- **Quick options**: 5m, 15m, 1h, 4h
|
||||
- **Auto-refresh**: 15-30 seconds
|
||||
|
||||
#### Troubleshooting
|
||||
- **Default**: Last 1 hour
|
||||
- **Quick options**: 15m, 1h, 4h, 12h, 1d
|
||||
- **Auto-refresh**: 1 minute
|
||||
|
||||
#### Business Review
|
||||
- **Default**: Last 24 hours
|
||||
- **Quick options**: 1d, 7d, 30d, 90d
|
||||
- **Auto-refresh**: 5 minutes
|
||||
|
||||
#### Capacity Planning
|
||||
- **Default**: Last 7 days
|
||||
- **Quick options**: 7d, 30d, 90d, 1y
|
||||
- **Auto-refresh**: 15 minutes
|
||||
|
||||
### Time Range Annotations
|
||||
|
||||
```yaml
|
||||
# Add context for time-based events
|
||||
annotations:
|
||||
- name: "Deployments"
|
||||
datasource: "Prometheus"
|
||||
expr: "deployment_timestamp"
|
||||
title_format: "Deploy {{ version }}"
|
||||
text_format: "Deployed version {{ version }} to {{ environment }}"
|
||||
|
||||
- name: "Incidents"
|
||||
datasource: "Incident API"
|
||||
query: "incidents.json?service={{ service }}"
|
||||
color: "red"
|
||||
```
|
||||
|
||||
## Interactive Features
|
||||
|
||||
### Template Variables
|
||||
|
||||
```yaml
|
||||
# Service selector
|
||||
- name: service
|
||||
type: query
|
||||
query: label_values(up, service)
|
||||
current:
|
||||
text: All
|
||||
value: $__all
|
||||
include_all: true
|
||||
multi: true
|
||||
|
||||
# Environment selector
|
||||
- name: environment
|
||||
type: query
|
||||
query: label_values(up{service="$service"}, environment)
|
||||
current:
|
||||
text: production
|
||||
value: production
|
||||
```
|
||||
|
||||
### Drill-Down Links
|
||||
|
||||
```yaml
|
||||
# Panel-level drill-downs
|
||||
- title: "Error Rate"
|
||||
type: timeseries
|
||||
# ... other config ...
|
||||
options:
|
||||
data_links:
|
||||
- title: "View Error Logs"
|
||||
url: "/d/logs-dashboard?var-service=${__field.labels.service}&from=${__from}&to=${__to}"
|
||||
- title: "Error Traces"
|
||||
url: "/d/traces-dashboard?var-service=${__field.labels.service}"
|
||||
```
|
||||
|
||||
### Dynamic Panel Titles
|
||||
|
||||
```yaml
|
||||
- title: "${service} - Request Rate" # Uses template variable
|
||||
type: timeseries
|
||||
# Title updates automatically when service variable changes
|
||||
```
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### Query Optimization
|
||||
|
||||
#### Use Recording Rules
|
||||
```yaml
|
||||
# Instead of complex queries in dashboards
|
||||
groups:
|
||||
- name: http_requests
|
||||
rules:
|
||||
- record: http_request_rate_5m
|
||||
expr: sum(rate(http_requests_total[5m])) by (service, method, handler)
|
||||
|
||||
- record: http_request_latency_p95_5m
|
||||
expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le))
|
||||
```
|
||||
|
||||
#### Limit Data Points
|
||||
```yaml
|
||||
# Good: Reasonable resolution for dashboard
|
||||
- expr: http_request_rate_5m[1h]
|
||||
interval: 15s # One point every 15 seconds
|
||||
|
||||
# Bad: Too many points for visualization
|
||||
- expr: http_request_rate_1s[1h] # 3600 points!
|
||||
```
|
||||
|
||||
### Dashboard Performance
|
||||
|
||||
#### Panel Limits
|
||||
- **Maximum panels per dashboard**: 20-30
|
||||
- **Maximum queries per panel**: 10
|
||||
- **Maximum time series per panel**: 50
|
||||
|
||||
#### Caching Strategy
|
||||
```yaml
|
||||
# Use appropriate cache headers
|
||||
cache_timeout: 30 # Cache for 30 seconds on fast-changing panels
|
||||
cache_timeout: 300 # Cache for 5 minutes on slow-changing panels
|
||||
```
|
||||
|
||||
## Accessibility
|
||||
|
||||
### Screen Reader Support
|
||||
|
||||
```yaml
|
||||
# Provide text alternatives for visual elements
|
||||
- title: "Service Health Status"
|
||||
type: stat
|
||||
options:
|
||||
text_mode: value_and_name # Includes both value and description
|
||||
field_config:
|
||||
mappings:
|
||||
- options:
|
||||
"1":
|
||||
text: "Healthy"
|
||||
color: "green"
|
||||
"0":
|
||||
text: "Unhealthy"
|
||||
color: "red"
|
||||
```
|
||||
|
||||
### Keyboard Navigation
|
||||
|
||||
- Ensure all interactive elements are keyboard accessible
|
||||
- Provide logical tab order
|
||||
- Include skip links for complex dashboards
|
||||
|
||||
### High Contrast Mode
|
||||
|
||||
```yaml
|
||||
# Test dashboards work in high contrast mode
|
||||
theme: high_contrast
|
||||
colors:
|
||||
- "#000000" # Pure black
|
||||
- "#ffffff" # Pure white
|
||||
- "#ffff00" # Pure yellow
|
||||
- "#ff0000" # Pure red
|
||||
```
|
||||
|
||||
## Testing and Validation
|
||||
|
||||
### Dashboard Testing Checklist
|
||||
|
||||
#### Functional Testing
|
||||
- [ ] All panels load without errors
|
||||
- [ ] Template variables filter correctly
|
||||
- [ ] Time range changes update all panels
|
||||
- [ ] Drill-down links work as expected
|
||||
- [ ] Auto-refresh functions properly
|
||||
|
||||
#### Visual Testing
|
||||
- [ ] Dashboard renders correctly on different screen sizes
|
||||
- [ ] Colors are distinguishable and meaningful
|
||||
- [ ] Text is readable at normal zoom levels
|
||||
- [ ] Legends and labels are clear
|
||||
|
||||
#### Performance Testing
|
||||
- [ ] Dashboard loads in < 5 seconds
|
||||
- [ ] No queries timeout under normal load
|
||||
- [ ] Auto-refresh doesn't cause browser lag
|
||||
- [ ] Memory usage remains reasonable
|
||||
|
||||
#### Usability Testing
|
||||
- [ ] New team members can understand the dashboard
|
||||
- [ ] Action items are clear during incidents
|
||||
- [ ] Key information is quickly discoverable
|
||||
- [ ] Dashboard supports common troubleshooting workflows
|
||||
|
||||
## Maintenance and Governance
|
||||
|
||||
### Dashboard Lifecycle
|
||||
|
||||
#### Creation
|
||||
1. Define dashboard purpose and audience
|
||||
2. Identify key metrics and success criteria
|
||||
3. Design layout following established patterns
|
||||
4. Implement with consistent styling
|
||||
5. Test with real data and user scenarios
|
||||
|
||||
#### Maintenance
|
||||
- **Weekly**: Check for broken panels or queries
|
||||
- **Monthly**: Review dashboard usage analytics
|
||||
- **Quarterly**: Gather user feedback and iterate
|
||||
- **Annually**: Major review and potential redesign
|
||||
|
||||
#### Retirement
|
||||
- Archive dashboards that are no longer used
|
||||
- Migrate users to replacement dashboards
|
||||
- Document lessons learned
|
||||
|
||||
### Dashboard Standards
|
||||
|
||||
```yaml
|
||||
# Organization dashboard standards
|
||||
standards:
|
||||
naming_convention: "[Team] [Service] - [Purpose]"
|
||||
tags: [team, service_type, environment, purpose]
|
||||
refresh_intervals: [15s, 30s, 1m, 5m, 15m]
|
||||
time_ranges: [5m, 15m, 1h, 4h, 1d, 7d, 30d]
|
||||
color_scheme: "company_standard"
|
||||
max_panels_per_dashboard: 25
|
||||
```
|
||||
|
||||
## Advanced Patterns
|
||||
|
||||
### Composite Dashboards
|
||||
|
||||
```yaml
|
||||
# Dashboard that includes panels from other dashboards
|
||||
- title: "Service Overview"
|
||||
type: dashlist
|
||||
targets:
|
||||
- "service-health"
|
||||
- "service-performance"
|
||||
- "service-business-metrics"
|
||||
options:
|
||||
show_headings: true
|
||||
max_items: 10
|
||||
```
|
||||
|
||||
### Dynamic Dashboard Generation
|
||||
|
||||
```python
|
||||
# Generate dashboards from service definitions
|
||||
def generate_service_dashboard(service_config):
|
||||
panels = []
|
||||
|
||||
# Always include golden signals
|
||||
panels.extend(generate_golden_signals_panels(service_config))
|
||||
|
||||
# Add service-specific panels
|
||||
if service_config.type == 'database':
|
||||
panels.extend(generate_database_panels(service_config))
|
||||
elif service_config.type == 'queue':
|
||||
panels.extend(generate_queue_panels(service_config))
|
||||
|
||||
return {
|
||||
'title': f"{service_config.name} - Operational Dashboard",
|
||||
'panels': panels,
|
||||
'variables': generate_variables(service_config)
|
||||
}
|
||||
```
|
||||
|
||||
### A/B Testing for Dashboards
|
||||
|
||||
```yaml
|
||||
# Test different dashboard designs with different teams
|
||||
experiment:
|
||||
name: "dashboard_layout_test"
|
||||
variants:
|
||||
- name: "traditional_layout"
|
||||
weight: 50
|
||||
config: "dashboard_v1.json"
|
||||
- name: "f_pattern_layout"
|
||||
weight: 50
|
||||
config: "dashboard_v2.json"
|
||||
success_metrics:
|
||||
- "time_to_insight"
|
||||
- "user_satisfaction"
|
||||
- "troubleshooting_efficiency"
|
||||
```
|
||||
|
||||
Remember: A dashboard should tell a story about your system's health and guide users toward the right actions. Focus on clarity over complexity, and always optimize for the person who will use it during a stressful incident.
|
||||
@@ -0,0 +1,329 @@
|
||||
# SLO Cookbook: A Practical Guide to Service Level Objectives
|
||||
|
||||
## Introduction
|
||||
|
||||
Service Level Objectives (SLOs) are a key tool for managing service reliability. This cookbook provides practical guidance for implementing SLOs that actually improve system reliability rather than just creating meaningless metrics.
|
||||
|
||||
## Fundamentals
|
||||
|
||||
### The SLI/SLO/SLA Hierarchy
|
||||
|
||||
- **SLI (Service Level Indicator)**: A quantifiable measure of service quality
|
||||
- **SLO (Service Level Objective)**: A target range of values for an SLI
|
||||
- **SLA (Service Level Agreement)**: A business agreement with consequences for missing SLO targets
|
||||
|
||||
### Golden Rule of SLOs
|
||||
|
||||
**Start simple, iterate based on learning.** Your first SLOs won't be perfect, and that's okay.
|
||||
|
||||
## Choosing Good SLIs
|
||||
|
||||
### The Four Golden Signals
|
||||
|
||||
1. **Latency**: How long requests take to complete
|
||||
2. **Traffic**: How many requests are coming in
|
||||
3. **Errors**: How many requests are failing
|
||||
4. **Saturation**: How "full" your service is
|
||||
|
||||
### SLI Selection Criteria
|
||||
|
||||
A good SLI should be:
|
||||
- **Measurable**: You can collect data for it
|
||||
- **Meaningful**: It reflects user experience
|
||||
- **Controllable**: You can take action to improve it
|
||||
- **Proportional**: Changes in the SLI reflect changes in user happiness
|
||||
|
||||
### Service Type Specific SLIs
|
||||
|
||||
#### HTTP APIs
|
||||
- **Request latency**: P95 or P99 response time
|
||||
- **Availability**: Proportion of successful requests (non-5xx)
|
||||
- **Throughput**: Requests per second capacity
|
||||
|
||||
```prometheus
|
||||
# Availability SLI
|
||||
sum(rate(http_requests_total{code!~"5.."}[5m])) / sum(rate(http_requests_total[5m]))
|
||||
|
||||
# Latency SLI
|
||||
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
|
||||
```
|
||||
|
||||
#### Batch Jobs
|
||||
- **Freshness**: Age of the last successful run
|
||||
- **Correctness**: Proportion of jobs completing successfully
|
||||
- **Throughput**: Items processed per unit time
|
||||
|
||||
#### Data Pipelines
|
||||
- **Data freshness**: Time since last successful update
|
||||
- **Data quality**: Proportion of records passing validation
|
||||
- **Processing latency**: Time from ingestion to availability
|
||||
|
||||
### Anti-Patterns in SLI Selection
|
||||
|
||||
❌ **Don't use**: CPU usage, memory usage, disk space as primary SLIs
|
||||
- These are symptoms, not user-facing impacts
|
||||
|
||||
❌ **Don't use**: Counts instead of rates or proportions
|
||||
- "Number of errors" vs "Error rate"
|
||||
|
||||
❌ **Don't use**: Internal metrics that users don't care about
|
||||
- Queue depth, cache hit rate (unless they directly impact user experience)
|
||||
|
||||
## Setting SLO Targets
|
||||
|
||||
### The Art of Target Setting
|
||||
|
||||
Setting SLO targets is a balancing act between:
|
||||
- **User happiness**: Targets should reflect acceptable user experience
|
||||
- **Business value**: Tighter SLOs cost more to maintain
|
||||
- **Current performance**: Targets should be achievable but aspirational
|
||||
|
||||
### Target Setting Strategies
|
||||
|
||||
#### Historical Performance Method
|
||||
1. Collect 4-6 weeks of historical data
|
||||
2. Calculate the worst user-visible performance in that period
|
||||
3. Set your SLO slightly better than the worst acceptable performance
|
||||
|
||||
#### User Journey Mapping
|
||||
1. Map critical user journeys
|
||||
2. Identify acceptable performance for each step
|
||||
3. Work backwards to component SLOs
|
||||
|
||||
#### Error Budget Approach
|
||||
1. Decide how much unreliability you can afford
|
||||
2. Set SLO targets based on acceptable error budget consumption
|
||||
3. Example: 99.9% availability = 43.8 minutes downtime per month
|
||||
|
||||
### SLO Target Examples by Service Criticality
|
||||
|
||||
#### Critical Services (Revenue Impact)
|
||||
- **Availability**: 99.95% - 99.99%
|
||||
- **Latency (P95)**: 100-200ms
|
||||
- **Error Rate**: < 0.1%
|
||||
|
||||
#### High Priority Services
|
||||
- **Availability**: 99.9% - 99.95%
|
||||
- **Latency (P95)**: 200-500ms
|
||||
- **Error Rate**: < 0.5%
|
||||
|
||||
#### Standard Services
|
||||
- **Availability**: 99.5% - 99.9%
|
||||
- **Latency (P95)**: 500ms - 1s
|
||||
- **Error Rate**: < 1%
|
||||
|
||||
## Error Budget Management
|
||||
|
||||
### What is an Error Budget?
|
||||
|
||||
Your error budget is the maximum amount of unreliability you can accumulate while still meeting your SLO. It's calculated as:
|
||||
|
||||
```
|
||||
Error Budget = (1 - SLO) × Time Window
|
||||
```
|
||||
|
||||
For a 99.9% availability SLO over 30 days:
|
||||
```
|
||||
Error Budget = (1 - 0.999) × 30 days = 0.001 × 43,200 minutes = 43.2 minutes
|
||||
```
|
||||
|
||||
### Error Budget Policies
|
||||
|
||||
Define what happens when you consume your error budget:
|
||||
|
||||
#### Conservative Policy (High-Risk Services)
|
||||
- **> 50% consumed**: Freeze non-critical feature releases
|
||||
- **> 75% consumed**: Focus entirely on reliability improvements
|
||||
- **> 90% consumed**: Consider emergency measures (traffic shaping, etc.)
|
||||
|
||||
#### Balanced Policy (Standard Services)
|
||||
- **> 75% consumed**: Increase focus on reliability work
|
||||
- **> 90% consumed**: Pause feature work, focus on reliability
|
||||
|
||||
#### Aggressive Policy (Early Stage Services)
|
||||
- **> 90% consumed**: Review but continue normal operations
|
||||
- **100% consumed**: Evaluate SLO appropriateness
|
||||
|
||||
### Burn Rate Alerting
|
||||
|
||||
Multi-window burn rate alerts help you catch SLO violations before they become critical:
|
||||
|
||||
```yaml
|
||||
# Fast burn: 2% budget consumed in 1 hour
|
||||
- alert: FastBurnSLOViolation
|
||||
expr: (
|
||||
(1 - (sum(rate(http_requests_total{code!~"5.."}[5m])) / sum(rate(http_requests_total[5m])))) > (14.4 * 0.001)
|
||||
and
|
||||
(1 - (sum(rate(http_requests_total{code!~"5.."}[1h])) / sum(rate(http_requests_total[1h])))) > (14.4 * 0.001)
|
||||
)
|
||||
for: 2m
|
||||
|
||||
# Slow burn: 10% budget consumed in 3 days
|
||||
- alert: SlowBurnSLOViolation
|
||||
expr: (
|
||||
(1 - (sum(rate(http_requests_total{code!~"5.."}[6h])) / sum(rate(http_requests_total[6h])))) > (1.0 * 0.001)
|
||||
and
|
||||
(1 - (sum(rate(http_requests_total{code!~"5.."}[3d])) / sum(rate(http_requests_total[3d])))) > (1.0 * 0.001)
|
||||
)
|
||||
for: 15m
|
||||
```
|
||||
|
||||
## Implementation Patterns
|
||||
|
||||
### The SLO Implementation Ladder
|
||||
|
||||
#### Level 1: Basic SLOs
|
||||
- Choose 1-2 SLIs that matter most to users
|
||||
- Set aspirational but achievable targets
|
||||
- Implement basic alerting when SLOs are missed
|
||||
|
||||
#### Level 2: Operational SLOs
|
||||
- Add burn rate alerting
|
||||
- Create error budget dashboards
|
||||
- Establish error budget policies
|
||||
- Regular SLO review meetings
|
||||
|
||||
#### Level 3: Advanced SLOs
|
||||
- Multi-window burn rate alerts
|
||||
- Automated error budget policy enforcement
|
||||
- SLO-driven incident prioritization
|
||||
- Integration with CI/CD for deployment decisions
|
||||
|
||||
### SLO Measurement Architecture
|
||||
|
||||
#### Push vs Pull Metrics
|
||||
- **Pull** (Prometheus): Good for infrastructure metrics, real-time alerting
|
||||
- **Push** (StatsD): Good for application metrics, business events
|
||||
|
||||
#### Measurement Points
|
||||
- **Server-side**: More reliable, easier to implement
|
||||
- **Client-side**: Better reflects user experience
|
||||
- **Synthetic**: Consistent, predictable, may not reflect real user experience
|
||||
|
||||
### SLO Dashboard Design
|
||||
|
||||
Essential elements for SLO dashboards:
|
||||
|
||||
1. **Current SLO Achievement**: Large, prominent display
|
||||
2. **Error Budget Remaining**: Visual indicator (gauge, progress bar)
|
||||
3. **Burn Rate**: Time series showing error budget consumption rate
|
||||
4. **Historical Trends**: 4-week view of SLO achievement
|
||||
5. **Alerts**: Current and recent SLO-related alerts
|
||||
|
||||
## Advanced Topics
|
||||
|
||||
### Dependency SLOs
|
||||
|
||||
For services with dependencies:
|
||||
|
||||
```
|
||||
SLO_service ≤ SLO_inherent × ∏ SLO_dependencies
|
||||
```
|
||||
|
||||
If your service depends on 3 other services each with 99.9% SLO:
|
||||
```
|
||||
Maximum_SLO = 0.999³ = 0.997 = 99.7%
|
||||
```
|
||||
|
||||
### User Journey SLOs
|
||||
|
||||
Track end-to-end user experiences:
|
||||
|
||||
```prometheus
|
||||
# Registration success rate
|
||||
sum(rate(user_registration_success_total[5m])) / sum(rate(user_registration_attempts_total[5m]))
|
||||
|
||||
# Purchase completion latency
|
||||
histogram_quantile(0.95, rate(purchase_completion_duration_seconds_bucket[5m]))
|
||||
```
|
||||
|
||||
### SLOs for Batch Systems
|
||||
|
||||
Special considerations for non-request/response systems:
|
||||
|
||||
#### Freshness SLO
|
||||
```prometheus
|
||||
# Data should be no more than 4 hours old
|
||||
(time() - last_successful_update_timestamp) < (4 * 3600)
|
||||
```
|
||||
|
||||
#### Throughput SLO
|
||||
```prometheus
|
||||
# Should process at least 1000 items per hour
increase(items_processed_total[1h]) >= 1000
|
||||
```
|
||||
|
||||
#### Quality SLO
|
||||
```prometheus
|
||||
# At least 99.5% of records should pass validation
|
||||
sum(rate(records_valid_total[5m])) / sum(rate(records_processed_total[5m])) >= 0.995
|
||||
```
|
||||
|
||||
## Common Mistakes and How to Avoid Them
|
||||
|
||||
### Mistake 1: Too Many SLOs
|
||||
**Problem**: Drowning in metrics, losing focus
|
||||
**Solution**: Start with 1-2 SLOs per service, add more only when needed
|
||||
|
||||
### Mistake 2: Internal Metrics as SLIs
|
||||
**Problem**: Optimizing for metrics that don't impact users
|
||||
**Solution**: Always ask "If this metric changes, do users notice?"
|
||||
|
||||
### Mistake 3: Perfectionist SLOs
|
||||
**Problem**: 99.99% SLO when 99.9% would be fine
|
||||
**Solution**: Higher SLOs cost exponentially more; pick the minimum acceptable level
|
||||
|
||||
### Mistake 4: Ignoring Error Budgets
|
||||
**Problem**: Treating any SLO miss as an emergency
|
||||
**Solution**: Error budgets exist to be spent; use them to balance feature velocity and reliability
|
||||
|
||||
### Mistake 5: Static SLOs
|
||||
**Problem**: Setting SLOs once and never updating them
|
||||
**Solution**: Review SLOs quarterly; adjust based on user feedback and business changes
|
||||
|
||||
## SLO Review Process
|
||||
|
||||
### Monthly SLO Review Agenda
|
||||
|
||||
1. **SLO Achievement Review**: Did we meet our SLOs?
|
||||
2. **Error Budget Analysis**: How did we spend our error budget?
|
||||
3. **Incident Correlation**: Which incidents impacted our SLOs?
|
||||
4. **SLI Quality Assessment**: Are our SLIs still meaningful?
|
||||
5. **Target Adjustment**: Should we change any targets?
|
||||
|
||||
### Quarterly SLO Health Check
|
||||
|
||||
1. **User Impact Validation**: Survey users about acceptable performance
|
||||
2. **Business Alignment**: Do SLOs still reflect business priorities?
|
||||
3. **Measurement Quality**: Are we measuring the right things?
|
||||
4. **Cost/Benefit Analysis**: Are tighter SLOs worth the investment?
|
||||
|
||||
## Tooling and Automation
|
||||
|
||||
### Essential Tools
|
||||
|
||||
1. **Metrics Collection**: Prometheus, InfluxDB, CloudWatch
|
||||
2. **Alerting**: Alertmanager, PagerDuty, OpsGenie
|
||||
3. **Dashboards**: Grafana, DataDog, New Relic
|
||||
4. **SLO Platforms**: Sloth, Pyrra, Service Level Blue
|
||||
|
||||
### Automation Opportunities
|
||||
|
||||
- **Burn rate alert generation** from SLO definitions
|
||||
- **Dashboard creation** from SLO specifications
|
||||
- **Error budget calculation** and tracking
|
||||
- **Release blocking** based on error budget consumption
|
||||
|
||||
## Getting Started Checklist
|
||||
|
||||
- [ ] Identify your service's critical user journeys
|
||||
- [ ] Choose 1-2 SLIs that best reflect user experience
|
||||
- [ ] Collect 4-6 weeks of baseline data
|
||||
- [ ] Set initial SLO targets based on historical performance
|
||||
- [ ] Implement basic SLO monitoring and alerting
|
||||
- [ ] Create an SLO dashboard
|
||||
- [ ] Define error budget policies
|
||||
- [ ] Schedule monthly SLO reviews
|
||||
- [ ] Plan for quarterly SLO health checks
|
||||
|
||||
Remember: SLOs are a journey, not a destination. Start simple, learn from experience, and iterate toward better reliability management.
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,670 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
SLO Designer - Generate comprehensive SLI/SLO frameworks for services
|
||||
|
||||
This script analyzes service descriptions and generates complete SLO frameworks including:
|
||||
- SLI definitions based on service characteristics
|
||||
- SLO targets based on criticality and user impact
|
||||
- Error budget calculations and policies
|
||||
- Multi-window burn rate alerts
|
||||
- SLA recommendations for customer-facing services
|
||||
|
||||
Usage:
|
||||
python slo_designer.py --input service_definition.json --output slo_framework.json
|
||||
python slo_designer.py --service-type api --criticality high --user-facing true
|
||||
"""
|
||||
|
||||
import argparse
import json
import math
import sys
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Tuple
|
||||
|
||||
|
||||
class SLODesigner:
    """Design and generate SLO frameworks for services.

    Holds criticality-based target tables and service-type SLI templates;
    generate_framework() is the main entry point.
    """

    # SLO target recommendations based on service criticality.
    # Latency targets are in milliseconds; availability and error_rate are
    # decimal fractions (0.999 == 99.9%).
    SLO_TARGETS = {
        'critical': {
            'availability': 0.9999,  # 99.99% - 4.38 minutes downtime/month
            'latency_p95': 100,      # 95th percentile latency in ms
            'latency_p99': 500,      # 99th percentile latency in ms
            'error_rate': 0.001      # 0.1% error rate
        },
        'high': {
            'availability': 0.999,   # 99.9% - 43.8 minutes downtime/month
            'latency_p95': 200,      # 95th percentile latency in ms
            'latency_p99': 1000,     # 99th percentile latency in ms
            'error_rate': 0.005      # 0.5% error rate
        },
        'medium': {
            'availability': 0.995,   # 99.5% - 3.65 hours downtime/month
            'latency_p95': 500,      # 95th percentile latency in ms
            'latency_p99': 2000,     # 99th percentile latency in ms
            'error_rate': 0.01       # 1% error rate
        },
        'low': {
            'availability': 0.99,    # 99% - 7.3 hours downtime/month
            'latency_p95': 1000,     # 95th percentile latency in ms
            'latency_p99': 5000,     # 99th percentile latency in ms
            'error_rate': 0.02      # 2% error rate
        }
    }

    # Burn rate windows for multi-window alerting: each entry pairs a short
    # and a long window; an alert fires only when the burn rate exceeds the
    # threshold over BOTH windows (see _generate_burn_rate_alerts).
    BURN_RATE_WINDOWS = [
        {'short': '5m', 'long': '1h', 'burn_rate': 14.4, 'budget_consumed': '2%'},
        {'short': '30m', 'long': '6h', 'burn_rate': 6, 'budget_consumed': '5%'},
        {'short': '2h', 'long': '1d', 'burn_rate': 3, 'budget_consumed': '10%'},
        {'short': '6h', 'long': '3d', 'burn_rate': 1, 'budget_consumed': '10%'}
    ]

    # Service type specific SLI recommendations: default SLI names generated
    # per service type. Names without a template in _create_sli_definition
    # (e.g. 'replication_lag') are silently dropped by generate_slis.
    SERVICE_TYPE_SLIS = {
        'api': ['availability', 'latency', 'error_rate', 'throughput'],
        'web': ['availability', 'latency', 'error_rate', 'page_load_time'],
        'database': ['availability', 'query_latency', 'connection_success_rate', 'replication_lag'],
        'queue': ['availability', 'message_processing_time', 'queue_depth', 'message_loss_rate'],
        'batch': ['job_success_rate', 'job_duration', 'data_freshness', 'resource_utilization'],
        'ml': ['model_accuracy', 'prediction_latency', 'training_success_rate', 'feature_freshness']
    }
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the SLO Designer."""
|
||||
self.service_config = {}
|
||||
self.slo_framework = {}
|
||||
|
||||
def load_service_definition(self, file_path: str) -> Dict[str, Any]:
|
||||
"""Load service definition from JSON file."""
|
||||
try:
|
||||
with open(file_path, 'r') as f:
|
||||
return json.load(f)
|
||||
except FileNotFoundError:
|
||||
raise ValueError(f"Service definition file not found: {file_path}")
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(f"Invalid JSON in service definition: {e}")
|
||||
|
||||
def create_service_definition(self, service_type: str, criticality: str,
|
||||
user_facing: bool, name: str = None) -> Dict[str, Any]:
|
||||
"""Create a service definition from parameters."""
|
||||
return {
|
||||
'name': name or f'{service_type}_service',
|
||||
'type': service_type,
|
||||
'criticality': criticality,
|
||||
'user_facing': user_facing,
|
||||
'description': f'A {criticality} criticality {service_type} service',
|
||||
'dependencies': [],
|
||||
'team': 'platform',
|
||||
'environment': 'production'
|
||||
}
|
||||
|
||||
def generate_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""Generate Service Level Indicators based on service characteristics."""
|
||||
service_type = service_def.get('type', 'api')
|
||||
base_slis = self.SERVICE_TYPE_SLIS.get(service_type, ['availability', 'latency', 'error_rate'])
|
||||
|
||||
slis = []
|
||||
|
||||
for sli_name in base_slis:
|
||||
sli = self._create_sli_definition(sli_name, service_def)
|
||||
if sli:
|
||||
slis.append(sli)
|
||||
|
||||
# Add user-facing specific SLIs
|
||||
if service_def.get('user_facing', False):
|
||||
user_slis = self._generate_user_facing_slis(service_def)
|
||||
slis.extend(user_slis)
|
||||
|
||||
return slis
|
||||
|
||||
def _create_sli_definition(self, sli_name: str, service_def: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Create detailed SLI definition."""
|
||||
service_name = service_def.get('name', 'service')
|
||||
|
||||
sli_definitions = {
|
||||
'availability': {
|
||||
'name': 'Availability',
|
||||
'description': 'Percentage of successful requests',
|
||||
'type': 'ratio',
|
||||
'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code!~"5.."}}))',
|
||||
'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}))',
|
||||
'unit': 'percentage'
|
||||
},
|
||||
'latency': {
|
||||
'name': 'Request Latency P95',
|
||||
'description': '95th percentile of request latency',
|
||||
'type': 'threshold',
|
||||
'query': f'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m]))',
|
||||
'unit': 'seconds'
|
||||
},
|
||||
'error_rate': {
|
||||
'name': 'Error Rate',
|
||||
'description': 'Rate of 5xx errors',
|
||||
'type': 'ratio',
|
||||
'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code!~"5.."}}))',
|
||||
'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}))',
|
||||
'unit': 'percentage'
|
||||
},
|
||||
'throughput': {
|
||||
'name': 'Request Throughput',
|
||||
'description': 'Requests per second',
|
||||
'type': 'gauge',
|
||||
'query': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))',
|
||||
'unit': 'requests/sec'
|
||||
},
|
||||
'page_load_time': {
|
||||
'name': 'Page Load Time P95',
|
||||
'description': '95th percentile of page load time',
|
||||
'type': 'threshold',
|
||||
'query': f'histogram_quantile(0.95, rate(page_load_duration_seconds_bucket{{service="{service_name}"}}[5m]))',
|
||||
'unit': 'seconds'
|
||||
},
|
||||
'query_latency': {
|
||||
'name': 'Database Query Latency P95',
|
||||
'description': '95th percentile of database query latency',
|
||||
'type': 'threshold',
|
||||
'query': f'histogram_quantile(0.95, rate(db_query_duration_seconds_bucket{{service="{service_name}"}}[5m]))',
|
||||
'unit': 'seconds'
|
||||
},
|
||||
'connection_success_rate': {
|
||||
'name': 'Database Connection Success Rate',
|
||||
'description': 'Percentage of successful database connections',
|
||||
'type': 'ratio',
|
||||
'good_events': f'sum(rate(db_connections_total{{service="{service_name}",status="success"}}[5m]))',
|
||||
'total_events': f'sum(rate(db_connections_total{{service="{service_name}"}}[5m]))',
|
||||
'unit': 'percentage'
|
||||
}
|
||||
}
|
||||
|
||||
return sli_definitions.get(sli_name)
|
||||
|
||||
    def _generate_user_facing_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate additional SLIs for user-facing services.

        Adds journey-level and feature-level ratio SLIs on top of the
        per-request SLIs, since request metrics alone can miss broken
        end-to-end user experiences.
        """
        service_name = service_def.get('name', 'service')

        return [
            {
                'name': 'User Journey Success Rate',
                'description': 'Percentage of successful complete user journeys',
                'type': 'ratio',
                # Expects a journey counter labelled by outcome status.
                'good_events': f'sum(rate(user_journey_total{{service="{service_name}",status="success"}}[5m]))',
                'total_events': f'sum(rate(user_journey_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            },
            {
                'name': 'Feature Availability',
                'description': 'Percentage of time key features are available',
                'type': 'ratio',
                # Expects periodic feature health checks exported as a counter.
                'good_events': f'sum(rate(feature_checks_total{{service="{service_name}",status="available"}}[5m]))',
                'total_events': f'sum(rate(feature_checks_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            }
        ]
|
||||
|
||||
def generate_slos(self, service_def: Dict[str, Any], slis: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Generate Service Level Objectives based on service criticality."""
|
||||
criticality = service_def.get('criticality', 'medium')
|
||||
targets = self.SLO_TARGETS.get(criticality, self.SLO_TARGETS['medium'])
|
||||
|
||||
slos = []
|
||||
|
||||
for sli in slis:
|
||||
slo = self._create_slo_from_sli(sli, targets, service_def)
|
||||
if slo:
|
||||
slos.append(slo)
|
||||
|
||||
return slos
|
||||
|
||||
def _create_slo_from_sli(self, sli: Dict[str, Any], targets: Dict[str, float],
|
||||
service_def: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Create SLO definition from SLI."""
|
||||
sli_name = sli['name'].lower().replace(' ', '_')
|
||||
|
||||
# Map SLI names to target keys
|
||||
target_mapping = {
|
||||
'availability': 'availability',
|
||||
'request_latency_p95': 'latency_p95',
|
||||
'error_rate': 'error_rate',
|
||||
'user_journey_success_rate': 'availability',
|
||||
'feature_availability': 'availability',
|
||||
'page_load_time_p95': 'latency_p95',
|
||||
'database_query_latency_p95': 'latency_p95',
|
||||
'database_connection_success_rate': 'availability'
|
||||
}
|
||||
|
||||
target_key = target_mapping.get(sli_name)
|
||||
if not target_key:
|
||||
return None
|
||||
|
||||
target_value = targets.get(target_key)
|
||||
if target_value is None:
|
||||
return None
|
||||
|
||||
# Determine comparison operator and format target
|
||||
if 'latency' in sli_name or 'duration' in sli_name:
|
||||
operator = '<='
|
||||
target_display = f"{target_value}ms" if target_value < 10 else f"{target_value/1000}s"
|
||||
elif 'rate' in sli_name and 'error' in sli_name:
|
||||
operator = '<='
|
||||
target_display = f"{target_value * 100}%"
|
||||
target_value = target_value # Keep as decimal
|
||||
else:
|
||||
operator = '>='
|
||||
target_display = f"{target_value * 100}%"
|
||||
|
||||
# Calculate time windows
|
||||
time_windows = ['1h', '1d', '7d', '30d']
|
||||
|
||||
slo = {
|
||||
'name': f"{sli['name']} SLO",
|
||||
'description': f"Service level objective for {sli['description'].lower()}",
|
||||
'sli_name': sli['name'],
|
||||
'target_value': target_value,
|
||||
'target_display': target_display,
|
||||
'operator': operator,
|
||||
'time_windows': time_windows,
|
||||
'measurement_window': '30d',
|
||||
'service': service_def.get('name', 'service'),
|
||||
'criticality': service_def.get('criticality', 'medium')
|
||||
}
|
||||
|
||||
return slo
|
||||
|
||||
def calculate_error_budgets(self, slos: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Calculate error budgets for SLOs."""
|
||||
error_budgets = []
|
||||
|
||||
for slo in slos:
|
||||
if slo['operator'] == '>=': # Availability-type SLOs
|
||||
target = slo['target_value']
|
||||
error_budget_rate = 1 - target
|
||||
|
||||
# Calculate budget for different time windows
|
||||
time_windows = {
|
||||
'1h': 3600,
|
||||
'1d': 86400,
|
||||
'7d': 604800,
|
||||
'30d': 2592000
|
||||
}
|
||||
|
||||
budgets = {}
|
||||
for window, seconds in time_windows.items():
|
||||
budget_seconds = seconds * error_budget_rate
|
||||
if budget_seconds < 60:
|
||||
budgets[window] = f"{budget_seconds:.1f} seconds"
|
||||
elif budget_seconds < 3600:
|
||||
budgets[window] = f"{budget_seconds/60:.1f} minutes"
|
||||
else:
|
||||
budgets[window] = f"{budget_seconds/3600:.1f} hours"
|
||||
|
||||
error_budget = {
|
||||
'slo_name': slo['name'],
|
||||
'error_budget_rate': error_budget_rate,
|
||||
'error_budget_percentage': f"{error_budget_rate * 100:.3f}%",
|
||||
'budgets_by_window': budgets,
|
||||
'burn_rate_alerts': self._generate_burn_rate_alerts(slo, error_budget_rate)
|
||||
}
|
||||
|
||||
error_budgets.append(error_budget)
|
||||
|
||||
return error_budgets
|
||||
|
||||
def _generate_burn_rate_alerts(self, slo: Dict[str, Any], error_budget_rate: float) -> List[Dict[str, Any]]:
|
||||
"""Generate multi-window burn rate alerts."""
|
||||
alerts = []
|
||||
service_name = slo['service']
|
||||
sli_query = self._get_sli_query_for_burn_rate(slo)
|
||||
|
||||
for window_config in self.BURN_RATE_WINDOWS:
|
||||
alert = {
|
||||
'name': f"{slo['sli_name']} Burn Rate {window_config['budget_consumed']} Alert",
|
||||
'description': f"Alert when {slo['sli_name']} is consuming error budget at {window_config['burn_rate']}x rate",
|
||||
'severity': self._determine_alert_severity(float(window_config['budget_consumed'].rstrip('%'))),
|
||||
'short_window': window_config['short'],
|
||||
'long_window': window_config['long'],
|
||||
'burn_rate_threshold': window_config['burn_rate'],
|
||||
'budget_consumed': window_config['budget_consumed'],
|
||||
'condition': f"({sli_query}_short > {window_config['burn_rate']}) and ({sli_query}_long > {window_config['burn_rate']})",
|
||||
'annotations': {
|
||||
'summary': f"High burn rate detected for {slo['sli_name']}",
|
||||
'description': f"Error budget consumption rate is {window_config['burn_rate']}x normal, will exhaust {window_config['budget_consumed']} of monthly budget"
|
||||
}
|
||||
}
|
||||
alerts.append(alert)
|
||||
|
||||
return alerts
|
||||
|
||||
def _get_sli_query_for_burn_rate(self, slo: Dict[str, Any]) -> str:
|
||||
"""Generate SLI query fragment for burn rate calculation."""
|
||||
service_name = slo['service']
|
||||
sli_name = slo['sli_name'].lower().replace(' ', '_')
|
||||
|
||||
if 'availability' in sli_name or 'success' in sli_name:
|
||||
return f"(1 - (sum(rate(http_requests_total{{service='{service_name}',code!~'5..'}})) / sum(rate(http_requests_total{{service='{service_name}'}}))))"
|
||||
elif 'error' in sli_name:
|
||||
return f"(sum(rate(http_requests_total{{service='{service_name}',code=~'5..'}})) / sum(rate(http_requests_total{{service='{service_name}'}})))"
|
||||
else:
|
||||
return f"sli_burn_rate_{sli_name}"
|
||||
|
||||
def _determine_alert_severity(self, budget_consumed_percent: float) -> str:
|
||||
"""Determine alert severity based on budget consumption rate."""
|
||||
if budget_consumed_percent <= 2:
|
||||
return 'critical'
|
||||
elif budget_consumed_percent <= 5:
|
||||
return 'warning'
|
||||
else:
|
||||
return 'info'
|
||||
|
||||
def generate_sla_recommendations(self, service_def: Dict[str, Any],
|
||||
slos: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
"""Generate SLA recommendations for customer-facing services."""
|
||||
if not service_def.get('user_facing', False):
|
||||
return {
|
||||
'applicable': False,
|
||||
'reason': 'SLA not recommended for non-user-facing services'
|
||||
}
|
||||
|
||||
criticality = service_def.get('criticality', 'medium')
|
||||
|
||||
# SLA targets should be more conservative than SLO targets
|
||||
sla_buffer = 0.001 # 0.1% buffer below SLO
|
||||
|
||||
sla_recommendations = {
|
||||
'applicable': True,
|
||||
'service': service_def.get('name'),
|
||||
'commitments': [],
|
||||
'penalties': self._generate_penalty_structure(criticality),
|
||||
'measurement_methodology': 'External synthetic monitoring from multiple geographic locations',
|
||||
'exclusions': [
|
||||
'Planned maintenance windows (with 72h advance notice)',
|
||||
'Customer-side network or infrastructure issues',
|
||||
'Force majeure events',
|
||||
'Third-party service dependencies beyond our control'
|
||||
]
|
||||
}
|
||||
|
||||
for slo in slos:
|
||||
if slo['operator'] == '>=' and 'availability' in slo['sli_name'].lower():
|
||||
sla_target = max(0.9, slo['target_value'] - sla_buffer)
|
||||
commitment = {
|
||||
'metric': slo['sli_name'],
|
||||
'target': sla_target,
|
||||
'target_display': f"{sla_target * 100:.2f}%",
|
||||
'measurement_window': 'monthly',
|
||||
'measurement_method': 'Uptime monitoring with 1-minute granularity'
|
||||
}
|
||||
sla_recommendations['commitments'].append(commitment)
|
||||
|
||||
return sla_recommendations
|
||||
|
||||
def _generate_penalty_structure(self, criticality: str) -> List[Dict[str, Any]]:
|
||||
"""Generate penalty structure based on service criticality."""
|
||||
penalty_structures = {
|
||||
'critical': [
|
||||
{'breach_threshold': '< 99.99%', 'credit_percentage': 10},
|
||||
{'breach_threshold': '< 99.9%', 'credit_percentage': 25},
|
||||
{'breach_threshold': '< 99%', 'credit_percentage': 50}
|
||||
],
|
||||
'high': [
|
||||
{'breach_threshold': '< 99.9%', 'credit_percentage': 10},
|
||||
{'breach_threshold': '< 99.5%', 'credit_percentage': 25}
|
||||
],
|
||||
'medium': [
|
||||
{'breach_threshold': '< 99.5%', 'credit_percentage': 10}
|
||||
],
|
||||
'low': []
|
||||
}
|
||||
|
||||
return penalty_structures.get(criticality, [])
|
||||
|
||||
def generate_framework(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Generate complete SLO framework."""
|
||||
# Generate SLIs
|
||||
slis = self.generate_slis(service_def)
|
||||
|
||||
# Generate SLOs
|
||||
slos = self.generate_slos(service_def, slis)
|
||||
|
||||
# Calculate error budgets
|
||||
error_budgets = self.calculate_error_budgets(slos)
|
||||
|
||||
# Generate SLA recommendations
|
||||
sla_recommendations = self.generate_sla_recommendations(service_def, slos)
|
||||
|
||||
# Create comprehensive framework
|
||||
framework = {
|
||||
'metadata': {
|
||||
'service': service_def,
|
||||
'generated_at': datetime.utcnow().isoformat() + 'Z',
|
||||
'framework_version': '1.0'
|
||||
},
|
||||
'slis': slis,
|
||||
'slos': slos,
|
||||
'error_budgets': error_budgets,
|
||||
'sla_recommendations': sla_recommendations,
|
||||
'monitoring_recommendations': self._generate_monitoring_recommendations(service_def),
|
||||
'implementation_guide': self._generate_implementation_guide(service_def, slis, slos)
|
||||
}
|
||||
|
||||
return framework
|
||||
|
||||
    def _generate_monitoring_recommendations(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Generate monitoring tool recommendations.

        Returns baseline metrics/logging/tracing guidance for every service;
        web services additionally get synthetic (black-box) monitoring advice.
        """
        service_type = service_def.get('type', 'api')

        recommendations = {
            'metrics': {
                'collection': 'Prometheus with service discovery',
                'retention': '90 days for raw metrics, 1 year for aggregated',
                'alerting': 'Prometheus Alertmanager with multi-window burn rate alerts'
            },
            'logging': {
                'format': 'Structured JSON logs with correlation IDs',
                'aggregation': 'ELK stack or equivalent with proper indexing',
                'retention': '30 days for debug logs, 90 days for error logs'
            },
            'tracing': {
                'sampling': 'Adaptive sampling with 1% base rate',
                'storage': 'Jaeger or Zipkin with 7-day retention',
                'integration': 'OpenTelemetry instrumentation'
            }
        }

        # Only web front-ends get synthetic monitoring recommendations.
        if service_type == 'web':
            recommendations['synthetic_monitoring'] = {
                'frequency': 'Every 1 minute from 3+ geographic locations',
                'checks': 'Full user journey simulation',
                'tools': 'Pingdom, DataDog Synthetics, or equivalent'
            }

        return recommendations
|
||||
|
||||
    def _generate_implementation_guide(self, service_def: Dict[str, Any],
                                       slis: List[Dict[str, Any]],
                                       slos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate an implementation guide for the SLO framework.

        The guide is currently static (the service_def/slis/slos parameters
        are accepted for future per-service tailoring) and covers
        prerequisites, ordered rollout steps with effort estimates, and a
        validation checklist.
        """
        return {
            'prerequisites': [
                'Service instrumented with metrics collection (Prometheus format)',
                'Structured logging with correlation IDs',
                'Monitoring infrastructure (Prometheus, Grafana, Alertmanager)',
                'Incident response processes and escalation policies'
            ],
            'implementation_steps': [
                {
                    'step': 1,
                    'title': 'Instrument Service',
                    'description': 'Add metrics collection for all defined SLIs',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 2,
                    'title': 'Configure Recording Rules',
                    'description': 'Set up Prometheus recording rules for SLI calculations',
                    'estimated_effort': '4-8 hours'
                },
                {
                    'step': 3,
                    'title': 'Implement Burn Rate Alerts',
                    'description': 'Configure multi-window burn rate alerting rules',
                    'estimated_effort': '1 day'
                },
                {
                    'step': 4,
                    'title': 'Create SLO Dashboard',
                    'description': 'Build Grafana dashboard for SLO tracking and error budget monitoring',
                    'estimated_effort': '4-6 hours'
                },
                {
                    'step': 5,
                    'title': 'Test and Validate',
                    'description': 'Test alerting and validate SLI measurements against expectations',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 6,
                    'title': 'Documentation and Training',
                    'description': 'Document runbooks and train team on SLO monitoring',
                    'estimated_effort': '1 day'
                }
            ],
            'validation_checklist': [
                'All SLIs produce expected metric values',
                'Burn rate alerts fire correctly during simulated outages',
                'Error budget calculations match manual verification',
                'Dashboard displays accurate SLO achievement rates',
                'Alert routing reaches correct escalation paths',
                'Runbooks are complete and tested'
            ]
        }
|
||||
|
||||
def export_json(self, framework: Dict[str, Any], output_file: str):
|
||||
"""Export framework as JSON."""
|
||||
with open(output_file, 'w') as f:
|
||||
json.dump(framework, f, indent=2)
|
||||
|
||||
    def print_summary(self, framework: Dict[str, Any]):
        """Print a human-readable summary of the SLO framework to stdout.

        Covers service details, SLIs, SLOs, error budgets, and SLA
        recommendations. Purely informational; returns nothing.
        """
        service = framework['metadata']['service']
        slis = framework['slis']
        slos = framework['slos']
        error_budgets = framework['error_budgets']

        print(f"\n{'='*60}")
        print(f"SLO FRAMEWORK SUMMARY FOR {service['name'].upper()}")
        print(f"{'='*60}")

        print(f"\nService Details:")
        print(f"  Type: {service['type']}")
        print(f"  Criticality: {service['criticality']}")
        print(f"  User Facing: {'Yes' if service.get('user_facing') else 'No'}")
        print(f"  Team: {service.get('team', 'Unknown')}")

        print(f"\nService Level Indicators ({len(slis)}):")
        for i, sli in enumerate(slis, 1):
            print(f"  {i}. {sli['name']}")
            print(f"     Description: {sli['description']}")
            print(f"     Type: {sli['type']}")
            print()

        print(f"Service Level Objectives ({len(slos)}):")
        for i, slo in enumerate(slos, 1):
            print(f"  {i}. {slo['name']}")
            print(f"     Target: {slo['target_display']}")
            print(f"     Measurement Window: {slo['measurement_window']}")
            print()

        print(f"Error Budget Summary:")
        for budget in error_budgets:
            print(f"  {budget['slo_name']}:")
            print(f"    Monthly Budget: {budget['error_budget_percentage']}")
            print(f"    Burn Rate Alerts: {len(budget['burn_rate_alerts'])}")
            print()

        # SLA section is conditional: non-user-facing services carry a reason
        # string instead of commitments.
        sla = framework['sla_recommendations']
        if sla['applicable']:
            print(f"SLA Recommendations:")
            print(f"  Commitments: {len(sla['commitments'])}")
            print(f"  Penalty Tiers: {len(sla['penalties'])}")
        else:
            print(f"SLA Recommendations: {sla['reason']}")

        print(f"\nImplementation Timeline: 1-2 weeks")
        print(f"Framework generated at: {framework['metadata']['generated_at']}")
        print(f"{'='*60}\n")
|
||||
|
||||
|
||||
def main():
    """Main function for CLI usage.

    Builds the service definition (from an --input JSON file or from the
    --service-type/--criticality/--user-facing flags), generates the
    framework, optionally writes it to JSON, and always prints a summary.
    Exits with status 1 on any error.
    """
    parser = argparse.ArgumentParser(
        description='Generate comprehensive SLO frameworks for services',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate from service definition file
  python slo_designer.py --input service.json --output framework.json

  # Generate from command line parameters
  python slo_designer.py --service-type api --criticality high --user-facing true --output framework.json

  # Generate and display summary only
  python slo_designer.py --service-type web --criticality critical --user-facing true --summary-only
"""
    )

    parser.add_argument('--input', '-i',
                        help='Input service definition JSON file')
    parser.add_argument('--output', '-o',
                        help='Output framework JSON file')
    parser.add_argument('--service-type',
                        choices=['api', 'web', 'database', 'queue', 'batch', 'ml'],
                        help='Service type')
    parser.add_argument('--criticality',
                        choices=['critical', 'high', 'medium', 'low'],
                        help='Service criticality level')
    parser.add_argument('--user-facing',
                        choices=['true', 'false'],
                        help='Whether service is user-facing')
    parser.add_argument('--service-name',
                        help='Service name')
    parser.add_argument('--summary-only', action='store_true',
                        help='Only display summary, do not save JSON')

    args = parser.parse_args()

    # Either a definition file or the full trio of inline parameters is required.
    if not args.input and not (args.service_type and args.criticality and args.user_facing):
        parser.error("Must provide either --input file or --service-type, --criticality, and --user-facing")

    designer = SLODesigner()

    try:
        # Load or create service definition
        if args.input:
            service_def = designer.load_service_definition(args.input)
        else:
            # argparse delivers --user-facing as the string 'true'/'false'.
            user_facing = args.user_facing.lower() == 'true'
            service_def = designer.create_service_definition(
                args.service_type, args.criticality, user_facing, args.service_name
            )

        # Generate framework
        framework = designer.generate_framework(service_def)

        # Output results
        if not args.summary_only:
            output_file = args.output or f"{service_def['name']}_slo_framework.json"
            designer.export_json(framework, output_file)
            print(f"SLO framework saved to: {output_file}")

        # Always show summary
        designer.print_summary(framework)

    except Exception as e:
        # Single top-level boundary: report and exit non-zero for shell use.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Allow use both as a library (import SLODesigner) and as a CLI script.
if __name__ == '__main__':
    main()
|
||||