Files
2026-03-12 15:17:52 +07:00

670 lines
29 KiB
Python

#!/usr/bin/env python3
"""
SLO Designer - Generate comprehensive SLI/SLO frameworks for services
This script analyzes service descriptions and generates complete SLO frameworks including:
- SLI definitions based on service characteristics
- SLO targets based on criticality and user impact
- Error budget calculations and policies
- Multi-window burn rate alerts
- SLA recommendations for customer-facing services
Usage:
python slo_designer.py --input service_definition.json --output slo_framework.json
python slo_designer.py --service-type api --criticality high --user-facing true
"""
import argparse
import json
import math
import sys
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple
class SLODesigner:
    """Design and generate SLO frameworks for services.

    A framework bundles SLI definitions, SLO targets, error budgets with
    multi-window burn rate alerts, SLA recommendations, and monitoring and
    implementation guidance for one service definition (a plain dict with
    keys such as ``name``, ``type``, ``criticality`` and ``user_facing``).
    """

    # SLO target recommendations based on service criticality.
    # ``availability`` and ``error_rate`` are fractions; latency targets are
    # expressed in milliseconds.
    SLO_TARGETS = {
        'critical': {
            'availability': 0.9999,  # 99.99% - 4.38 minutes downtime/month
            'latency_p95': 100,      # 95th percentile latency in ms
            'latency_p99': 500,      # 99th percentile latency in ms
            'error_rate': 0.001      # 0.1% error rate
        },
        'high': {
            'availability': 0.999,   # 99.9% - 43.8 minutes downtime/month
            'latency_p95': 200,      # 95th percentile latency in ms
            'latency_p99': 1000,     # 99th percentile latency in ms
            'error_rate': 0.005      # 0.5% error rate
        },
        'medium': {
            'availability': 0.995,   # 99.5% - 3.65 hours downtime/month
            'latency_p95': 500,      # 95th percentile latency in ms
            'latency_p99': 2000,     # 99th percentile latency in ms
            'error_rate': 0.01       # 1% error rate
        },
        'low': {
            'availability': 0.99,    # 99% - 7.3 hours downtime/month
            'latency_p95': 1000,     # 95th percentile latency in ms
            'latency_p99': 5000,     # 99th percentile latency in ms
            'error_rate': 0.02       # 2% error rate
        }
    }

    # Burn rate windows for multi-window alerting. An alert fires only when
    # both the short and the long window exceed the burn rate.
    BURN_RATE_WINDOWS = [
        {'short': '5m', 'long': '1h', 'burn_rate': 14.4, 'budget_consumed': '2%'},
        {'short': '30m', 'long': '6h', 'burn_rate': 6, 'budget_consumed': '5%'},
        {'short': '2h', 'long': '1d', 'burn_rate': 3, 'budget_consumed': '10%'},
        {'short': '6h', 'long': '3d', 'burn_rate': 1, 'budget_consumed': '10%'}
    ]

    # Service type specific SLI recommendations. Names without an entry in
    # _create_sli_definition() are skipped by generate_slis().
    SERVICE_TYPE_SLIS = {
        'api': ['availability', 'latency', 'error_rate', 'throughput'],
        'web': ['availability', 'latency', 'error_rate', 'page_load_time'],
        'database': ['availability', 'query_latency', 'connection_success_rate', 'replication_lag'],
        'queue': ['availability', 'message_processing_time', 'queue_depth', 'message_loss_rate'],
        'batch': ['job_success_rate', 'job_duration', 'data_freshness', 'resource_utilization'],
        'ml': ['model_accuracy', 'prediction_latency', 'training_success_rate', 'feature_freshness']
    }

    def __init__(self):
        """Initialize the SLO Designer with empty state."""
        self.service_config = {}
        self.slo_framework = {}

    def load_service_definition(self, file_path: str) -> Dict[str, Any]:
        """Load a service definition from a JSON file.

        Args:
            file_path: Path of the JSON document to read (decoded as UTF-8).

        Returns:
            The parsed service definition.

        Raises:
            ValueError: If the file is missing, is not valid JSON, or its
                top-level value is not a JSON object.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                service_def = json.load(f)
        except FileNotFoundError as e:
            raise ValueError(f"Service definition file not found: {file_path}") from e
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in service definition: {e}") from e
        # Every generator reads the definition through dict.get(); reject other
        # JSON shapes here instead of failing later with an AttributeError.
        if not isinstance(service_def, dict):
            raise ValueError(
                "Service definition must be a JSON object, "
                f"got {type(service_def).__name__}"
            )
        return service_def

    def create_service_definition(self, service_type: str, criticality: str,
                                  user_facing: bool, name: Optional[str] = None) -> Dict[str, Any]:
        """Create a service definition from parameters.

        Args:
            service_type: Service type, e.g. ``'api'`` or ``'web'``.
            criticality: Criticality level, e.g. ``'high'``.
            user_facing: Whether the service is user-facing.
            name: Service name; defaults to ``'<service_type>_service'``.

        Returns:
            A service definition dict with default team, environment and an
            empty dependency list.
        """
        return {
            'name': name or f'{service_type}_service',
            'type': service_type,
            'criticality': criticality,
            'user_facing': user_facing,
            'description': f'A {criticality} criticality {service_type} service',
            'dependencies': [],
            'team': 'platform',
            'environment': 'production'
        }

    def generate_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate Service Level Indicators based on service characteristics.

        SLI names come from SERVICE_TYPE_SLIS for the service type (default
        type ``'api'``; unknown types get availability, latency and error
        rate). Names without a definition are skipped. User-facing services
        additionally get the user-journey and feature-availability SLIs.
        """
        service_type = service_def.get('type', 'api')
        base_slis = self.SERVICE_TYPE_SLIS.get(service_type, ['availability', 'latency', 'error_rate'])

        candidates = (self._create_sli_definition(sli_name, service_def) for sli_name in base_slis)
        slis = [sli for sli in candidates if sli]

        # Add user-facing specific SLIs
        if service_def.get('user_facing', False):
            slis.extend(self._generate_user_facing_slis(service_def))
        return slis

    def _create_sli_definition(self, sli_name: str, service_def: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Create the detailed definition for one SLI.

        Returns:
            The SLI definition, or ``None`` when ``sli_name`` has no
            definition here.
        """
        service_name = service_def.get('name', 'service')
        sli_definitions = {
            'availability': {
                'name': 'Availability',
                'description': 'Percentage of successful requests',
                'type': 'ratio',
                # rate() needs a range vector, hence the explicit [5m] window.
                'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code!~"5.."}}[5m]))',
                'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            },
            'latency': {
                'name': 'Request Latency P95',
                'description': '95th percentile of request latency',
                'type': 'threshold',
                'query': f'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m]))',
                'unit': 'seconds'
            },
            'error_rate': {
                'name': 'Error Rate',
                'description': 'Rate of 5xx errors',
                'type': 'ratio',
                # The key name is kept for schema compatibility, but for this
                # SLI the numerator selects the 5xx responses so that the ratio
                # is the error rate compared by the "<=" SLO.
                'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code=~"5.."}}[5m]))',
                'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            },
            'throughput': {
                'name': 'Request Throughput',
                'description': 'Requests per second',
                'type': 'gauge',
                'query': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))',
                'unit': 'requests/sec'
            },
            'page_load_time': {
                'name': 'Page Load Time P95',
                'description': '95th percentile of page load time',
                'type': 'threshold',
                'query': f'histogram_quantile(0.95, rate(page_load_duration_seconds_bucket{{service="{service_name}"}}[5m]))',
                'unit': 'seconds'
            },
            'query_latency': {
                'name': 'Database Query Latency P95',
                'description': '95th percentile of database query latency',
                'type': 'threshold',
                'query': f'histogram_quantile(0.95, rate(db_query_duration_seconds_bucket{{service="{service_name}"}}[5m]))',
                'unit': 'seconds'
            },
            'connection_success_rate': {
                'name': 'Database Connection Success Rate',
                'description': 'Percentage of successful database connections',
                'type': 'ratio',
                'good_events': f'sum(rate(db_connections_total{{service="{service_name}",status="success"}}[5m]))',
                'total_events': f'sum(rate(db_connections_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            }
        }
        return sli_definitions.get(sli_name)

    def _generate_user_facing_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate additional SLIs for user-facing services."""
        service_name = service_def.get('name', 'service')
        return [
            {
                'name': 'User Journey Success Rate',
                'description': 'Percentage of successful complete user journeys',
                'type': 'ratio',
                'good_events': f'sum(rate(user_journey_total{{service="{service_name}",status="success"}}[5m]))',
                'total_events': f'sum(rate(user_journey_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            },
            {
                'name': 'Feature Availability',
                'description': 'Percentage of time key features are available',
                'type': 'ratio',
                'good_events': f'sum(rate(feature_checks_total{{service="{service_name}",status="available"}}[5m]))',
                'total_events': f'sum(rate(feature_checks_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            }
        ]

    def generate_slos(self, service_def: Dict[str, Any], slis: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Generate Service Level Objectives based on service criticality.

        Targets come from SLO_TARGETS for the service criticality; a missing
        or unknown criticality falls back to ``'medium'``. SLIs without a
        target mapping produce no SLO.
        """
        criticality = service_def.get('criticality', 'medium')
        targets = self.SLO_TARGETS.get(criticality, self.SLO_TARGETS['medium'])

        candidates = (self._create_slo_from_sli(sli, targets, service_def) for sli in slis)
        return [slo for slo in candidates if slo]

    def _create_slo_from_sli(self, sli: Dict[str, Any], targets: Dict[str, float],
                             service_def: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Create an SLO definition from an SLI.

        Returns:
            The SLO dict, or ``None`` when the SLI has no target mapping or
            ``targets`` lacks the mapped key.

        Note:
            For latency SLOs ``target_value`` stays in milliseconds (as in
            SLO_TARGETS), while the latency SLI definitions declare their
            unit as seconds.
        """
        sli_name = sli['name'].lower().replace(' ', '_')

        # Map SLI names to target keys
        target_mapping = {
            'availability': 'availability',
            'request_latency_p95': 'latency_p95',
            'error_rate': 'error_rate',
            'user_journey_success_rate': 'availability',
            'feature_availability': 'availability',
            'page_load_time_p95': 'latency_p95',
            'database_query_latency_p95': 'latency_p95',
            'database_connection_success_rate': 'availability'
        }
        target_key = target_mapping.get(sli_name)
        if not target_key:
            return None
        target_value = targets.get(target_key)
        if target_value is None:
            return None

        # Pick operator and display from the target key rather than the SLI
        # name, so that latency-mapped SLIs whose name lacks the word
        # "latency" (page load time) are not treated as availability ratios.
        if target_key.startswith('latency'):
            operator = '<='
            if target_value < 1000:
                target_display = f"{target_value}ms"
            else:
                target_display = f"{target_value / 1000}s"
        elif target_key == 'error_rate':
            operator = '<='
            target_display = f"{target_value * 100}%"
        else:
            operator = '>='
            target_display = f"{target_value * 100}%"

        return {
            'name': f"{sli['name']} SLO",
            'description': f"Service level objective for {sli['description'].lower()}",
            'sli_name': sli['name'],
            'target_value': target_value,
            'target_display': target_display,
            'operator': operator,
            'time_windows': ['1h', '1d', '7d', '30d'],
            'measurement_window': '30d',
            'service': service_def.get('name', 'service'),
            'criticality': service_def.get('criticality', 'medium')
        }

    @staticmethod
    def _format_budget_duration(budget_seconds: float) -> str:
        """Render an allowed-downtime duration in seconds, minutes or hours."""
        if budget_seconds < 60:
            return f"{budget_seconds:.1f} seconds"
        if budget_seconds < 3600:
            return f"{budget_seconds / 60:.1f} minutes"
        return f"{budget_seconds / 3600:.1f} hours"

    def calculate_error_budgets(self, slos: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Calculate error budgets for availability-type (``>=``) SLOs.

        Each budget carries the budget rate (``1 - target``), the allowed
        downtime per time window, and the burn rate alerts for the SLO.
        SLOs with any other operator are skipped.
        """
        window_seconds = {
            '1h': 3600,
            '1d': 86400,
            '7d': 604800,
            '30d': 2592000
        }
        error_budgets = []
        for slo in slos:
            if slo['operator'] != '>=':
                continue
            error_budget_rate = 1 - slo['target_value']
            budgets = {
                window: self._format_budget_duration(seconds * error_budget_rate)
                for window, seconds in window_seconds.items()
            }
            error_budgets.append({
                'slo_name': slo['name'],
                'error_budget_rate': error_budget_rate,
                'error_budget_percentage': f"{error_budget_rate * 100:.3f}%",
                'budgets_by_window': budgets,
                'burn_rate_alerts': self._generate_burn_rate_alerts(slo, error_budget_rate)
            })
        return error_budgets

    def _generate_burn_rate_alerts(self, slo: Dict[str, Any], error_budget_rate: float) -> List[Dict[str, Any]]:
        """Generate multi-window burn rate alerts for one SLO.

        An alert's condition holds when the error ratio over both the short
        and the long window exceeds ``burn_rate * error_budget_rate``, i.e.
        when the budget is being consumed at more than ``burn_rate`` times
        the sustainable pace.
        """
        alerts = []
        for window_config in self.BURN_RATE_WINDOWS:
            burn_rate = window_config['burn_rate']
            budget_consumed = window_config['budget_consumed']
            short_query = self._get_sli_query_for_burn_rate(slo, window_config['short'])
            long_query = self._get_sli_query_for_burn_rate(slo, window_config['long'])
            # Burn rate is error ratio / error budget, so the error-ratio
            # threshold is the burn rate scaled by the budget. The 'g' format
            # hides float noise such as 0.014400000000000013.
            threshold = f"{burn_rate * error_budget_rate:.6g}"
            alerts.append({
                'name': f"{slo['sli_name']} Burn Rate {budget_consumed} Alert",
                'description': f"Alert when {slo['sli_name']} is consuming error budget at {burn_rate}x rate",
                'severity': self._determine_alert_severity(float(budget_consumed.rstrip('%'))),
                'short_window': window_config['short'],
                'long_window': window_config['long'],
                'burn_rate_threshold': burn_rate,
                'budget_consumed': budget_consumed,
                'condition': f"({short_query} > {threshold}) and ({long_query} > {threshold})",
                'annotations': {
                    'summary': f"High burn rate detected for {slo['sli_name']}",
                    'description': f"Error budget consumption rate is {burn_rate}x normal, will exhaust {budget_consumed} of monthly budget"
                }
            })
        return alerts

    def _get_sli_query_for_burn_rate(self, slo: Dict[str, Any], window: str = '5m') -> str:
        """Generate the error-ratio query for a burn rate calculation.

        Args:
            slo: SLO dict providing ``service`` and ``sli_name``.
            window: PromQL range (e.g. ``'5m'``, ``'1h'``) used inside
                ``rate()``.

        Returns:
            A PromQL expression for the SLI's error ratio over ``window``.
            SLIs that are neither availability/success nor error based get a
            placeholder recording-rule name that is expected to expose the
            same ratio.
        """
        service_name = slo['service']
        sli_name = slo['sli_name'].lower().replace(' ', '_')
        total = f"sum(rate(http_requests_total{{service='{service_name}'}}[{window}]))"
        if 'availability' in sli_name or 'success' in sli_name:
            good = f"sum(rate(http_requests_total{{service='{service_name}',code!~'5..'}}[{window}]))"
            return f"(1 - ({good} / {total}))"
        if 'error' in sli_name:
            bad = f"sum(rate(http_requests_total{{service='{service_name}',code=~'5..'}}[{window}]))"
            return f"({bad} / {total})"
        return f"sli_error_ratio_{sli_name}_{window}"

    def _determine_alert_severity(self, budget_consumed_percent: float) -> str:
        """Determine alert severity based on budget consumption rate.

        Up to 2% maps to ``'critical'``, up to 5% to ``'warning'``, anything
        higher to ``'info'``.
        """
        if budget_consumed_percent <= 2:
            return 'critical'
        if budget_consumed_percent <= 5:
            return 'warning'
        return 'info'

    def generate_sla_recommendations(self, service_def: Dict[str, Any],
                                     slos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate SLA recommendations for customer-facing services.

        Non-user-facing services get ``{'applicable': False, ...}``. For
        user-facing services every ``>=`` SLO whose SLI name contains
        "availability" yields a commitment set 0.1 percentage points below
        the SLO target, floored at 90%.
        """
        if not service_def.get('user_facing', False):
            return {
                'applicable': False,
                'reason': 'SLA not recommended for non-user-facing services'
            }

        criticality = service_def.get('criticality', 'medium')
        # SLA targets should be more conservative than SLO targets
        sla_buffer = 0.001  # 0.1% buffer below SLO

        commitments = []
        for slo in slos:
            if slo['operator'] != '>=' or 'availability' not in slo['sli_name'].lower():
                continue
            sla_target = max(0.9, slo['target_value'] - sla_buffer)
            commitments.append({
                'metric': slo['sli_name'],
                'target': sla_target,
                'target_display': f"{sla_target * 100:.2f}%",
                'measurement_window': 'monthly',
                'measurement_method': 'Uptime monitoring with 1-minute granularity'
            })

        return {
            'applicable': True,
            'service': service_def.get('name'),
            'commitments': commitments,
            'penalties': self._generate_penalty_structure(criticality),
            'measurement_methodology': 'External synthetic monitoring from multiple geographic locations',
            'exclusions': [
                'Planned maintenance windows (with 72h advance notice)',
                'Customer-side network or infrastructure issues',
                'Force majeure events',
                'Third-party service dependencies beyond our control'
            ]
        }

    def _generate_penalty_structure(self, criticality: str) -> List[Dict[str, Any]]:
        """Generate the penalty tiers for a criticality (empty if unknown)."""
        penalty_structures = {
            'critical': [
                {'breach_threshold': '< 99.99%', 'credit_percentage': 10},
                {'breach_threshold': '< 99.9%', 'credit_percentage': 25},
                {'breach_threshold': '< 99%', 'credit_percentage': 50}
            ],
            'high': [
                {'breach_threshold': '< 99.9%', 'credit_percentage': 10},
                {'breach_threshold': '< 99.5%', 'credit_percentage': 25}
            ],
            'medium': [
                {'breach_threshold': '< 99.5%', 'credit_percentage': 10}
            ],
            'low': []
        }
        return penalty_structures.get(criticality, [])

    def generate_framework(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Generate the complete SLO framework for a service definition.

        Returns:
            A dict with ``metadata``, ``slis``, ``slos``, ``error_budgets``,
            ``sla_recommendations``, ``monitoring_recommendations`` and
            ``implementation_guide``.
        """
        slis = self.generate_slis(service_def)
        slos = self.generate_slos(service_def, slis)
        error_budgets = self.calculate_error_budgets(slos)
        sla_recommendations = self.generate_sla_recommendations(service_def, slos)

        # datetime.utcnow() is deprecated; take an aware UTC timestamp and drop
        # the tzinfo so the rendered value keeps the "<naive ISO time>Z" shape.
        generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'

        return {
            'metadata': {
                'service': service_def,
                'generated_at': generated_at,
                'framework_version': '1.0'
            },
            'slis': slis,
            'slos': slos,
            'error_budgets': error_budgets,
            'sla_recommendations': sla_recommendations,
            'monitoring_recommendations': self._generate_monitoring_recommendations(service_def),
            'implementation_guide': self._generate_implementation_guide(service_def, slis, slos)
        }

    def _generate_monitoring_recommendations(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Generate monitoring tool recommendations.

        All services get metrics, logging and tracing guidance; ``'web'``
        services additionally get synthetic monitoring guidance.
        """
        service_type = service_def.get('type', 'api')
        recommendations = {
            'metrics': {
                'collection': 'Prometheus with service discovery',
                'retention': '90 days for raw metrics, 1 year for aggregated',
                'alerting': 'Prometheus Alertmanager with multi-window burn rate alerts'
            },
            'logging': {
                'format': 'Structured JSON logs with correlation IDs',
                'aggregation': 'ELK stack or equivalent with proper indexing',
                'retention': '30 days for debug logs, 90 days for error logs'
            },
            'tracing': {
                'sampling': 'Adaptive sampling with 1% base rate',
                'storage': 'Jaeger or Zipkin with 7-day retention',
                'integration': 'OpenTelemetry instrumentation'
            }
        }
        if service_type == 'web':
            recommendations['synthetic_monitoring'] = {
                'frequency': 'Every 1 minute from 3+ geographic locations',
                'checks': 'Full user journey simulation',
                'tools': 'Pingdom, DataDog Synthetics, or equivalent'
            }
        return recommendations

    def _generate_implementation_guide(self, service_def: Dict[str, Any],
                                       slis: List[Dict[str, Any]],
                                       slos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate the implementation guide for the SLO framework.

        The guide is static: it does not depend on the arguments.
        """
        return {
            'prerequisites': [
                'Service instrumented with metrics collection (Prometheus format)',
                'Structured logging with correlation IDs',
                'Monitoring infrastructure (Prometheus, Grafana, Alertmanager)',
                'Incident response processes and escalation policies'
            ],
            'implementation_steps': [
                {
                    'step': 1,
                    'title': 'Instrument Service',
                    'description': 'Add metrics collection for all defined SLIs',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 2,
                    'title': 'Configure Recording Rules',
                    'description': 'Set up Prometheus recording rules for SLI calculations',
                    'estimated_effort': '4-8 hours'
                },
                {
                    'step': 3,
                    'title': 'Implement Burn Rate Alerts',
                    'description': 'Configure multi-window burn rate alerting rules',
                    'estimated_effort': '1 day'
                },
                {
                    'step': 4,
                    'title': 'Create SLO Dashboard',
                    'description': 'Build Grafana dashboard for SLO tracking and error budget monitoring',
                    'estimated_effort': '4-6 hours'
                },
                {
                    'step': 5,
                    'title': 'Test and Validate',
                    'description': 'Test alerting and validate SLI measurements against expectations',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 6,
                    'title': 'Documentation and Training',
                    'description': 'Document runbooks and train team on SLO monitoring',
                    'estimated_effort': '1 day'
                }
            ],
            'validation_checklist': [
                'All SLIs produce expected metric values',
                'Burn rate alerts fire correctly during simulated outages',
                'Error budget calculations match manual verification',
                'Dashboard displays accurate SLO achievement rates',
                'Alert routing reaches correct escalation paths',
                'Runbooks are complete and tested'
            ]
        }

    def export_json(self, framework: Dict[str, Any], output_file: str):
        """Export the framework as indented JSON (UTF-8) to ``output_file``."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(framework, f, indent=2)

    def print_summary(self, framework: Dict[str, Any]):
        """Print a human-readable summary of the SLO framework to stdout.

        Service fields missing from the definition fall back to the same
        defaults the generators use (``'service'``, ``'api'``, ``'medium'``).
        """
        service = framework['metadata']['service']
        slis = framework['slis']
        slos = framework['slos']
        error_budgets = framework['error_budgets']
        divider = '=' * 60

        print(f"\n{divider}")
        print(f"SLO FRAMEWORK SUMMARY FOR {str(service.get('name', 'service')).upper()}")
        print(divider)

        print("\nService Details:")
        print(f" Type: {service.get('type', 'api')}")
        print(f" Criticality: {service.get('criticality', 'medium')}")
        print(f" User Facing: {'Yes' if service.get('user_facing') else 'No'}")
        print(f" Team: {service.get('team', 'Unknown')}")

        print(f"\nService Level Indicators ({len(slis)}):")
        for i, sli in enumerate(slis, 1):
            print(f" {i}. {sli['name']}")
            print(f" Description: {sli['description']}")
            print(f" Type: {sli['type']}")
            print()

        print(f"Service Level Objectives ({len(slos)}):")
        for i, slo in enumerate(slos, 1):
            print(f" {i}. {slo['name']}")
            print(f" Target: {slo['target_display']}")
            print(f" Measurement Window: {slo['measurement_window']}")
            print()

        print("Error Budget Summary:")
        for budget in error_budgets:
            print(f" {budget['slo_name']}:")
            print(f" Monthly Budget: {budget['error_budget_percentage']}")
            print(f" Burn Rate Alerts: {len(budget['burn_rate_alerts'])}")
            print()

        sla = framework['sla_recommendations']
        if sla['applicable']:
            print("SLA Recommendations:")
            print(f" Commitments: {len(sla['commitments'])}")
            print(f" Penalty Tiers: {len(sla['penalties'])}")
        else:
            print(f"SLA Recommendations: {sla['reason']}")

        print("\nImplementation Timeline: 1-2 weeks")
        print(f"Framework generated at: {framework['metadata']['generated_at']}")
        print(f"{divider}\n")
def main():
    """Run the SLO Designer command line interface.

    The service definition is either loaded from ``--input`` or built from
    ``--service-type``, ``--criticality`` and ``--user-facing``. Unless
    ``--summary-only`` is given, the generated framework is written to
    ``--output`` (default ``<service name>_slo_framework.json``). A summary
    is always printed. Any failure is reported on stderr and the process
    exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Generate comprehensive SLO frameworks for services',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate from service definition file
python slo_designer.py --input service.json --output framework.json
# Generate from command line parameters
python slo_designer.py --service-type api --criticality high --user-facing true --output framework.json
# Generate and display summary only
python slo_designer.py --service-type web --criticality critical --user-facing true --summary-only
"""
    )
    parser.add_argument('--input', '-i',
                        help='Input service definition JSON file')
    parser.add_argument('--output', '-o',
                        help='Output framework JSON file')
    parser.add_argument('--service-type',
                        choices=['api', 'web', 'database', 'queue', 'batch', 'ml'],
                        help='Service type')
    parser.add_argument('--criticality',
                        choices=['critical', 'high', 'medium', 'low'],
                        help='Service criticality level')
    parser.add_argument('--user-facing',
                        choices=['true', 'false'],
                        help='Whether service is user-facing')
    parser.add_argument('--service-name',
                        help='Service name')
    parser.add_argument('--summary-only', action='store_true',
                        help='Only display summary, do not save JSON')
    args = parser.parse_args()

    # --user-facing is the string 'true' or 'false', so it is truthy whenever
    # it was supplied.
    has_cli_definition = args.service_type and args.criticality and args.user_facing
    if not args.input and not has_cli_definition:
        parser.error("Must provide either --input file or --service-type, --criticality, and --user-facing")

    designer = SLODesigner()
    try:
        # Load or create service definition
        if args.input:
            service_def = designer.load_service_definition(args.input)
        else:
            user_facing = args.user_facing.lower() == 'true'
            service_def = designer.create_service_definition(
                args.service_type, args.criticality, user_facing, args.service_name
            )

        # Generate framework
        framework = designer.generate_framework(service_def)

        # Output results
        if not args.summary_only:
            # A definition loaded from a file may omit 'name'; use the same
            # 'service' default as the generators instead of raising KeyError.
            default_output = f"{service_def.get('name', 'service')}_slo_framework.json"
            output_file = args.output or default_output
            designer.export_json(framework, output_file)
            print(f"SLO framework saved to: {output_file}")

        # Always show summary
        designer.print_summary(framework)
    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()