add brain
This commit is contained in:
@@ -0,0 +1,670 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
SLO Designer - Generate comprehensive SLI/SLO frameworks for services
|
||||
|
||||
This script analyzes service descriptions and generates complete SLO frameworks including:
|
||||
- SLI definitions based on service characteristics
|
||||
- SLO targets based on criticality and user impact
|
||||
- Error budget calculations and policies
|
||||
- Multi-window burn rate alerts
|
||||
- SLA recommendations for customer-facing services
|
||||
|
||||
Usage:
|
||||
python slo_designer.py --input service_definition.json --output slo_framework.json
|
||||
python slo_designer.py --service-type api --criticality high --user-facing true
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
import sys
|
||||
import math
|
||||
from typing import Dict, List, Any, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
class SLODesigner:
    """Design and generate SLO frameworks for services.

    Every generator method receives the service definition explicitly and
    returns plain dict/list structures that ``export_json`` can serialize.
    """

    # SLO target recommendations based on service criticality.
    # Availability and error_rate are ratios; latency targets are milliseconds.
    SLO_TARGETS = {
        'critical': {
            'availability': 0.9999,  # 99.99% - 4.38 minutes downtime/month
            'latency_p95': 100,      # 95th percentile latency in ms
            'latency_p99': 500,      # 99th percentile latency in ms
            'error_rate': 0.001      # 0.1% error rate
        },
        'high': {
            'availability': 0.999,   # 99.9% - 43.8 minutes downtime/month
            'latency_p95': 200,      # 95th percentile latency in ms
            'latency_p99': 1000,     # 99th percentile latency in ms
            'error_rate': 0.005      # 0.5% error rate
        },
        'medium': {
            'availability': 0.995,   # 99.5% - 3.65 hours downtime/month
            'latency_p95': 500,      # 95th percentile latency in ms
            'latency_p99': 2000,     # 99th percentile latency in ms
            'error_rate': 0.01       # 1% error rate
        },
        'low': {
            'availability': 0.99,    # 99% - 7.3 hours downtime/month
            'latency_p95': 1000,     # 95th percentile latency in ms
            'latency_p99': 5000,     # 99th percentile latency in ms
            'error_rate': 0.02       # 2% error rate
        }
    }

    # Burn rate windows for multi-window alerting
    BURN_RATE_WINDOWS = [
        {'short': '5m', 'long': '1h', 'burn_rate': 14.4, 'budget_consumed': '2%'},
        {'short': '30m', 'long': '6h', 'burn_rate': 6, 'budget_consumed': '5%'},
        {'short': '2h', 'long': '1d', 'burn_rate': 3, 'budget_consumed': '10%'},
        {'short': '6h', 'long': '3d', 'burn_rate': 1, 'budget_consumed': '10%'}
    ]

    # Service type specific SLI recommendations.
    # Names without an entry in _create_sli_definition are skipped silently.
    SERVICE_TYPE_SLIS = {
        'api': ['availability', 'latency', 'error_rate', 'throughput'],
        'web': ['availability', 'latency', 'error_rate', 'page_load_time'],
        'database': ['availability', 'query_latency', 'connection_success_rate', 'replication_lag'],
        'queue': ['availability', 'message_processing_time', 'queue_depth', 'message_loss_rate'],
        'batch': ['job_success_rate', 'job_duration', 'data_freshness', 'resource_utilization'],
        'ml': ['model_accuracy', 'prediction_latency', 'training_success_rate', 'feature_freshness']
    }

    def __init__(self):
        """Initialize the SLO Designer."""
        self.service_config = {}
        self.slo_framework = {}

    def load_service_definition(self, file_path: str) -> Dict[str, Any]:
        """Load a service definition from a JSON file.

        Args:
            file_path: Path of the JSON document to read.

        Returns:
            The parsed service definition.

        Raises:
            ValueError: If the file is missing, is not valid JSON, or its
                top-level value is not a JSON object.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                definition = json.load(f)
        except FileNotFoundError as e:
            raise ValueError(f"Service definition file not found: {file_path}") from e
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in service definition: {e}") from e

        # Every consumer calls .get() on the definition, so reject lists/scalars
        # here with a clear message instead of failing later with AttributeError.
        if not isinstance(definition, dict):
            raise ValueError("Service definition must be a JSON object")
        return definition

    def create_service_definition(self, service_type: str, criticality: str,
                                  user_facing: bool, name: str = None) -> Dict[str, Any]:
        """Create a service definition from individual parameters.

        Args:
            service_type: Service type, e.g. ``'api'`` or ``'web'``.
            criticality: Criticality level used to select SLO targets.
            user_facing: Whether the service is user-facing.
            name: Optional service name; defaults to ``'<service_type>_service'``.

        Returns:
            A service definition dict with placeholder team/environment fields.
        """
        return {
            'name': name or f'{service_type}_service',
            'type': service_type,
            'criticality': criticality,
            'user_facing': user_facing,
            'description': f'A {criticality} criticality {service_type} service',
            'dependencies': [],
            'team': 'platform',
            'environment': 'production'
        }

    def generate_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate Service Level Indicators based on service characteristics.

        Args:
            service_def: Service definition; ``type`` defaults to ``'api'``.

        Returns:
            SLI definitions for the service type, followed by the user-facing
            SLIs when ``user_facing`` is truthy.
        """
        service_type = service_def.get('type', 'api')
        base_slis = self.SERVICE_TYPE_SLIS.get(service_type, ['availability', 'latency', 'error_rate'])

        # SLI names without a known definition come back as None and are dropped.
        candidates = (self._create_sli_definition(sli_name, service_def) for sli_name in base_slis)
        slis = [sli for sli in candidates if sli]

        # Add user-facing specific SLIs
        if service_def.get('user_facing', False):
            slis.extend(self._generate_user_facing_slis(service_def))

        return slis

    def _create_sli_definition(self, sli_name: str, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Create the detailed definition for one SLI.

        Args:
            sli_name: Key of the SLI, e.g. ``'availability'``.
            service_def: Service definition; ``name`` defaults to ``'service'``.

        Returns:
            The SLI definition dict, or ``None`` when ``sli_name`` is unknown.
        """
        service_name = service_def.get('name', 'service')

        sli_definitions = {
            'availability': {
                'name': 'Availability',
                'description': 'Percentage of successful requests',
                'type': 'ratio',
                # rate() needs a range selector; [5m] matches the other SLIs.
                'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code!~"5.."}}[5m]))',
                'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            },
            'latency': {
                'name': 'Request Latency P95',
                'description': '95th percentile of request latency',
                'type': 'threshold',
                'query': f'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m]))',
                'unit': 'seconds'
            },
            'error_rate': {
                'name': 'Error Rate',
                'description': 'Rate of 5xx errors',
                'type': 'ratio',
                # NOTE(review): this numerator counts non-5xx requests, so the
                # ratio is a success ratio although the SLO built from it is
                # "<= error_rate" -- confirm the intended numerator.
                'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code!~"5.."}}[5m]))',
                'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            },
            'throughput': {
                'name': 'Request Throughput',
                'description': 'Requests per second',
                'type': 'gauge',
                'query': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))',
                'unit': 'requests/sec'
            },
            'page_load_time': {
                'name': 'Page Load Time P95',
                'description': '95th percentile of page load time',
                'type': 'threshold',
                'query': f'histogram_quantile(0.95, rate(page_load_duration_seconds_bucket{{service="{service_name}"}}[5m]))',
                'unit': 'seconds'
            },
            'query_latency': {
                'name': 'Database Query Latency P95',
                'description': '95th percentile of database query latency',
                'type': 'threshold',
                'query': f'histogram_quantile(0.95, rate(db_query_duration_seconds_bucket{{service="{service_name}"}}[5m]))',
                'unit': 'seconds'
            },
            'connection_success_rate': {
                'name': 'Database Connection Success Rate',
                'description': 'Percentage of successful database connections',
                'type': 'ratio',
                'good_events': f'sum(rate(db_connections_total{{service="{service_name}",status="success"}}[5m]))',
                'total_events': f'sum(rate(db_connections_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            }
        }

        return sli_definitions.get(sli_name)

    def _generate_user_facing_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate the additional SLIs used for user-facing services.

        Args:
            service_def: Service definition; ``name`` defaults to ``'service'``.

        Returns:
            Two ratio SLIs: user journey success rate and feature availability.
        """
        service_name = service_def.get('name', 'service')

        return [
            {
                'name': 'User Journey Success Rate',
                'description': 'Percentage of successful complete user journeys',
                'type': 'ratio',
                'good_events': f'sum(rate(user_journey_total{{service="{service_name}",status="success"}}[5m]))',
                'total_events': f'sum(rate(user_journey_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            },
            {
                'name': 'Feature Availability',
                'description': 'Percentage of time key features are available',
                'type': 'ratio',
                'good_events': f'sum(rate(feature_checks_total{{service="{service_name}",status="available"}}[5m]))',
                'total_events': f'sum(rate(feature_checks_total{{service="{service_name}"}}[5m]))',
                'unit': 'percentage'
            }
        ]

    def generate_slos(self, service_def: Dict[str, Any], slis: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Generate Service Level Objectives based on service criticality.

        Args:
            service_def: Service definition; ``criticality`` defaults to
                ``'medium'`` and unknown levels also use the medium targets.
            slis: SLI definitions as returned by ``generate_slis``.

        Returns:
            One SLO per SLI that has a target mapping; other SLIs are skipped.
        """
        criticality = service_def.get('criticality', 'medium')
        targets = self.SLO_TARGETS.get(criticality, self.SLO_TARGETS['medium'])

        candidates = (self._create_slo_from_sli(sli, targets, service_def) for sli in slis)
        return [slo for slo in candidates if slo]

    def _create_slo_from_sli(self, sli: Dict[str, Any], targets: Dict[str, float],
                             service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Create an SLO definition from an SLI.

        Args:
            sli: SLI definition; ``name`` and ``description`` are read.
            targets: Target values keyed like the entries of ``SLO_TARGETS``.
            service_def: Service definition supplying name and criticality.

        Returns:
            The SLO dict, or ``None`` when the SLI has no target mapping or the
            mapped target is absent from ``targets``.
        """
        sli_name = sli['name'].lower().replace(' ', '_')

        # Map SLI names to target keys
        target_mapping = {
            'availability': 'availability',
            'request_latency_p95': 'latency_p95',
            'error_rate': 'error_rate',
            'user_journey_success_rate': 'availability',
            'feature_availability': 'availability',
            'page_load_time_p95': 'latency_p95',
            'database_query_latency_p95': 'latency_p95',
            'database_connection_success_rate': 'availability'
        }

        target_key = target_mapping.get(sli_name)
        if not target_key:
            return None

        target_value = targets.get(target_key)
        if target_value is None:
            return None

        # Choose operator and display from the *target kind*, not from
        # substrings of the SLI name: "page_load_time_p95" contains neither
        # "latency" nor "duration" and would otherwise be treated as a
        # ">=" percentage objective.
        if target_key.startswith('latency'):
            operator = '<='
            # NOTE(review): targets are milliseconds while the latency SLI
            # queries are declared with unit 'seconds' -- confirm conversion
            # happens wherever target_value is compared to the query.
            target_display = f"{target_value}ms" if target_value < 10 else f"{target_value/1000}s"
        elif target_key == 'error_rate':
            operator = '<='
            target_display = f"{target_value * 100}%"
        else:
            operator = '>='
            target_display = f"{target_value * 100}%"

        return {
            'name': f"{sli['name']} SLO",
            'description': f"Service level objective for {sli['description'].lower()}",
            'sli_name': sli['name'],
            'target_value': target_value,
            'target_display': target_display,
            'operator': operator,
            'time_windows': ['1h', '1d', '7d', '30d'],
            'measurement_window': '30d',
            'service': service_def.get('name', 'service'),
            'criticality': service_def.get('criticality', 'medium')
        }

    @staticmethod
    def _format_budget_duration(budget_seconds: float) -> str:
        """Render a number of seconds as seconds, minutes or hours (1 decimal)."""
        if budget_seconds < 60:
            return f"{budget_seconds:.1f} seconds"
        if budget_seconds < 3600:
            return f"{budget_seconds/60:.1f} minutes"
        return f"{budget_seconds/3600:.1f} hours"

    def calculate_error_budgets(self, slos: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Calculate error budgets for SLOs.

        Only SLOs with operator ``'>='`` (availability-type ratios) receive a
        budget; threshold-style SLOs are skipped.

        Args:
            slos: SLO definitions as returned by ``generate_slos``.

        Returns:
            One error budget dict per qualifying SLO, including the allowed
            downtime per time window and the burn rate alerts.
        """
        window_seconds = {
            '1h': 3600,
            '1d': 86400,
            '7d': 604800,
            '30d': 2592000
        }

        error_budgets = []
        for slo in slos:
            if slo['operator'] != '>=':  # Only availability-type SLOs
                continue

            # Round away binary float noise (1 - 0.999 == 0.0010000000000000009).
            error_budget_rate = round(1 - slo['target_value'], 12)

            budgets = {
                window: self._format_budget_duration(seconds * error_budget_rate)
                for window, seconds in window_seconds.items()
            }

            error_budgets.append({
                'slo_name': slo['name'],
                'error_budget_rate': error_budget_rate,
                'error_budget_percentage': f"{error_budget_rate * 100:.3f}%",
                'budgets_by_window': budgets,
                'burn_rate_alerts': self._generate_burn_rate_alerts(slo, error_budget_rate)
            })

        return error_budgets

    def _generate_burn_rate_alerts(self, slo: Dict[str, Any], error_budget_rate: float) -> List[Dict[str, Any]]:
        """Generate multi-window burn rate alerts for one SLO.

        Args:
            slo: SLO definition; ``sli_name`` and ``service`` are read.
            error_budget_rate: Allowed error ratio (``1 - target``).

        Returns:
            One alert dict per entry of ``BURN_RATE_WINDOWS``.
        """
        alerts = []
        base_query = self._get_sli_query_for_burn_rate(slo)
        # Inline expressions always start with "(" and evaluate to an error
        # ratio; the fallback is a bare recording-rule name.
        is_inline_ratio = base_query.startswith('(')

        for window_config in self.BURN_RATE_WINDOWS:
            burn_rate = window_config['burn_rate']

            if is_inline_ratio:
                # An error ratio burns budget at N x when it exceeds
                # N * error_budget_rate, evaluated over each window.
                threshold = f"{burn_rate * error_budget_rate:g}"
                short_query = self._get_sli_query_for_burn_rate(slo, window_config['short'])
                long_query = self._get_sli_query_for_burn_rate(slo, window_config['long'])
            else:
                # Recording-rule names keep the _short/_long suffix convention
                # and are compared with the burn-rate multiple directly.
                threshold = burn_rate
                short_query = f"{base_query}_short"
                long_query = f"{base_query}_long"

            alerts.append({
                'name': f"{slo['sli_name']} Burn Rate {window_config['budget_consumed']} Alert",
                'description': f"Alert when {slo['sli_name']} is consuming error budget at {burn_rate}x rate",
                'severity': self._determine_alert_severity(float(window_config['budget_consumed'].rstrip('%'))),
                'short_window': window_config['short'],
                'long_window': window_config['long'],
                'burn_rate_threshold': burn_rate,
                'budget_consumed': window_config['budget_consumed'],
                'condition': f"({short_query} > {threshold}) and ({long_query} > {threshold})",
                'annotations': {
                    'summary': f"High burn rate detected for {slo['sli_name']}",
                    'description': f"Error budget consumption rate is {burn_rate}x normal, will exhaust {window_config['budget_consumed']} of monthly budget"
                }
            })

        return alerts

    def _get_sli_query_for_burn_rate(self, slo: Dict[str, Any], window: str = None) -> str:
        """Generate the SLI query fragment used for burn rate calculation.

        Args:
            slo: SLO definition; ``service`` and ``sli_name`` are read.
            window: Optional range such as ``'5m'``. When given, it is added
                as the range selector of every ``rate()`` call; when omitted
                the expression is returned without a range selector.

        Returns:
            An error-ratio expression for availability/success/error SLIs, or
            the recording-rule name ``sli_burn_rate_<sli>`` otherwise (the
            window is not applied to that name).
        """
        service_name = slo['service']
        sli_name = slo['sli_name'].lower().replace(' ', '_')
        range_selector = f"[{window}]" if window else ""

        if 'availability' in sli_name or 'success' in sli_name:
            return (
                f"(1 - (sum(rate(http_requests_total{{service='{service_name}',code!~'5..'}}{range_selector}))"
                f" / sum(rate(http_requests_total{{service='{service_name}'}}{range_selector}))))"
            )
        if 'error' in sli_name:
            return (
                f"(sum(rate(http_requests_total{{service='{service_name}',code=~'5..'}}{range_selector}))"
                f" / sum(rate(http_requests_total{{service='{service_name}'}}{range_selector})))"
            )
        return f"sli_burn_rate_{sli_name}"

    def _determine_alert_severity(self, budget_consumed_percent: float) -> str:
        """Determine alert severity from the budget share a window consumes.

        Returns:
            ``'critical'`` up to 2%, ``'warning'`` up to 5%, else ``'info'``.
        """
        if budget_consumed_percent <= 2:
            return 'critical'
        if budget_consumed_percent <= 5:
            return 'warning'
        return 'info'

    def generate_sla_recommendations(self, service_def: Dict[str, Any],
                                     slos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate SLA recommendations for customer-facing services.

        Args:
            service_def: Service definition; ``user_facing`` gates the result.
            slos: SLO definitions as returned by ``generate_slos``.

        Returns:
            ``{'applicable': False, 'reason': ...}`` for non-user-facing
            services; otherwise commitments (one per ``'>='`` SLO whose SLI
            name contains "availability"), penalties and exclusions.
        """
        if not service_def.get('user_facing', False):
            return {
                'applicable': False,
                'reason': 'SLA not recommended for non-user-facing services'
            }

        criticality = service_def.get('criticality', 'medium')

        # SLA targets should be more conservative than SLO targets
        sla_buffer = 0.001  # 0.1% buffer below SLO

        sla_recommendations = {
            'applicable': True,
            'service': service_def.get('name'),
            'commitments': [],
            'penalties': self._generate_penalty_structure(criticality),
            'measurement_methodology': 'External synthetic monitoring from multiple geographic locations',
            'exclusions': [
                'Planned maintenance windows (with 72h advance notice)',
                'Customer-side network or infrastructure issues',
                'Force majeure events',
                'Third-party service dependencies beyond our control'
            ]
        }

        for slo in slos:
            if slo['operator'] == '>=' and 'availability' in slo['sli_name'].lower():
                # Never commit to less than 90%, however low the SLO is.
                sla_target = max(0.9, slo['target_value'] - sla_buffer)
                sla_recommendations['commitments'].append({
                    'metric': slo['sli_name'],
                    'target': sla_target,
                    'target_display': f"{sla_target * 100:.2f}%",
                    'measurement_window': 'monthly',
                    'measurement_method': 'Uptime monitoring with 1-minute granularity'
                })

        return sla_recommendations

    def _generate_penalty_structure(self, criticality: str) -> List[Dict[str, Any]]:
        """Generate the penalty tiers for a criticality level.

        Returns:
            A list of breach-threshold/credit dicts; empty for ``'low'`` and
            for unknown criticality values.
        """
        penalty_structures = {
            'critical': [
                {'breach_threshold': '< 99.99%', 'credit_percentage': 10},
                {'breach_threshold': '< 99.9%', 'credit_percentage': 25},
                {'breach_threshold': '< 99%', 'credit_percentage': 50}
            ],
            'high': [
                {'breach_threshold': '< 99.9%', 'credit_percentage': 10},
                {'breach_threshold': '< 99.5%', 'credit_percentage': 25}
            ],
            'medium': [
                {'breach_threshold': '< 99.5%', 'credit_percentage': 10}
            ],
            'low': []
        }

        return penalty_structures.get(criticality, [])

    def generate_framework(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Generate the complete SLO framework for a service.

        Args:
            service_def: Service definition dict.

        Returns:
            A dict with metadata, SLIs, SLOs, error budgets, SLA
            recommendations, monitoring recommendations and an
            implementation guide.
        """
        # Local import: the module header only imports datetime and timedelta.
        from datetime import timezone

        slis = self.generate_slis(service_def)
        slos = self.generate_slos(service_def, slis)
        error_budgets = self.calculate_error_budgets(slos)
        sla_recommendations = self.generate_sla_recommendations(service_def, slos)

        # datetime.utcnow() is deprecated; take an aware UTC time and drop the
        # tzinfo so the "<iso>Z" output format stays the same.
        generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'

        return {
            'metadata': {
                'service': service_def,
                'generated_at': generated_at,
                'framework_version': '1.0'
            },
            'slis': slis,
            'slos': slos,
            'error_budgets': error_budgets,
            'sla_recommendations': sla_recommendations,
            'monitoring_recommendations': self._generate_monitoring_recommendations(service_def),
            'implementation_guide': self._generate_implementation_guide(service_def, slis, slos)
        }

    def _generate_monitoring_recommendations(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Generate monitoring tool recommendations.

        Returns:
            Metrics, logging and tracing recommendations, plus synthetic
            monitoring when the service type is ``'web'``.
        """
        service_type = service_def.get('type', 'api')

        recommendations = {
            'metrics': {
                'collection': 'Prometheus with service discovery',
                'retention': '90 days for raw metrics, 1 year for aggregated',
                'alerting': 'Prometheus Alertmanager with multi-window burn rate alerts'
            },
            'logging': {
                'format': 'Structured JSON logs with correlation IDs',
                'aggregation': 'ELK stack or equivalent with proper indexing',
                'retention': '30 days for debug logs, 90 days for error logs'
            },
            'tracing': {
                'sampling': 'Adaptive sampling with 1% base rate',
                'storage': 'Jaeger or Zipkin with 7-day retention',
                'integration': 'OpenTelemetry instrumentation'
            }
        }

        if service_type == 'web':
            recommendations['synthetic_monitoring'] = {
                'frequency': 'Every 1 minute from 3+ geographic locations',
                'checks': 'Full user journey simulation',
                'tools': 'Pingdom, DataDog Synthetics, or equivalent'
            }

        return recommendations

    def _generate_implementation_guide(self, service_def: Dict[str, Any],
                                       slis: List[Dict[str, Any]],
                                       slos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate the implementation guide for the SLO framework.

        The guide is static; the parameters are accepted but not consulted.
        """
        return {
            'prerequisites': [
                'Service instrumented with metrics collection (Prometheus format)',
                'Structured logging with correlation IDs',
                'Monitoring infrastructure (Prometheus, Grafana, Alertmanager)',
                'Incident response processes and escalation policies'
            ],
            'implementation_steps': [
                {
                    'step': 1,
                    'title': 'Instrument Service',
                    'description': 'Add metrics collection for all defined SLIs',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 2,
                    'title': 'Configure Recording Rules',
                    'description': 'Set up Prometheus recording rules for SLI calculations',
                    'estimated_effort': '4-8 hours'
                },
                {
                    'step': 3,
                    'title': 'Implement Burn Rate Alerts',
                    'description': 'Configure multi-window burn rate alerting rules',
                    'estimated_effort': '1 day'
                },
                {
                    'step': 4,
                    'title': 'Create SLO Dashboard',
                    'description': 'Build Grafana dashboard for SLO tracking and error budget monitoring',
                    'estimated_effort': '4-6 hours'
                },
                {
                    'step': 5,
                    'title': 'Test and Validate',
                    'description': 'Test alerting and validate SLI measurements against expectations',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 6,
                    'title': 'Documentation and Training',
                    'description': 'Document runbooks and train team on SLO monitoring',
                    'estimated_effort': '1 day'
                }
            ],
            'validation_checklist': [
                'All SLIs produce expected metric values',
                'Burn rate alerts fire correctly during simulated outages',
                'Error budget calculations match manual verification',
                'Dashboard displays accurate SLO achievement rates',
                'Alert routing reaches correct escalation paths',
                'Runbooks are complete and tested'
            ]
        }

    def export_json(self, framework: Dict[str, Any], output_file: str):
        """Export the framework as indented JSON to ``output_file``."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(framework, f, indent=2)

    def print_summary(self, framework: Dict[str, Any]):
        """Print a human-readable summary of the SLO framework to stdout.

        Args:
            framework: Framework dict as returned by ``generate_framework``.
        """
        service = framework['metadata']['service']
        slis = framework['slis']
        slos = framework['slos']
        error_budgets = framework['error_budgets']

        # Loaded definitions may omit these keys; use the same defaults as the
        # generator methods instead of raising KeyError.
        service_name = str(service.get('name', 'service'))
        service_type = service.get('type', 'api')
        criticality = service.get('criticality', 'medium')

        print(f"\n{'='*60}")
        print(f"SLO FRAMEWORK SUMMARY FOR {service_name.upper()}")
        print(f"{'='*60}")

        print("\nService Details:")
        print(f" Type: {service_type}")
        print(f" Criticality: {criticality}")
        print(f" User Facing: {'Yes' if service.get('user_facing') else 'No'}")
        print(f" Team: {service.get('team', 'Unknown')}")

        print(f"\nService Level Indicators ({len(slis)}):")
        for i, sli in enumerate(slis, 1):
            print(f" {i}. {sli['name']}")
            print(f" Description: {sli['description']}")
            print(f" Type: {sli['type']}")
            print()

        print(f"Service Level Objectives ({len(slos)}):")
        for i, slo in enumerate(slos, 1):
            print(f" {i}. {slo['name']}")
            print(f" Target: {slo['target_display']}")
            print(f" Measurement Window: {slo['measurement_window']}")
            print()

        print("Error Budget Summary:")
        for budget in error_budgets:
            print(f" {budget['slo_name']}:")
            print(f" Monthly Budget: {budget['error_budget_percentage']}")
            print(f" Burn Rate Alerts: {len(budget['burn_rate_alerts'])}")
            print()

        sla = framework['sla_recommendations']
        if sla['applicable']:
            print("SLA Recommendations:")
            print(f" Commitments: {len(sla['commitments'])}")
            print(f" Penalty Tiers: {len(sla['penalties'])}")
        else:
            print(f"SLA Recommendations: {sla['reason']}")

        print("\nImplementation Timeline: 1-2 weeks")
        print(f"Framework generated at: {framework['metadata']['generated_at']}")
        print(f"{'='*60}\n")
|
||||
|
||||
|
||||
def main():
    """Run the command-line interface.

    Builds the service definition either from ``--input`` or from the
    ``--service-type`` / ``--criticality`` / ``--user-facing`` options,
    generates the framework, writes it as JSON unless ``--summary-only`` is
    given, and always prints the summary. Any error is reported on stderr
    and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Generate comprehensive SLO frameworks for services',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate from service definition file
python slo_designer.py --input service.json --output framework.json

# Generate from command line parameters
python slo_designer.py --service-type api --criticality high --user-facing true --output framework.json

# Generate and display summary only
python slo_designer.py --service-type web --criticality critical --user-facing true --summary-only
"""
    )

    parser.add_argument('--input', '-i',
                        help='Input service definition JSON file')
    parser.add_argument('--output', '-o',
                        help='Output framework JSON file')
    parser.add_argument('--service-type',
                        choices=['api', 'web', 'database', 'queue', 'batch', 'ml'],
                        help='Service type')
    parser.add_argument('--criticality',
                        choices=['critical', 'high', 'medium', 'low'],
                        help='Service criticality level')
    parser.add_argument('--user-facing',
                        choices=['true', 'false'],
                        help='Whether service is user-facing')
    parser.add_argument('--service-name',
                        help='Service name')
    parser.add_argument('--summary-only', action='store_true',
                        help='Only display summary, do not save JSON')

    args = parser.parse_args()

    has_parameters = args.service_type and args.criticality and args.user_facing
    if not args.input and not has_parameters:
        parser.error("Must provide either --input file or --service-type, --criticality, and --user-facing")

    designer = SLODesigner()

    # Top-level boundary: report any failure on stderr and exit non-zero.
    try:
        # Load or create service definition
        if args.input:
            service_def = designer.load_service_definition(args.input)
        else:
            user_facing = args.user_facing.lower() == 'true'
            service_def = designer.create_service_definition(
                args.service_type, args.criticality, user_facing, args.service_name
            )

        # Generate framework
        framework = designer.generate_framework(service_def)

        # Output results
        if not args.summary_only:
            # A loaded definition may have no 'name'; fall back to the same
            # default the designer uses instead of failing with KeyError.
            default_name = service_def.get('name', 'service')
            output_file = args.output or f"{default_name}_slo_framework.json"
            designer.export_json(framework, output_file)
            print(f"SLO framework saved to: {output_file}")

        # Always show summary
        designer.print_summary(framework)

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user