1109 lines
49 KiB
Python
1109 lines
49 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Rollback Generator - Generate comprehensive rollback procedures for migrations
|
|
|
|
This tool takes a migration plan and generates detailed rollback procedures for each phase,
|
|
including data rollback scripts, service rollback steps, validation checks, and communication
|
|
templates to ensure safe and reliable migration reversals.
|
|
|
|
Author: Migration Architect Skill
|
|
Version: 1.0.0
|
|
License: MIT
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
import sys
|
|
import datetime
|
|
import hashlib
|
|
from typing import Dict, List, Any, Optional, Tuple
|
|
from dataclasses import dataclass, asdict
|
|
from enum import Enum
|
|
|
|
|
|
class RollbackTrigger(Enum):
    """Enumeration of the ways a rollback can be triggered.

    Each member's value is the lowercase string form of its name.
    """

    # Member order is part of the enum's iteration order; keep it stable.
    MANUAL = "manual"
    AUTOMATED = "automated"
    THRESHOLD_BASED = "threshold_based"
    TIME_BASED = "time_based"
|
|
|
|
|
|
class RollbackUrgency(Enum):
    """Enumeration of rollback urgency levels.

    Each member's value is the lowercase string form of its name.
    """

    # Declared from least to most urgent.
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    EMERGENCY = "emergency"
|
|
|
|
|
|
@dataclass
class RollbackStep:
    """A single executable (or manual) action within a rollback phase.

    All fields are required and positional, in the order declared below.
    """

    # Identity and human-readable description of the step.
    step_id: str
    name: str
    description: str

    # What to run: the kind of script and its text.
    script_type: str  # sql, bash, api, manual
    script_content: str

    # Planning data.
    estimated_duration_minutes: int
    dependencies: List[str]  # step_id values of the steps this one depends on

    # How to confirm the step worked, and what to do when it did not.
    validation_commands: List[str]
    success_criteria: List[str]
    failure_escalation: str

    # Ordering key of the step within its phase.
    rollback_order: int
|
|
|
|
|
|
@dataclass
class RollbackPhase:
    """A named group of rollback steps plus the metadata needed to run it.

    All fields are required and positional, in the order declared below.
    """

    # Identity and description of the phase.
    phase_name: str
    description: str

    # Planning data.
    urgency_level: str
    estimated_duration_minutes: int
    prerequisites: List[str]

    # The steps that make up the phase.
    steps: List[RollbackStep]

    # Verification, communication and risk metadata.
    validation_checkpoints: List[str]
    communication_requirements: List[str]
    risk_level: str
|
|
|
|
|
|
@dataclass
class RollbackTriggerCondition:
    """Description of a condition that triggers a rollback.

    All fields are required and positional, in the order declared below.
    """

    # Identity of the trigger.
    trigger_id: str
    name: str

    # Human-readable condition and its optional structured threshold.
    condition: str
    metric_threshold: Optional[Dict[str, Any]]
    evaluation_window_minutes: int

    # Whether the rollback runs automatically, and who gets contacted.
    auto_execute: bool
    escalation_contacts: List[str]
|
|
|
|
|
|
@dataclass
class DataRecoveryPlan:
    """Plan describing how data is recovered and restored during a rollback.

    All fields are required and positional, in the order declared below.
    """

    # How and from where the data is recovered.
    recovery_method: str  # backup_restore, point_in_time, event_replay
    backup_location: str

    # Commands that perform the recovery and queries that verify it.
    recovery_scripts: List[str]
    data_validation_queries: List[str]

    # Planning data.
    estimated_recovery_time_minutes: int
    recovery_dependencies: List[str]
|
|
|
|
|
|
@dataclass
class CommunicationTemplate:
    """A message template used to communicate about a rollback.

    All fields are required and positional, in the order declared below.
    """

    # Classification of the message.
    template_type: str  # start, progress, completion, escalation
    audience: str  # technical, business, executive, customers

    # Message content.
    subject: str
    body: str

    # Delivery metadata.
    urgency: str
    delivery_methods: List[str]
|
|
|
|
|
|
@dataclass
class RollbackRunbook:
    """Top-level container holding every part of a generated rollback runbook.

    All fields are required and positional, in the order declared below.
    """

    # Identity and creation metadata.
    runbook_id: str
    migration_id: str
    created_at: str

    # Structured plan components.
    rollback_phases: List[RollbackPhase]
    trigger_conditions: List[RollbackTriggerCondition]
    data_recovery_plan: DataRecoveryPlan
    communication_templates: List[CommunicationTemplate]

    # Supporting reference material.
    escalation_matrix: Dict[str, Any]
    validation_checklist: List[str]
    post_rollback_procedures: List[str]
    emergency_contacts: List[Dict[str, str]]
|
|
|
|
|
|
class RollbackGenerator:
|
|
"""Main rollback generator class"""
|
|
|
|
def __init__(self):
|
|
self.rollback_templates = self._load_rollback_templates()
|
|
self.validation_templates = self._load_validation_templates()
|
|
self.communication_templates = self._load_communication_templates()
|
|
|
|
def _load_rollback_templates(self) -> Dict[str, Any]:
|
|
"""Load rollback script templates for different migration types"""
|
|
return {
|
|
"database": {
|
|
"schema_rollback": {
|
|
"drop_table": "DROP TABLE IF EXISTS {table_name};",
|
|
"drop_column": "ALTER TABLE {table_name} DROP COLUMN IF EXISTS {column_name};",
|
|
"restore_column": "ALTER TABLE {table_name} ADD COLUMN {column_definition};",
|
|
"revert_type": "ALTER TABLE {table_name} ALTER COLUMN {column_name} TYPE {original_type};",
|
|
"drop_constraint": "ALTER TABLE {table_name} DROP CONSTRAINT {constraint_name};",
|
|
"add_constraint": "ALTER TABLE {table_name} ADD CONSTRAINT {constraint_name} {constraint_definition};"
|
|
},
|
|
"data_rollback": {
|
|
"restore_backup": "pg_restore -d {database_name} -c {backup_file}",
|
|
"point_in_time_recovery": "SELECT pg_create_restore_point('pre_migration_{timestamp}');",
|
|
"delete_migrated_data": "DELETE FROM {table_name} WHERE migration_batch_id = '{batch_id}';",
|
|
"restore_original_values": "UPDATE {table_name} SET {column_name} = backup_{column_name} WHERE migration_flag = true;"
|
|
}
|
|
},
|
|
"service": {
|
|
"deployment_rollback": {
|
|
"rollback_blue_green": "kubectl patch service {service_name} -p '{\"spec\":{\"selector\":{\"version\":\"blue\"}}}'",
|
|
"rollback_canary": "kubectl scale deployment {service_name}-canary --replicas=0",
|
|
"restore_previous_version": "kubectl rollout undo deployment/{service_name} --to-revision={revision_number}",
|
|
"update_load_balancer": "aws elbv2 modify-rule --rule-arn {rule_arn} --actions Type=forward,TargetGroupArn={original_target_group}"
|
|
},
|
|
"configuration_rollback": {
|
|
"restore_config_map": "kubectl apply -f {original_config_file}",
|
|
"revert_feature_flags": "curl -X PUT {feature_flag_api}/flags/{flag_name} -d '{\"enabled\": false}'",
|
|
"restore_environment_vars": "kubectl set env deployment/{deployment_name} {env_var_name}={original_value}"
|
|
}
|
|
},
|
|
"infrastructure": {
|
|
"cloud_rollback": {
|
|
"revert_terraform": "terraform apply -target={resource_name} {rollback_plan_file}",
|
|
"restore_dns": "aws route53 change-resource-record-sets --hosted-zone-id {zone_id} --change-batch file://{rollback_dns_changes}",
|
|
"rollback_security_groups": "aws ec2 authorize-security-group-ingress --group-id {group_id} --protocol {protocol} --port {port} --cidr {cidr}",
|
|
"restore_iam_policies": "aws iam put-role-policy --role-name {role_name} --policy-name {policy_name} --policy-document file://{original_policy}"
|
|
},
|
|
"network_rollback": {
|
|
"restore_routing": "aws ec2 replace-route --route-table-id {route_table_id} --destination-cidr-block {cidr} --gateway-id {original_gateway}",
|
|
"revert_load_balancer": "aws elbv2 modify-load-balancer --load-balancer-arn {lb_arn} --scheme {original_scheme}",
|
|
"restore_firewall_rules": "aws ec2 revoke-security-group-ingress --group-id {group_id} --protocol {protocol} --port {port} --source-group {source_group}"
|
|
}
|
|
}
|
|
}
|
|
|
|
def _load_validation_templates(self) -> Dict[str, List[str]]:
|
|
"""Load validation command templates"""
|
|
return {
|
|
"database": [
|
|
"SELECT COUNT(*) FROM {table_name};",
|
|
"SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';",
|
|
"SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';",
|
|
"SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};",
|
|
"SELECT MAX({timestamp_column}) FROM {table_name};"
|
|
],
|
|
"service": [
|
|
"curl -f {health_check_url}",
|
|
"kubectl get pods -l app={service_name} --field-selector=status.phase=Running",
|
|
"kubectl logs deployment/{service_name} --tail=100 | grep -i error",
|
|
"curl -f {service_endpoint}/api/v1/status"
|
|
],
|
|
"infrastructure": [
|
|
"aws ec2 describe-instances --instance-ids {instance_id} --query 'Reservations[*].Instances[*].State.Name'",
|
|
"nslookup {domain_name}",
|
|
"curl -I {load_balancer_url}",
|
|
"aws elbv2 describe-target-health --target-group-arn {target_group_arn}"
|
|
]
|
|
}
|
|
|
|
def _load_communication_templates(self) -> Dict[str, Dict[str, str]]:
|
|
"""Load communication templates"""
|
|
return {
|
|
"rollback_start": {
|
|
"technical": {
|
|
"subject": "ROLLBACK INITIATED: {migration_name}",
|
|
"body": """Team,
|
|
|
|
We have initiated rollback for migration: {migration_name}
|
|
Rollback ID: {rollback_id}
|
|
Start Time: {start_time}
|
|
Estimated Duration: {estimated_duration}
|
|
|
|
Reason: {rollback_reason}
|
|
|
|
Current Status: Rolling back phase {current_phase}
|
|
|
|
Next Updates: Every 15 minutes or upon phase completion
|
|
|
|
Actions Required:
|
|
- Monitor system health dashboards
|
|
- Stand by for escalation if needed
|
|
- Do not make manual changes during rollback
|
|
|
|
Incident Commander: {incident_commander}
|
|
"""
|
|
},
|
|
"business": {
|
|
"subject": "System Rollback In Progress - {system_name}",
|
|
"body": """Business Stakeholders,
|
|
|
|
We are currently performing a planned rollback of the {system_name} migration due to {rollback_reason}.
|
|
|
|
Impact: {business_impact}
|
|
Expected Resolution: {estimated_completion_time}
|
|
Affected Services: {affected_services}
|
|
|
|
We will provide updates every 30 minutes.
|
|
|
|
Contact: {business_contact}
|
|
"""
|
|
},
|
|
"executive": {
|
|
"subject": "EXEC ALERT: Critical System Rollback - {system_name}",
|
|
"body": """Executive Team,
|
|
|
|
A critical rollback is in progress for {system_name}.
|
|
|
|
Summary:
|
|
- Rollback Reason: {rollback_reason}
|
|
- Business Impact: {business_impact}
|
|
- Expected Resolution: {estimated_completion_time}
|
|
- Customer Impact: {customer_impact}
|
|
|
|
We are following established procedures and will update hourly.
|
|
|
|
Escalation: {escalation_contact}
|
|
"""
|
|
}
|
|
},
|
|
"rollback_complete": {
|
|
"technical": {
|
|
"subject": "ROLLBACK COMPLETED: {migration_name}",
|
|
"body": """Team,
|
|
|
|
Rollback has been successfully completed for migration: {migration_name}
|
|
|
|
Summary:
|
|
- Start Time: {start_time}
|
|
- End Time: {end_time}
|
|
- Duration: {actual_duration}
|
|
- Phases Completed: {completed_phases}
|
|
|
|
Validation Results:
|
|
{validation_results}
|
|
|
|
System Status: {system_status}
|
|
|
|
Next Steps:
|
|
- Continue monitoring for 24 hours
|
|
- Post-rollback review scheduled for {review_date}
|
|
- Root cause analysis to begin
|
|
|
|
All clear to resume normal operations.
|
|
|
|
Incident Commander: {incident_commander}
|
|
"""
|
|
}
|
|
}
|
|
}
|
|
|
|
def generate_rollback_runbook(self, migration_plan: Dict[str, Any]) -> RollbackRunbook:
    """Generate a complete rollback runbook from a migration plan.

    Args:
        migration_plan: Mapping describing the migration. This method reads
            ``migration_id`` itself (defaulting to ``"unknown"``); the rest
            of the plan is passed to the ``_generate_*`` helpers.

    Returns:
        A ``RollbackRunbook`` assembled from the helpers' results.
    """
    # The id is an 8-hex-digit MD5 prefix of the plan's ``str()`` form. It is
    # a content fingerprint, not a security hash, and it changes if the same
    # plan is supplied with a different key order.
    plan_digest = hashlib.md5(str(migration_plan).encode()).hexdigest()
    runbook_id = f"rb_{plan_digest[:8]}"
    migration_id = migration_plan.get("migration_id", "unknown")

    # Phases are produced in reverse order of the migration's phases.
    rollback_phases = self._generate_rollback_phases(migration_plan)
    trigger_conditions = self._generate_trigger_conditions(migration_plan)
    data_recovery_plan = self._generate_data_recovery_plan(migration_plan)
    communication_templates = self._generate_communication_templates(migration_plan)
    escalation_matrix = self._generate_escalation_matrix(migration_plan)
    validation_checklist = self._generate_validation_checklist(migration_plan)
    post_rollback_procedures = self._generate_post_rollback_procedures(migration_plan)
    emergency_contacts = self._generate_emergency_contacts(migration_plan)

    return RollbackRunbook(
        runbook_id=runbook_id,
        migration_id=migration_id,
        # Naive local time in ISO-8601 form, taken after generation finishes.
        created_at=datetime.datetime.now().isoformat(),
        rollback_phases=rollback_phases,
        trigger_conditions=trigger_conditions,
        data_recovery_plan=data_recovery_plan,
        communication_templates=communication_templates,
        escalation_matrix=escalation_matrix,
        validation_checklist=validation_checklist,
        post_rollback_procedures=post_rollback_procedures,
        emergency_contacts=emergency_contacts,
    )
|
|
|
|
def _generate_rollback_phases(self, migration_plan: Dict[str, Any]) -> List[RollbackPhase]:
    """Generate one rollback phase per migration phase, in reverse order.

    Args:
        migration_plan: Mapping that may contain ``phases`` (a list whose
            items are either dicts or plain values) and ``migration_type``.
            Dict phases may carry ``name``, ``duration_hours`` and
            ``risk_level``; any other item is converted with ``str()`` and
            given a 2-hour duration and ``"medium"`` risk.

    Returns:
        The rollback phases, last migration phase first. An empty list when
        the plan has no phases.
    """
    migration_phases = migration_plan.get("phases", [])
    migration_type = migration_plan.get("migration_type", "unknown")
    rollback_phases = []

    # ``i`` counts positions in the *reversed* sequence, so index 0 is the
    # last migration phase (the first one to be rolled back).
    for i, phase in enumerate(reversed(migration_phases)):
        if isinstance(phase, dict):
            phase_name = phase.get("name", f"phase_{i}")
            duration_hours = phase.get("duration_hours")
            if duration_hours is None:
                # Missing or explicitly null: use the 2-hour default instead
                # of failing on ``None * 60``.
                duration_hours = 2
            # Fractional hours are accepted; the result is truncated to whole
            # minutes so the dataclass field really holds an int.
            phase_duration = int(float(duration_hours) * 60)
            phase_risk = phase.get("risk_level", "medium")
        else:
            phase_name = str(phase)
            phase_duration = 120  # Default 2 hours
            phase_risk = "medium"

        rollback_steps = self._generate_rollback_steps(phase_name, migration_type, i)

        rollback_phases.append(
            RollbackPhase(
                phase_name=f"rollback_{phase_name}",
                description=f"Rollback changes made during {phase_name} phase",
                urgency_level=self._calculate_urgency(phase_risk),
                # Rollback is estimated at half the forward phase duration.
                estimated_duration_minutes=phase_duration // 2,
                prerequisites=self._get_rollback_prerequisites(phase_name, i),
                steps=rollback_steps,
                validation_checkpoints=self._get_validation_checkpoints(phase_name, migration_type),
                communication_requirements=self._get_communication_requirements(phase_name, phase_risk),
                risk_level=phase_risk,
            )
        )

    return rollback_phases
|
|
|
|
def _generate_rollback_steps(self, phase_name: str, migration_type: str, phase_index: int) -> List[RollbackStep]:
    """Generate the rollback steps for one phase.

    Steps are chosen from ``migration_type`` and from keywords found in
    ``phase_name`` (case-insensitive):

    * ``database``: "migration" or "cutover" adds stop-processes and
      restore-from-backup steps; "preparation" adds an artifact cleanup step.
    * ``service``: "cutover" adds traffic-redirect and deployment-rollback
      steps.
    * ``infrastructure``: always adds terraform-revert and DNS-restore steps.

    A final manual validation step that depends on every earlier step is
    always appended, including for unrecognised migration types.

    Args:
        phase_name: Name of the migration phase being rolled back.
        migration_type: Migration type used to pick templates and steps.
        phase_index: Index embedded in the generated ``step_id`` values.

    Returns:
        The steps for the phase, ending with the validation step.
    """
    steps: List[RollbackStep] = []
    templates = self.rollback_templates.get(migration_type, {})
    lowered_name = phase_name.lower()

    if migration_type == "database":
        if "migration" in lowered_name or "cutover" in lowered_name:
            # These statements contain the word "migration" themselves, so the
            # session running them matches its own LIKE filter. Excluding
            # pg_backend_pid() keeps the cancel from targeting itself and lets
            # the verification count actually reach zero.
            active_migration_filter = (
                "FROM pg_stat_activity "
                "WHERE query LIKE '%migration%' AND pid <> pg_backend_pid();"
            )
            data_templates = templates.get("data_rollback", {})
            steps.extend([
                RollbackStep(
                    step_id=f"rb_data_{phase_index}_01",
                    name="Stop data migration processes",
                    description="Halt all ongoing data migration processes",
                    script_type="sql",
                    script_content=(
                        "-- Stop migration processes\n"
                        f"SELECT pg_cancel_backend(pid) {active_migration_filter}"
                    ),
                    estimated_duration_minutes=5,
                    dependencies=[],
                    validation_commands=[f"SELECT COUNT(*) {active_migration_filter}"],
                    success_criteria=["No active migration processes"],
                    failure_escalation="Contact DBA immediately",
                    rollback_order=1,
                ),
                RollbackStep(
                    step_id=f"rb_data_{phase_index}_02",
                    name="Restore from backup",
                    description="Restore database from pre-migration backup",
                    script_type="bash",
                    script_content=data_templates.get(
                        "restore_backup",
                        "pg_restore -d {database_name} -c {backup_file}",
                    ),
                    estimated_duration_minutes=30,
                    dependencies=[f"rb_data_{phase_index}_01"],
                    validation_commands=["SELECT COUNT(*) FROM information_schema.tables;"],
                    success_criteria=["Database restored successfully", "All expected tables present"],
                    failure_escalation="Escalate to senior DBA and infrastructure team",
                    rollback_order=2,
                ),
            ])

        if "preparation" in lowered_name:
            # Schema rollback: remove the artifacts the migration created.
            steps.append(
                RollbackStep(
                    step_id=f"rb_schema_{phase_index}_01",
                    name="Drop migration artifacts",
                    description="Remove temporary migration tables and procedures",
                    script_type="sql",
                    script_content="-- Drop migration artifacts\nDROP TABLE IF EXISTS migration_log;\nDROP PROCEDURE IF EXISTS migrate_data();",
                    estimated_duration_minutes=5,
                    dependencies=[],
                    validation_commands=["SELECT COUNT(*) FROM information_schema.tables WHERE table_name LIKE '%migration%';"],
                    success_criteria=["No migration artifacts remain"],
                    failure_escalation="Manual cleanup required",
                    rollback_order=1,
                )
            )

    elif migration_type == "service":
        if "cutover" in lowered_name:
            deployment_templates = templates.get("deployment_rollback", {})
            steps.extend([
                RollbackStep(
                    step_id=f"rb_service_{phase_index}_01",
                    name="Redirect traffic back to old service",
                    description="Update load balancer to route traffic back to previous service version",
                    script_type="bash",
                    script_content=deployment_templates.get(
                        "update_load_balancer",
                        "aws elbv2 modify-rule --rule-arn {rule_arn} --actions Type=forward,TargetGroupArn={original_target_group}",
                    ),
                    estimated_duration_minutes=2,
                    dependencies=[],
                    validation_commands=["curl -f {health_check_url}"],
                    success_criteria=["Traffic routing to original service", "Health checks passing"],
                    failure_escalation="Emergency procedure - manual traffic routing",
                    rollback_order=1,
                ),
                RollbackStep(
                    step_id=f"rb_service_{phase_index}_02",
                    name="Rollback service deployment",
                    description="Revert to previous service deployment version",
                    script_type="bash",
                    script_content=deployment_templates.get(
                        "restore_previous_version",
                        "kubectl rollout undo deployment/{service_name} --to-revision={revision_number}",
                    ),
                    estimated_duration_minutes=10,
                    dependencies=[f"rb_service_{phase_index}_01"],
                    validation_commands=["kubectl get pods -l app={service_name} --field-selector=status.phase=Running"],
                    success_criteria=["Previous version deployed", "All pods running"],
                    failure_escalation="Manual pod management required",
                    rollback_order=2,
                ),
            ])

    elif migration_type == "infrastructure":
        cloud_templates = templates.get("cloud_rollback", {})
        steps.extend([
            RollbackStep(
                step_id=f"rb_infra_{phase_index}_01",
                name="Revert infrastructure changes",
                description="Apply terraform plan to revert infrastructure to previous state",
                script_type="bash",
                script_content=cloud_templates.get(
                    "revert_terraform",
                    "terraform apply -target={resource_name} {rollback_plan_file}",
                ),
                estimated_duration_minutes=15,
                dependencies=[],
                validation_commands=["terraform plan -detailed-exitcode"],
                success_criteria=["Infrastructure matches previous state", "No planned changes"],
                failure_escalation="Manual infrastructure review required",
                rollback_order=1,
            ),
            RollbackStep(
                step_id=f"rb_infra_{phase_index}_02",
                name="Restore DNS configuration",
                description="Revert DNS changes to point back to original infrastructure",
                script_type="bash",
                script_content=cloud_templates.get(
                    "restore_dns",
                    "aws route53 change-resource-record-sets --hosted-zone-id {zone_id} --change-batch file://{rollback_dns_changes}",
                ),
                estimated_duration_minutes=10,
                dependencies=[f"rb_infra_{phase_index}_01"],
                validation_commands=["nslookup {domain_name}"],
                success_criteria=["DNS resolves to original endpoints"],
                failure_escalation="Contact DNS administrator",
                rollback_order=2,
            ),
        ])

    # Generic validation step for every migration type. It depends on all
    # steps generated above, so the ids are collected before appending it.
    prior_step_ids = [step.step_id for step in steps]
    steps.append(
        RollbackStep(
            step_id=f"rb_validate_{phase_index}_final",
            name="Validate rollback completion",
            description=f"Comprehensive validation that {phase_name} rollback completed successfully",
            script_type="manual",
            script_content="Execute validation checklist for this phase",
            estimated_duration_minutes=10,
            dependencies=prior_step_ids,
            # Copy the list so editing one step's commands cannot alter the
            # shared template list or other steps built from it.
            validation_commands=list(self.validation_templates.get(migration_type, [])),
            success_criteria=[f"{phase_name} fully rolled back", "All validation checks pass"],
            failure_escalation=f"Investigate {phase_name} rollback failures",
            rollback_order=99,
        )
    )

    return steps
|
|
|
|
def _generate_trigger_conditions(self, migration_plan: Dict[str, Any]) -> List[RollbackTriggerCondition]:
    """Build the rollback trigger conditions for a migration plan.

    Three generic triggers are always returned first. Two further triggers
    follow when the plan's ``migration_type`` is ``"database"`` or
    ``"service"``; other types get only the generic ones.
    """

    def make_trigger(trigger_id, name, condition, metric, operator, value, minutes, auto, contacts):
        # Every trigger uses the same number of minutes for the threshold
        # duration and for the evaluation window.
        return RollbackTriggerCondition(
            trigger_id=trigger_id,
            name=name,
            condition=condition,
            metric_threshold={
                "metric": metric,
                "operator": operator,
                "value": value,
                "duration_minutes": minutes,
            },
            evaluation_window_minutes=minutes,
            auto_execute=auto,
            escalation_contacts=contacts,
        )

    kind = migration_plan.get("migration_type", "unknown")

    # Triggers that apply regardless of migration type.
    conditions = [
        make_trigger(
            "error_rate_spike", "Error Rate Spike",
            "error_rate > baseline * 5 for 5 minutes",
            "error_rate", "greater_than", "baseline_error_rate * 5", 5,
            True, ["on_call_engineer", "migration_lead"],
        ),
        make_trigger(
            "response_time_degradation", "Response Time Degradation",
            "p95_response_time > baseline * 3 for 10 minutes",
            "p95_response_time", "greater_than", "baseline_p95 * 3", 10,
            False, ["performance_team", "migration_lead"],
        ),
        make_trigger(
            "availability_drop", "Service Availability Drop",
            "availability < 95% for 2 minutes",
            "availability", "less_than", 0.95, 2,
            True, ["sre_team", "incident_commander"],
        ),
    ]

    if kind == "database":
        conditions += [
            make_trigger(
                "data_integrity_failure", "Data Integrity Check Failure",
                "data_validation_failures > 0",
                "data_validation_failures", "greater_than", 0, 1,
                True, ["dba_team", "data_team"],
            ),
            make_trigger(
                "migration_progress_stalled", "Migration Progress Stalled",
                "migration_progress unchanged for 30 minutes",
                "migration_progress_rate", "equals", 0, 30,
                False, ["migration_team", "dba_team"],
            ),
        ]
    elif kind == "service":
        conditions += [
            make_trigger(
                "cpu_utilization_spike", "CPU Utilization Spike",
                "cpu_utilization > 90% for 15 minutes",
                "cpu_utilization", "greater_than", 0.90, 15,
                False, ["devops_team", "infrastructure_team"],
            ),
            make_trigger(
                "memory_leak_detected", "Memory Leak Detected",
                "memory_usage increasing continuously for 20 minutes",
                "memory_growth_rate", "greater_than", "1MB/minute", 20,
                True, ["development_team", "sre_team"],
            ),
        ]

    return conditions
|
|
|
|
def _generate_data_recovery_plan(self, migration_plan: Dict[str, Any]) -> DataRecoveryPlan:
    """Return the data recovery plan matching the plan's migration type.

    A ``"database"`` migration gets a point-in-time plan built around
    PostgreSQL commands; every other type gets a backup-restore plan built
    around configuration files and a service restart. All values are fixed
    strings; placeholders in them are not substituted.
    """
    kind = migration_plan.get("migration_type", "unknown")

    if kind != "database":
        # Fallback plan for every non-database migration.
        generic_scripts = [
            "# Restore configuration files from backup",
            "cp -r /backups/pre_migration_state/config/* /app/config/",
            "# Restart services with previous configuration",
            "systemctl restart application_service",
        ]
        generic_checks = [
            "curl -f http://localhost:8080/health",
            "curl -f http://localhost:8080/api/status",
        ]
        return DataRecoveryPlan(
            recovery_method="backup_restore",
            backup_location="/backups/pre_migration_state",
            recovery_scripts=generic_scripts,
            data_validation_queries=generic_checks,
            estimated_recovery_time_minutes=20,
            recovery_dependencies=["service_stopped", "backup_accessible"],
        )

    database_scripts = [
        "pg_restore -d production -c /backups/pre_migration_backup.sql",
        "SELECT pg_create_restore_point('rollback_point');",
        "VACUUM ANALYZE; -- Refresh statistics after restore",
    ]
    database_checks = [
        "SELECT COUNT(*) FROM critical_business_table;",
        "SELECT MAX(created_at) FROM audit_log;",
        "SELECT COUNT(DISTINCT user_id) FROM user_sessions;",
        "SELECT SUM(amount) FROM financial_transactions WHERE date = CURRENT_DATE;",
    ]
    return DataRecoveryPlan(
        recovery_method="point_in_time",
        # Left as a literal template; the braces are not filled in here.
        backup_location="/backups/pre_migration_{migration_id}_{timestamp}.sql",
        recovery_scripts=database_scripts,
        data_validation_queries=database_checks,
        estimated_recovery_time_minutes=45,
        recovery_dependencies=["database_instance_running", "backup_file_accessible"],
    )
|
|
|
|
def _generate_communication_templates(self, migration_plan: Dict[str, Any]) -> List[CommunicationTemplate]:
    """Wrap the loaded message templates in ``CommunicationTemplate`` objects.

    Produces, in order: one ``rollback_start`` message per available audience
    (technical, business, executive), one ``rollback_complete`` message per
    available audience (technical, business), and a fixed
    ``emergency_escalation`` message for executives. ``migration_plan`` is
    accepted but not read.
    """
    catalogue = self.communication_templates
    # Indexed directly: a catalogue without "rollback_start" raises KeyError.
    start_messages = catalogue["rollback_start"]
    # Completion messages are optional.
    complete_messages = catalogue.get("rollback_complete", {})

    def channels_for(audience):
        # Technical staff are also reached via Slack; everyone else by email.
        if audience == "technical":
            return ["email", "slack"]
        return ["email"]

    messages = [
        CommunicationTemplate(
            template_type="rollback_start",
            audience=audience,
            subject=start_messages[audience]["subject"],
            body=start_messages[audience]["body"],
            urgency="high" if audience == "executive" else "medium",
            delivery_methods=channels_for(audience),
        )
        for audience in ("technical", "business", "executive")
        if audience in start_messages
    ]

    messages += [
        CommunicationTemplate(
            template_type="rollback_complete",
            audience=audience,
            subject=complete_messages[audience]["subject"],
            body=complete_messages[audience]["body"],
            urgency="medium",
            delivery_methods=channels_for(audience),
        )
        for audience in ("technical", "business")
        if audience in complete_messages
    ]

    # Body text is flush-left so no indentation leaks into the message.
    emergency_body = """CRITICAL SITUATION - IMMEDIATE ATTENTION REQUIRED

Migration: {migration_name}
Issue: Rollback procedure has encountered critical failures

Current Status: {current_status}
Failed Components: {failed_components}
Business Impact: {business_impact}
Customer Impact: {customer_impact}

Immediate Actions:
1. Emergency response team activated
2. {emergency_action_1}
3. {emergency_action_2}

War Room: {war_room_location}
Bridge Line: {conference_bridge}

Next Update: {next_update_time}

Incident Commander: {incident_commander}
Executive On-Call: {executive_on_call}
"""
    messages.append(
        CommunicationTemplate(
            template_type="emergency_escalation",
            audience="executive",
            subject="CRITICAL: Rollback Emergency - {migration_name}",
            body=emergency_body,
            urgency="emergency",
            delivery_methods=["email", "sms", "phone_call"],
        )
    )

    return messages
|
|
|
|
def _generate_escalation_matrix(self, migration_plan: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Generate escalation matrix for different failure scenarios"""
|
|
return {
|
|
"level_1": {
|
|
"trigger": "Single component failure",
|
|
"response_time_minutes": 5,
|
|
"contacts": ["on_call_engineer", "migration_lead"],
|
|
"actions": ["Investigate issue", "Attempt automated remediation", "Monitor closely"]
|
|
},
|
|
"level_2": {
|
|
"trigger": "Multiple component failures or single critical failure",
|
|
"response_time_minutes": 2,
|
|
"contacts": ["senior_engineer", "team_lead", "devops_lead"],
|
|
"actions": ["Initiate rollback", "Establish war room", "Notify stakeholders"]
|
|
},
|
|
"level_3": {
|
|
"trigger": "System-wide failure or data corruption",
|
|
"response_time_minutes": 1,
|
|
"contacts": ["engineering_manager", "cto", "incident_commander"],
|
|
"actions": ["Emergency rollback", "All hands on deck", "Executive notification"]
|
|
},
|
|
"emergency": {
|
|
"trigger": "Business-critical failure with customer impact",
|
|
"response_time_minutes": 0,
|
|
"contacts": ["ceo", "cto", "head_of_operations"],
|
|
"actions": ["Emergency procedures", "Customer communication", "Media preparation if needed"]
|
|
}
|
|
}
|
|
|
|
def _generate_validation_checklist(self, migration_plan: Dict[str, Any]) -> List[str]:
|
|
"""Generate comprehensive validation checklist"""
|
|
migration_type = migration_plan.get("migration_type", "unknown")
|
|
|
|
base_checklist = [
|
|
"Verify system is responding to health checks",
|
|
"Confirm error rates are within normal parameters",
|
|
"Validate response times meet SLA requirements",
|
|
"Check all critical business processes are functioning",
|
|
"Verify monitoring and alerting systems are operational",
|
|
"Confirm no data corruption has occurred",
|
|
"Validate security controls are functioning properly",
|
|
"Check backup systems are working correctly",
|
|
"Verify integration points with downstream systems",
|
|
"Confirm user authentication and authorization working"
|
|
]
|
|
|
|
if migration_type == "database":
|
|
base_checklist.extend([
|
|
"Validate database schema matches expected state",
|
|
"Confirm referential integrity constraints",
|
|
"Check database performance metrics",
|
|
"Verify data consistency across related tables",
|
|
"Validate indexes and statistics are optimal",
|
|
"Confirm transaction logs are clean",
|
|
"Check database connections and connection pooling"
|
|
])
|
|
|
|
elif migration_type == "service":
|
|
base_checklist.extend([
|
|
"Verify service discovery is working correctly",
|
|
"Confirm load balancing is distributing traffic properly",
|
|
"Check service-to-service communication",
|
|
"Validate API endpoints are responding correctly",
|
|
"Confirm feature flags are in correct state",
|
|
"Check resource utilization (CPU, memory, disk)",
|
|
"Verify container orchestration is healthy"
|
|
])
|
|
|
|
elif migration_type == "infrastructure":
|
|
base_checklist.extend([
|
|
"Verify network connectivity between components",
|
|
"Confirm DNS resolution is working correctly",
|
|
"Check firewall rules and security groups",
|
|
"Validate load balancer configuration",
|
|
"Confirm SSL/TLS certificates are valid",
|
|
"Check storage systems are accessible",
|
|
"Verify backup and disaster recovery systems"
|
|
])
|
|
|
|
return base_checklist
|
|
|
|
def _generate_post_rollback_procedures(self, migration_plan: Dict[str, Any]) -> List[str]:
|
|
"""Generate post-rollback procedures"""
|
|
return [
|
|
"Monitor system stability for 24-48 hours post-rollback",
|
|
"Conduct thorough post-rollback testing of all critical paths",
|
|
"Review and analyze rollback metrics and timing",
|
|
"Document lessons learned and rollback procedure improvements",
|
|
"Schedule post-mortem meeting with all stakeholders",
|
|
"Update rollback procedures based on actual experience",
|
|
"Communicate rollback completion to all stakeholders",
|
|
"Archive rollback logs and artifacts for future reference",
|
|
"Review and update monitoring thresholds if needed",
|
|
"Plan for next migration attempt with improved procedures",
|
|
"Conduct security review to ensure no vulnerabilities introduced",
|
|
"Update disaster recovery procedures if affected by rollback",
|
|
"Review capacity planning based on rollback resource usage",
|
|
"Update documentation with rollback experience and timings"
|
|
]
|
|
|
|
def _generate_emergency_contacts(self, migration_plan: Dict[str, Any]) -> List[Dict[str, str]]:
|
|
"""Generate emergency contact list"""
|
|
return [
|
|
{
|
|
"role": "Incident Commander",
|
|
"name": "TBD - Assigned during migration",
|
|
"primary_phone": "+1-XXX-XXX-XXXX",
|
|
"email": "incident.commander@company.com",
|
|
"backup_contact": "backup.commander@company.com"
|
|
},
|
|
{
|
|
"role": "Technical Lead",
|
|
"name": "TBD - Migration technical owner",
|
|
"primary_phone": "+1-XXX-XXX-XXXX",
|
|
"email": "tech.lead@company.com",
|
|
"backup_contact": "senior.engineer@company.com"
|
|
},
|
|
{
|
|
"role": "Business Owner",
|
|
"name": "TBD - Business stakeholder",
|
|
"primary_phone": "+1-XXX-XXX-XXXX",
|
|
"email": "business.owner@company.com",
|
|
"backup_contact": "product.manager@company.com"
|
|
},
|
|
{
|
|
"role": "On-Call Engineer",
|
|
"name": "Current on-call rotation",
|
|
"primary_phone": "+1-XXX-XXX-XXXX",
|
|
"email": "oncall@company.com",
|
|
"backup_contact": "backup.oncall@company.com"
|
|
},
|
|
{
|
|
"role": "Executive Escalation",
|
|
"name": "CTO/VP Engineering",
|
|
"primary_phone": "+1-XXX-XXX-XXXX",
|
|
"email": "cto@company.com",
|
|
"backup_contact": "vp.engineering@company.com"
|
|
}
|
|
]
|
|
|
|
def _calculate_urgency(self, risk_level: str) -> str:
|
|
"""Calculate rollback urgency based on risk level"""
|
|
risk_to_urgency = {
|
|
"low": "low",
|
|
"medium": "medium",
|
|
"high": "high",
|
|
"critical": "emergency"
|
|
}
|
|
return risk_to_urgency.get(risk_level, "medium")
|
|
|
|
def _get_rollback_prerequisites(self, phase_name: str, phase_index: int) -> List[str]:
|
|
"""Get prerequisites for rollback phase"""
|
|
prerequisites = [
|
|
"Incident commander assigned and briefed",
|
|
"All team members notified of rollback initiation",
|
|
"Monitoring systems confirmed operational",
|
|
"Backup systems verified and accessible"
|
|
]
|
|
|
|
if phase_index > 0:
|
|
prerequisites.append("Previous rollback phase completed successfully")
|
|
|
|
if "cutover" in phase_name.lower():
|
|
prerequisites.extend([
|
|
"Traffic redirection capabilities confirmed",
|
|
"Load balancer configuration backed up",
|
|
"DNS changes prepared for quick execution"
|
|
])
|
|
|
|
if "data" in phase_name.lower() or "migration" in phase_name.lower():
|
|
prerequisites.extend([
|
|
"Database backup verified and accessible",
|
|
"Data validation queries prepared",
|
|
"Database administrator on standby"
|
|
])
|
|
|
|
return prerequisites
|
|
|
|
def _get_validation_checkpoints(self, phase_name: str, migration_type: str) -> List[str]:
|
|
"""Get validation checkpoints for rollback phase"""
|
|
checkpoints = [
|
|
f"{phase_name} rollback steps completed",
|
|
"System health checks passing",
|
|
"No critical errors in logs",
|
|
"Key metrics within acceptable ranges"
|
|
]
|
|
|
|
validation_commands = self.validation_templates.get(migration_type, [])
|
|
checkpoints.extend([f"Validation command passed: {cmd[:50]}..." for cmd in validation_commands[:3]])
|
|
|
|
return checkpoints
|
|
|
|
def _get_communication_requirements(self, phase_name: str, risk_level: str) -> List[str]:
|
|
"""Get communication requirements for rollback phase"""
|
|
base_requirements = [
|
|
"Notify incident commander of phase start/completion",
|
|
"Update rollback status dashboard",
|
|
"Log all actions and decisions"
|
|
]
|
|
|
|
if risk_level in ["high", "critical"]:
|
|
base_requirements.extend([
|
|
"Notify all stakeholders of phase progress",
|
|
"Update executive team if rollback extends beyond expected time",
|
|
"Prepare customer communication if needed"
|
|
])
|
|
|
|
if "cutover" in phase_name.lower():
|
|
base_requirements.append("Immediate notification when traffic is redirected")
|
|
|
|
return base_requirements
|
|
|
|
def generate_human_readable_runbook(self, runbook: RollbackRunbook) -> str:
    """Render a rollback runbook as a plain-text document.

    Sections are emitted in this order: header, emergency contacts, escalation
    matrix, automatic rollback triggers, rollback phases (steps ordered by
    ``rollback_order``), data recovery plan, post-rollback validation
    checklist and post-rollback procedures.

    Args:
        runbook: The runbook to render.

    Returns:
        The rendered runbook, with lines joined by newline characters.
    """
    banner = "=" * 80
    divider = "-" * 40

    lines = [
        banner,
        f"ROLLBACK RUNBOOK: {runbook.runbook_id}",
        banner,
        f"Migration ID: {runbook.migration_id}",
        f"Created: {runbook.created_at}",
        "",
    ]

    # Emergency Contacts
    lines += ["EMERGENCY CONTACTS", divider]
    for person in runbook.emergency_contacts:
        lines += [
            f"{person['role']}: {person['name']}",
            f" Phone: {person['primary_phone']}",
            f" Email: {person['email']}",
            f" Backup: {person['backup_contact']}",
            "",
        ]

    # Escalation Matrix
    lines += ["ESCALATION MATRIX", divider]
    for level, details in runbook.escalation_matrix.items():
        lines += [
            f"{level.upper()}:",
            f" Trigger: {details['trigger']}",
            f" Response Time: {details['response_time_minutes']} minutes",
            f" Contacts: {', '.join(details['contacts'])}",
            f" Actions: {', '.join(details['actions'])}",
            "",
        ]

    # Rollback Trigger Conditions
    lines += ["AUTOMATIC ROLLBACK TRIGGERS", divider]
    for trigger in runbook.trigger_conditions:
        lines += [
            f"• {trigger.name}",
            f" Condition: {trigger.condition}",
            f" Auto-Execute: {'Yes' if trigger.auto_execute else 'No'}",
            f" Evaluation Window: {trigger.evaluation_window_minutes} minutes",
            f" Contacts: {', '.join(trigger.escalation_contacts)}",
            "",
        ]

    # Rollback Phases
    lines += ["ROLLBACK PHASES", divider]
    for number, phase in enumerate(runbook.rollback_phases, 1):
        lines += [
            f"{number}. {phase.phase_name.upper()}",
            f" Description: {phase.description}",
            f" Urgency: {phase.urgency_level.upper()}",
            f" Duration: {phase.estimated_duration_minutes} minutes",
            f" Risk Level: {phase.risk_level.upper()}",
        ]

        if phase.prerequisites:
            lines.append(" Prerequisites:")
            lines.extend(f" ✓ {item}" for item in phase.prerequisites)

        lines.append(" Steps:")
        for step in sorted(phase.steps, key=lambda entry: entry.rollback_order):
            lines += [
                f" {step.rollback_order}. {step.name}",
                f" Duration: {step.estimated_duration_minutes} min",
                f" Type: {step.script_type}",
            ]
            if step.script_content and step.script_type != "manual":
                script_lines = step.script_content.split('\n')
                lines.append(" Script:")
                # Only a three-line preview of the script is shown.
                lines.extend(f" {text}" for text in script_lines[:3])
                if len(script_lines) > 3:
                    lines.append(" ...")
            lines.append(f" Success Criteria: {', '.join(step.success_criteria)}")
            lines.append("")

        if phase.validation_checkpoints:
            lines.append(" Validation Checkpoints:")
            lines.extend(f" ☐ {item}" for item in phase.validation_checkpoints)
        lines.append("")

    # Data Recovery Plan
    recovery = runbook.data_recovery_plan
    lines += [
        "DATA RECOVERY PLAN",
        divider,
        f"Recovery Method: {recovery.recovery_method}",
        f"Backup Location: {recovery.backup_location}",
        f"Estimated Recovery Time: {recovery.estimated_recovery_time_minutes} minutes",
        "Recovery Scripts:",
    ]
    lines.extend(f" • {script}" for script in recovery.recovery_scripts)
    lines.append("Validation Queries:")
    lines.extend(f" • {query}" for query in recovery.data_validation_queries)
    lines.append("")

    # Validation Checklist
    lines += ["POST-ROLLBACK VALIDATION CHECKLIST", divider]
    lines.extend(
        f"{position:2d}. ☐ {item}"
        for position, item in enumerate(runbook.validation_checklist, 1)
    )
    lines.append("")

    # Post-Rollback Procedures
    lines += ["POST-ROLLBACK PROCEDURES", divider]
    lines.extend(
        f"{position:2d}. {procedure}"
        for position, procedure in enumerate(runbook.post_rollback_procedures, 1)
    )
    lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def main():
    """Command-line entry point: turn a migration plan into a rollback runbook.

    Reads the JSON migration plan given by ``--input``, generates a runbook
    with ``RollbackGenerator`` and emits it according to ``--format``:

    * ``json``: written to ``--output``, or printed when no output is given.
    * ``text``: written next to ``--output`` (see below), or printed.
    * ``both`` (default): both of the above.

    The text file path is derived from ``--output``. A trailing ``.json`` is
    swapped for ``.txt``. Otherwise, in ``both`` mode ``.txt`` is appended so
    the text runbook can never overwrite the JSON file, and in ``text`` mode
    the path is used unchanged.

    Returns:
        0 on success, 1 on any error (the message is printed to stderr).
    """
    parser = argparse.ArgumentParser(description="Generate comprehensive rollback runbooks from migration plans")
    parser.add_argument("--input", "-i", required=True, help="Input migration plan file (JSON)")
    parser.add_argument("--output", "-o", help="Output file for rollback runbook (JSON)")
    parser.add_argument("--format", "-f", choices=["json", "text", "both"], default="both", help="Output format")

    args = parser.parse_args()

    # Load migration plan. Handled separately from the output stage so that a
    # missing *output* directory is not misreported as a missing input file.
    try:
        # JSON is exchanged as UTF-8; do not depend on the platform default.
        with open(args.input, 'r', encoding='utf-8') as f:
            migration_plan = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file '{args.input}' not found", file=sys.stderr)
        return 1
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in input file: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # Validate required fields
    if not isinstance(migration_plan, dict):
        print("Error: Migration plan must be a JSON object", file=sys.stderr)
        return 1
    if "migration_id" not in migration_plan and "source" not in migration_plan:
        print("Error: Migration plan must contain migration_id or source field", file=sys.stderr)
        return 1

    try:
        # Generate rollback runbook
        generator = RollbackGenerator()
        runbook = generator.generate_rollback_runbook(migration_plan)

        # Output results
        if args.format in ["json", "both"]:
            runbook_dict = asdict(runbook)
            if args.output:
                with open(args.output, 'w', encoding='utf-8') as f:
                    json.dump(runbook_dict, f, indent=2)
                print(f"Rollback runbook saved to {args.output}")
            else:
                print(json.dumps(runbook_dict, indent=2))

        if args.format in ["text", "both"]:
            human_runbook = generator.generate_human_readable_runbook(runbook)

            text_output = None
            if args.output:
                json_suffix = '.json'
                if args.output.lower().endswith(json_suffix):
                    # Swap only the trailing extension, not every ".json" in the path.
                    text_output = args.output[:-len(json_suffix)] + '.txt'
                elif args.format == "both":
                    # The JSON runbook was just written to args.output; use a
                    # distinct name so the text runbook does not clobber it.
                    text_output = args.output + '.txt'
                else:
                    text_output = args.output

            if text_output:
                # The runbook contains non-ASCII bullets and check marks, so
                # the encoding must be explicit to work on every platform.
                with open(text_output, 'w', encoding='utf-8') as f:
                    f.write(human_runbook)
                print(f"Human-readable runbook saved to {text_output}")
            else:
                print("\n" + "="*80)
                print("HUMAN-READABLE ROLLBACK RUNBOOK")
                print("="*80)
                print(human_runbook)

    except Exception as e:
        # Top-level boundary of the CLI: report and signal failure via exit code.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0
|
|
|
|
|
|
# Script entry: run the CLI and use main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())