Files
CleanArchitecture-template/.brain/.agent/skills/engineering-advanced-skills/migration-architect/expected_outputs/rollback_runbook.json
2026-03-12 15:17:52 +07:00

577 lines
23 KiB
JSON

{
"runbook_id": "rb_921c0bca",
"migration_id": "23a52ed1507f",
"created_at": "2026-02-16T13:47:31.108500",
"rollback_phases": [
{
"phase_name": "rollback_cleanup",
"description": "Rollback changes made during cleanup phase",
"urgency_level": "medium",
"estimated_duration_minutes": 570,
"prerequisites": [
"Incident commander assigned and briefed",
"All team members notified of rollback initiation",
"Monitoring systems confirmed operational",
"Backup systems verified and accessible"
],
"steps": [
{
"step_id": "rb_validate_0_final",
"name": "Validate rollback completion",
"description": "Comprehensive validation that cleanup rollback completed successfully",
"script_type": "manual",
"script_content": "Execute validation checklist for this phase",
"estimated_duration_minutes": 10,
"dependencies": [],
"validation_commands": [
"SELECT COUNT(*) FROM {table_name};",
"SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';",
"SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';",
"SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};",
"SELECT MAX({timestamp_column}) FROM {table_name};"
],
"success_criteria": [
"cleanup fully rolled back",
"All validation checks pass"
],
"failure_escalation": "Investigate cleanup rollback failures",
"rollback_order": 99
}
],
"validation_checkpoints": [
"cleanup rollback steps completed",
"System health checks passing",
"No critical errors in logs",
"Key metrics within acceptable ranges",
"Validation command passed: SELECT COUNT(*) FROM {table_name};...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..."
],
"communication_requirements": [
"Notify incident commander of phase start/completion",
"Update rollback status dashboard",
"Log all actions and decisions"
],
"risk_level": "medium"
},
{
"phase_name": "rollback_contract",
"description": "Rollback changes made during contract phase",
"urgency_level": "medium",
"estimated_duration_minutes": 570,
"prerequisites": [
"Incident commander assigned and briefed",
"All team members notified of rollback initiation",
"Monitoring systems confirmed operational",
"Backup systems verified and accessible",
"Previous rollback phase completed successfully"
],
"steps": [
{
"step_id": "rb_validate_1_final",
"name": "Validate rollback completion",
"description": "Comprehensive validation that contract rollback completed successfully",
"script_type": "manual",
"script_content": "Execute validation checklist for this phase",
"estimated_duration_minutes": 10,
"dependencies": [],
"validation_commands": [
"SELECT COUNT(*) FROM {table_name};",
"SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';",
"SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';",
"SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};",
"SELECT MAX({timestamp_column}) FROM {table_name};"
],
"success_criteria": [
"contract fully rolled back",
"All validation checks pass"
],
"failure_escalation": "Investigate contract rollback failures",
"rollback_order": 99
}
],
"validation_checkpoints": [
"contract rollback steps completed",
"System health checks passing",
"No critical errors in logs",
"Key metrics within acceptable ranges",
"Validation command passed: SELECT COUNT(*) FROM {table_name};...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..."
],
"communication_requirements": [
"Notify incident commander of phase start/completion",
"Update rollback status dashboard",
"Log all actions and decisions"
],
"risk_level": "medium"
},
{
"phase_name": "rollback_migrate",
"description": "Rollback changes made during migrate phase",
"urgency_level": "medium",
"estimated_duration_minutes": 570,
"prerequisites": [
"Incident commander assigned and briefed",
"All team members notified of rollback initiation",
"Monitoring systems confirmed operational",
"Backup systems verified and accessible",
"Previous rollback phase completed successfully"
],
"steps": [
{
"step_id": "rb_validate_2_final",
"name": "Validate rollback completion",
"description": "Comprehensive validation that migrate rollback completed successfully",
"script_type": "manual",
"script_content": "Execute validation checklist for this phase",
"estimated_duration_minutes": 10,
"dependencies": [],
"validation_commands": [
"SELECT COUNT(*) FROM {table_name};",
"SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';",
"SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';",
"SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};",
"SELECT MAX({timestamp_column}) FROM {table_name};"
],
"success_criteria": [
"migrate fully rolled back",
"All validation checks pass"
],
"failure_escalation": "Investigate migrate rollback failures",
"rollback_order": 99
}
],
"validation_checkpoints": [
"migrate rollback steps completed",
"System health checks passing",
"No critical errors in logs",
"Key metrics within acceptable ranges",
"Validation command passed: SELECT COUNT(*) FROM {table_name};...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..."
],
"communication_requirements": [
"Notify incident commander of phase start/completion",
"Update rollback status dashboard",
"Log all actions and decisions"
],
"risk_level": "medium"
},
{
"phase_name": "rollback_expand",
"description": "Rollback changes made during expand phase",
"urgency_level": "medium",
"estimated_duration_minutes": 570,
"prerequisites": [
"Incident commander assigned and briefed",
"All team members notified of rollback initiation",
"Monitoring systems confirmed operational",
"Backup systems verified and accessible",
"Previous rollback phase completed successfully"
],
"steps": [
{
"step_id": "rb_validate_3_final",
"name": "Validate rollback completion",
"description": "Comprehensive validation that expand rollback completed successfully",
"script_type": "manual",
"script_content": "Execute validation checklist for this phase",
"estimated_duration_minutes": 10,
"dependencies": [],
"validation_commands": [
"SELECT COUNT(*) FROM {table_name};",
"SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';",
"SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';",
"SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};",
"SELECT MAX({timestamp_column}) FROM {table_name};"
],
"success_criteria": [
"expand fully rolled back",
"All validation checks pass"
],
"failure_escalation": "Investigate expand rollback failures",
"rollback_order": 99
}
],
"validation_checkpoints": [
"expand rollback steps completed",
"System health checks passing",
"No critical errors in logs",
"Key metrics within acceptable ranges",
"Validation command passed: SELECT COUNT(*) FROM {table_name};...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..."
],
"communication_requirements": [
"Notify incident commander of phase start/completion",
"Update rollback status dashboard",
"Log all actions and decisions"
],
"risk_level": "medium"
},
{
"phase_name": "rollback_preparation",
"description": "Rollback changes made during preparation phase",
"urgency_level": "medium",
"estimated_duration_minutes": 570,
"prerequisites": [
"Incident commander assigned and briefed",
"All team members notified of rollback initiation",
"Monitoring systems confirmed operational",
"Backup systems verified and accessible",
"Previous rollback phase completed successfully"
],
"steps": [
{
"step_id": "rb_schema_4_01",
"name": "Drop migration artifacts",
"description": "Remove temporary migration tables and procedures",
"script_type": "sql",
"script_content": "-- Drop migration artifacts\nDROP TABLE IF EXISTS migration_log;\nDROP PROCEDURE IF EXISTS migrate_data();",
"estimated_duration_minutes": 5,
"dependencies": [],
"validation_commands": [
"SELECT COUNT(*) FROM information_schema.tables WHERE table_name LIKE '%migration%';"
],
"success_criteria": [
"No migration artifacts remain"
],
"failure_escalation": "Manual cleanup required",
"rollback_order": 1
},
{
"step_id": "rb_validate_4_final",
"name": "Validate rollback completion",
"description": "Comprehensive validation that preparation rollback completed successfully",
"script_type": "manual",
"script_content": "Execute validation checklist for this phase",
"estimated_duration_minutes": 10,
"dependencies": [
"rb_schema_4_01"
],
"validation_commands": [
"SELECT COUNT(*) FROM {table_name};",
"SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';",
"SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';",
"SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};",
"SELECT MAX({timestamp_column}) FROM {table_name};"
],
"success_criteria": [
"preparation fully rolled back",
"All validation checks pass"
],
"failure_escalation": "Investigate preparation rollback failures",
"rollback_order": 99
}
],
"validation_checkpoints": [
"preparation rollback steps completed",
"System health checks passing",
"No critical errors in logs",
"Key metrics within acceptable ranges",
"Validation command passed: SELECT COUNT(*) FROM {table_name};...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...",
"Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..."
],
"communication_requirements": [
"Notify incident commander of phase start/completion",
"Update rollback status dashboard",
"Log all actions and decisions"
],
"risk_level": "medium"
}
],
"trigger_conditions": [
{
"trigger_id": "error_rate_spike",
"name": "Error Rate Spike",
"condition": "error_rate > baseline * 5 for 5 minutes",
"metric_threshold": {
"metric": "error_rate",
"operator": "greater_than",
"value": "baseline_error_rate * 5",
"duration_minutes": 5
},
"evaluation_window_minutes": 5,
"auto_execute": true,
"escalation_contacts": [
"on_call_engineer",
"migration_lead"
]
},
{
"trigger_id": "response_time_degradation",
"name": "Response Time Degradation",
"condition": "p95_response_time > baseline * 3 for 10 minutes",
"metric_threshold": {
"metric": "p95_response_time",
"operator": "greater_than",
"value": "baseline_p95 * 3",
"duration_minutes": 10
},
"evaluation_window_minutes": 10,
"auto_execute": false,
"escalation_contacts": [
"performance_team",
"migration_lead"
]
},
{
"trigger_id": "availability_drop",
"name": "Service Availability Drop",
"condition": "availability < 95% for 2 minutes",
"metric_threshold": {
"metric": "availability",
"operator": "less_than",
"value": 0.95,
"duration_minutes": 2
},
"evaluation_window_minutes": 2,
"auto_execute": true,
"escalation_contacts": [
"sre_team",
"incident_commander"
]
},
{
"trigger_id": "data_integrity_failure",
"name": "Data Integrity Check Failure",
"condition": "data_validation_failures > 0",
"metric_threshold": {
"metric": "data_validation_failures",
"operator": "greater_than",
"value": 0,
"duration_minutes": 1
},
"evaluation_window_minutes": 1,
"auto_execute": true,
"escalation_contacts": [
"dba_team",
"data_team"
]
},
{
"trigger_id": "migration_progress_stalled",
"name": "Migration Progress Stalled",
"condition": "migration_progress unchanged for 30 minutes",
"metric_threshold": {
"metric": "migration_progress_rate",
"operator": "equals",
"value": 0,
"duration_minutes": 30
},
"evaluation_window_minutes": 30,
"auto_execute": false,
"escalation_contacts": [
"migration_team",
"dba_team"
]
}
],
"data_recovery_plan": {
"recovery_method": "point_in_time",
"backup_location": "/backups/pre_migration_{migration_id}_{timestamp}.sql",
"recovery_scripts": [
"pg_restore -d production -c /backups/pre_migration_{migration_id}_{timestamp}.sql",
"SELECT pg_create_restore_point('rollback_point');",
"VACUUM ANALYZE; -- Refresh statistics after restore"
],
"data_validation_queries": [
"SELECT COUNT(*) FROM critical_business_table;",
"SELECT MAX(created_at) FROM audit_log;",
"SELECT COUNT(DISTINCT user_id) FROM user_sessions;",
"SELECT SUM(amount) FROM financial_transactions WHERE date = CURRENT_DATE;"
],
"estimated_recovery_time_minutes": 45,
"recovery_dependencies": [
"database_instance_running",
"backup_file_accessible"
]
},
"communication_templates": [
{
"template_type": "rollback_start",
"audience": "technical",
"subject": "ROLLBACK INITIATED: {migration_name}",
"body": "Team,\n\nWe have initiated rollback for migration: {migration_name}\nRollback ID: {rollback_id}\nStart Time: {start_time}\nEstimated Duration: {estimated_duration}\n\nReason: {rollback_reason}\n\nCurrent Status: Rolling back phase {current_phase}\n\nNext Updates: Every 15 minutes or upon phase completion\n\nActions Required:\n- Monitor system health dashboards\n- Stand by for escalation if needed\n- Do not make manual changes during rollback\n\nIncident Commander: {incident_commander}\n",
"urgency": "medium",
"delivery_methods": [
"email",
"slack"
]
},
{
"template_type": "rollback_start",
"audience": "business",
"subject": "System Rollback In Progress - {system_name}",
"body": "Business Stakeholders,\n\nWe are currently performing a planned rollback of the {system_name} migration due to {rollback_reason}.\n\nImpact: {business_impact}\nExpected Resolution: {estimated_completion_time}\nAffected Services: {affected_services}\n\nWe will provide updates every 30 minutes.\n\nContact: {business_contact}\n",
"urgency": "medium",
"delivery_methods": [
"email"
]
},
{
"template_type": "rollback_start",
"audience": "executive",
"subject": "EXEC ALERT: Critical System Rollback - {system_name}",
"body": "Executive Team,\n\nA critical rollback is in progress for {system_name}.\n\nSummary:\n- Rollback Reason: {rollback_reason}\n- Business Impact: {business_impact}\n- Expected Resolution: {estimated_completion_time}\n- Customer Impact: {customer_impact}\n\nWe are following established procedures and will update hourly.\n\nEscalation: {escalation_contact}\n",
"urgency": "high",
"delivery_methods": [
"email"
]
},
{
"template_type": "rollback_complete",
"audience": "technical",
"subject": "ROLLBACK COMPLETED: {migration_name}",
"body": "Team,\n\nRollback has been successfully completed for migration: {migration_name}\n\nSummary:\n- Start Time: {start_time}\n- End Time: {end_time}\n- Duration: {actual_duration}\n- Phases Completed: {completed_phases}\n\nValidation Results:\n{validation_results}\n\nSystem Status: {system_status}\n\nNext Steps:\n- Continue monitoring for 24 hours\n- Post-rollback review scheduled for {review_date}\n- Root cause analysis to begin\n\nAll clear to resume normal operations.\n\nIncident Commander: {incident_commander}\n",
"urgency": "medium",
"delivery_methods": [
"email",
"slack"
]
},
{
"template_type": "emergency_escalation",
"audience": "executive",
"subject": "CRITICAL: Rollback Emergency - {migration_name}",
"body": "CRITICAL SITUATION - IMMEDIATE ATTENTION REQUIRED\n\nMigration: {migration_name}\nIssue: Rollback procedure has encountered critical failures\n\nCurrent Status: {current_status}\nFailed Components: {failed_components}\nBusiness Impact: {business_impact}\nCustomer Impact: {customer_impact}\n\nImmediate Actions:\n1. Emergency response team activated\n2. {emergency_action_1}\n3. {emergency_action_2}\n\nWar Room: {war_room_location}\nBridge Line: {conference_bridge}\n\nNext Update: {next_update_time}\n\nIncident Commander: {incident_commander}\nExecutive On-Call: {executive_on_call}\n",
"urgency": "emergency",
"delivery_methods": [
"email",
"sms",
"phone_call"
]
}
],
"escalation_matrix": {
"level_1": {
"trigger": "Single component failure",
"response_time_minutes": 5,
"contacts": [
"on_call_engineer",
"migration_lead"
],
"actions": [
"Investigate issue",
"Attempt automated remediation",
"Monitor closely"
]
},
"level_2": {
"trigger": "Multiple component failures or single critical failure",
"response_time_minutes": 2,
"contacts": [
"senior_engineer",
"team_lead",
"devops_lead"
],
"actions": [
"Initiate rollback",
"Establish war room",
"Notify stakeholders"
]
},
"level_3": {
"trigger": "System-wide failure or data corruption",
"response_time_minutes": 1,
"contacts": [
"engineering_manager",
"cto",
"incident_commander"
],
"actions": [
"Emergency rollback",
"All hands on deck",
"Executive notification"
]
},
"emergency": {
"trigger": "Business-critical failure with customer impact",
"response_time_minutes": 0,
"contacts": [
"ceo",
"cto",
"head_of_operations"
],
"actions": [
"Emergency procedures",
"Customer communication",
"Media preparation if needed"
]
}
},
"validation_checklist": [
"Verify system is responding to health checks",
"Confirm error rates are within normal parameters",
"Validate response times meet SLA requirements",
"Check all critical business processes are functioning",
"Verify monitoring and alerting systems are operational",
"Confirm no data corruption has occurred",
"Validate security controls are functioning properly",
"Check backup systems are working correctly",
"Verify integration points with downstream systems",
"Confirm user authentication and authorization are working",
"Validate database schema matches expected state",
"Confirm referential integrity constraints are intact and enforced",
"Check database performance metrics",
"Verify data consistency across related tables",
"Validate indexes and statistics are optimal",
"Confirm transaction logs are clean",
"Check database connections and connection pooling"
],
"post_rollback_procedures": [
"Monitor system stability for 24-48 hours post-rollback",
"Conduct thorough post-rollback testing of all critical paths",
"Review and analyze rollback metrics and timing",
"Document lessons learned and rollback procedure improvements",
"Schedule post-mortem meeting with all stakeholders",
"Update rollback procedures based on actual experience",
"Communicate rollback completion to all stakeholders",
"Archive rollback logs and artifacts for future reference",
"Review and update monitoring thresholds if needed",
"Plan for next migration attempt with improved procedures",
"Conduct security review to ensure no vulnerabilities introduced",
"Update disaster recovery procedures if affected by rollback",
"Review capacity planning based on rollback resource usage",
"Update documentation with rollback experience and timings"
],
"emergency_contacts": [
{
"role": "Incident Commander",
"name": "TBD - Assigned during migration",
"primary_phone": "+1-XXX-XXX-XXXX",
"email": "incident.commander@company.com",
"backup_contact": "backup.commander@company.com"
},
{
"role": "Technical Lead",
"name": "TBD - Migration technical owner",
"primary_phone": "+1-XXX-XXX-XXXX",
"email": "tech.lead@company.com",
"backup_contact": "senior.engineer@company.com"
},
{
"role": "Business Owner",
"name": "TBD - Business stakeholder",
"primary_phone": "+1-XXX-XXX-XXXX",
"email": "business.owner@company.com",
"backup_contact": "product.manager@company.com"
},
{
"role": "On-Call Engineer",
"name": "Current on-call rotation",
"primary_phone": "+1-XXX-XXX-XXXX",
"email": "oncall@company.com",
"backup_contact": "backup.oncall@company.com"
},
{
"role": "Executive Escalation",
"name": "CTO/VP Engineering",
"primary_phone": "+1-XXX-XXX-XXXX",
"email": "cto@company.com",
"backup_contact": "vp.engineering@company.com"
}
]
}