add brain
This commit is contained in:
@@ -0,0 +1,276 @@
|
||||
{
|
||||
"alerts": [
|
||||
{
|
||||
"alert": "HighLatency",
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m])) > 0.5",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High request latency detected",
|
||||
"description": "95th percentile latency is {{ $value }}s for payment-service",
|
||||
"runbook_url": "https://runbooks.company.com/high-latency"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 2.5,
|
||||
"false_positive_rate": 0.15,
|
||||
"average_duration_minutes": 12
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "ServiceDown",
|
||||
"expr": "up{service=\"payment-service\"} == 0",
|
||||
"labels": {
|
||||
"severity": "critical",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Payment service is down",
|
||||
"description": "Payment service has been down for more than 1 minute",
|
||||
"runbook_url": "https://runbooks.company.com/service-down"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 0.1,
|
||||
"false_positive_rate": 0.05,
|
||||
"average_duration_minutes": 3
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "HighErrorRate",
|
||||
"expr": "sum(rate(http_requests_total{service=\"payment-service\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"payment-service\"}[5m])) > 0.01",
|
||||
"for": "2m",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High error rate detected",
|
||||
"description": "Error rate is {{ $value | humanizePercentage }} for payment-service",
|
||||
"runbook_url": "https://runbooks.company.com/high-error-rate"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 1.8,
|
||||
"false_positive_rate": 0.25,
|
||||
"average_duration_minutes": 8
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "HighCPUUsage",
|
||||
"expr": "rate(process_cpu_seconds_total{service=\"payment-service\"}[5m]) * 100 > 80",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High CPU usage",
|
||||
"description": "CPU usage is {{ $value }}% for payment-service"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 15.2,
|
||||
"false_positive_rate": 0.8,
|
||||
"average_duration_minutes": 45
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "HighMemoryUsage",
|
||||
"expr": "process_resident_memory_bytes{service=\"payment-service\"} / process_virtual_memory_max_bytes{service=\"payment-service\"} * 100 > 85",
|
||||
"labels": {
|
||||
"severity": "info",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High memory usage",
|
||||
"description": "Memory usage is {{ $value }}% for payment-service"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 8.5,
|
||||
"false_positive_rate": 0.6,
|
||||
"average_duration_minutes": 30
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "DatabaseConnectionPoolExhaustion",
|
||||
"expr": "db_connections_active{service=\"payment-service\"} / db_connections_max{service=\"payment-service\"} > 0.9",
|
||||
"for": "1m",
|
||||
"labels": {
|
||||
"severity": "critical",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Database connection pool near exhaustion",
|
||||
"description": "Connection pool utilization is {{ $value | humanizePercentage }}",
|
||||
"runbook_url": "https://runbooks.company.com/db-connections"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 0.3,
|
||||
"false_positive_rate": 0.1,
|
||||
"average_duration_minutes": 5
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "LowTraffic",
|
||||
"expr": "sum(rate(http_requests_total{service=\"payment-service\"}[5m])) < 10",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Unusually low traffic",
|
||||
"description": "Request rate is {{ $value }} RPS, which is unusually low"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 12.0,
|
||||
"false_positive_rate": 0.9,
|
||||
"average_duration_minutes": 120
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "HighLatencyDuplicate",
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m])) > 0.5",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "High request latency detected (duplicate)",
|
||||
"description": "95th percentile latency is {{ $value }}s for payment-service"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 2.5,
|
||||
"false_positive_rate": 0.15,
|
||||
"average_duration_minutes": 12
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "VeryLowErrorRate",
|
||||
"expr": "sum(rate(http_requests_total{service=\"payment-service\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"payment-service\"}[5m])) > 0.001",
|
||||
"labels": {
|
||||
"severity": "info",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Error rate above 0.1%",
|
||||
"description": "Error rate is {{ $value | humanizePercentage }}"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 25.0,
|
||||
"false_positive_rate": 0.95,
|
||||
"average_duration_minutes": 5
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "DiskUsageHigh",
|
||||
"expr": "disk_usage_percent{service=\"payment-service\"} > 85",
|
||||
"labels": {
|
||||
"severity": "warning",
|
||||
"service": "payment-service",
|
||||
"team": "payments"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Disk usage high",
|
||||
"description": "Disk usage is {{ $value }}%"
|
||||
},
|
||||
"historical_data": {
|
||||
"fires_per_day": 3.2,
|
||||
"false_positive_rate": 0.4,
|
||||
"average_duration_minutes": 240
|
||||
}
|
||||
}
|
||||
],
|
||||
"services": [
|
||||
{
|
||||
"name": "payment-service",
|
||||
"type": "api",
|
||||
"criticality": "critical",
|
||||
"team": "payments"
|
||||
},
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high",
|
||||
"team": "identity"
|
||||
},
|
||||
{
|
||||
"name": "notification-service",
|
||||
"type": "api",
|
||||
"criticality": "medium",
|
||||
"team": "communications"
|
||||
}
|
||||
],
|
||||
"alert_routing": {
|
||||
"routes": [
|
||||
{
|
||||
"match": {
|
||||
"severity": "critical"
|
||||
},
|
||||
"receiver": "pager-critical",
|
||||
"group_wait": "10s",
|
||||
"group_interval": "1m",
|
||||
"repeat_interval": "5m"
|
||||
},
|
||||
{
|
||||
"match": {
|
||||
"severity": "warning"
|
||||
},
|
||||
"receiver": "slack-warnings",
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "1h"
|
||||
},
|
||||
{
|
||||
"match": {
|
||||
"severity": "info"
|
||||
},
|
||||
"receiver": "email-info",
|
||||
"group_wait": "2m",
|
||||
"group_interval": "10m",
|
||||
"repeat_interval": "24h"
|
||||
}
|
||||
]
|
||||
},
|
||||
"receivers": [
|
||||
{
|
||||
"name": "pager-critical",
|
||||
"pagerduty_configs": [
|
||||
{
|
||||
"routing_key": "pager-key-critical",
|
||||
"description": "Critical alert: {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "slack-warnings",
|
||||
"slack_configs": [
|
||||
{
|
||||
"api_url": "https://hooks.slack.com/services/warnings",
|
||||
"channel": "#alerts-warnings",
|
||||
"title": "Warning Alert",
|
||||
"text": "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "email-info",
|
||||
"email_configs": [
|
||||
{
|
||||
"to": "team-notifications@company.com",
|
||||
"subject": "Info Alert: {{ .GroupLabels.alertname }}",
|
||||
"body": "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
{
|
||||
"name": "payment-service",
|
||||
"type": "api",
|
||||
"criticality": "critical",
|
||||
"user_facing": true,
|
||||
"description": "Handles payment processing and transaction management",
|
||||
"team": "payments",
|
||||
"environment": "production",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "payment-gateway",
|
||||
"type": "external",
|
||||
"criticality": "critical"
|
||||
},
|
||||
{
|
||||
"name": "fraud-detection",
|
||||
"type": "ml",
|
||||
"criticality": "high"
|
||||
}
|
||||
],
|
||||
"endpoints": [
|
||||
{
|
||||
"path": "/api/v1/payments",
|
||||
"method": "POST",
|
||||
"sla_latency_ms": 500,
|
||||
"expected_tps": 100
|
||||
},
|
||||
{
|
||||
"path": "/api/v1/payments/{id}",
|
||||
"method": "GET",
|
||||
"sla_latency_ms": 200,
|
||||
"expected_tps": 500
|
||||
},
|
||||
{
|
||||
"path": "/api/v1/payments/{id}/refund",
|
||||
"method": "POST",
|
||||
"sla_latency_ms": 1000,
|
||||
"expected_tps": 10
|
||||
}
|
||||
],
|
||||
"business_metrics": {
|
||||
"revenue_per_hour": {
|
||||
"metric": "sum(payment_amount * rate(payments_successful_total[1h]))",
|
||||
"target": 50000,
|
||||
"unit": "USD"
|
||||
},
|
||||
"conversion_rate": {
|
||||
"metric": "sum(rate(payments_successful_total[5m])) / sum(rate(payment_attempts_total[5m]))",
|
||||
"target": 0.95,
|
||||
"unit": "percentage"
|
||||
}
|
||||
},
|
||||
"infrastructure": {
|
||||
"container_orchestrator": "kubernetes",
|
||||
"replicas": 6,
|
||||
"cpu_limit": "2000m",
|
||||
"memory_limit": "4Gi",
|
||||
"database": {
|
||||
"type": "postgresql",
|
||||
"connection_pool_size": 20
|
||||
},
|
||||
"cache": {
|
||||
"type": "redis",
|
||||
"cluster_size": 3
|
||||
}
|
||||
},
|
||||
"compliance_requirements": [
|
||||
"PCI-DSS",
|
||||
"SOX",
|
||||
"GDPR"
|
||||
],
|
||||
"tags": [
|
||||
"payment",
|
||||
"transaction",
|
||||
"critical-path",
|
||||
"revenue-generating"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,113 @@
|
||||
{
|
||||
"name": "customer-portal",
|
||||
"type": "web",
|
||||
"criticality": "high",
|
||||
"user_facing": true,
|
||||
"description": "Customer-facing web application for account management and billing",
|
||||
"team": "frontend",
|
||||
"environment": "production",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "user-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "billing-service",
|
||||
"type": "api",
|
||||
"criticality": "high"
|
||||
},
|
||||
{
|
||||
"name": "notification-service",
|
||||
"type": "api",
|
||||
"criticality": "medium"
|
||||
},
|
||||
{
|
||||
"name": "cdn",
|
||||
"type": "external",
|
||||
"criticality": "medium"
|
||||
}
|
||||
],
|
||||
"pages": [
|
||||
{
|
||||
"path": "/dashboard",
|
||||
"sla_load_time_ms": 2000,
|
||||
"expected_concurrent_users": 1000
|
||||
},
|
||||
{
|
||||
"path": "/billing",
|
||||
"sla_load_time_ms": 3000,
|
||||
"expected_concurrent_users": 200
|
||||
},
|
||||
{
|
||||
"path": "/settings",
|
||||
"sla_load_time_ms": 1500,
|
||||
"expected_concurrent_users": 100
|
||||
}
|
||||
],
|
||||
"business_metrics": {
|
||||
"daily_active_users": {
|
||||
"metric": "count(user_sessions_started_total[1d])",
|
||||
"target": 10000,
|
||||
"unit": "users"
|
||||
},
|
||||
"session_duration": {
|
||||
"metric": "avg(user_session_duration_seconds)",
|
||||
"target": 300,
|
||||
"unit": "seconds"
|
||||
},
|
||||
"bounce_rate": {
|
||||
"metric": "sum(rate(page_views_bounced_total[1h])) / sum(rate(page_views_total[1h]))",
|
||||
"target": 0.3,
|
||||
"unit": "percentage"
|
||||
}
|
||||
},
|
||||
"infrastructure": {
|
||||
"container_orchestrator": "kubernetes",
|
||||
"replicas": 4,
|
||||
"cpu_limit": "1000m",
|
||||
"memory_limit": "2Gi",
|
||||
"storage": {
|
||||
"type": "nfs",
|
||||
"size": "50Gi"
|
||||
},
|
||||
"ingress": {
|
||||
"type": "nginx",
|
||||
"ssl_termination": true,
|
||||
"rate_limiting": {
|
||||
"requests_per_second": 100,
|
||||
"burst": 200
|
||||
}
|
||||
}
|
||||
},
|
||||
"monitoring": {
|
||||
"synthetic_checks": [
|
||||
{
|
||||
"name": "login_flow",
|
||||
"url": "/auth/login",
|
||||
"frequency": "1m",
|
||||
"locations": ["us-east", "eu-west", "ap-south"]
|
||||
},
|
||||
{
|
||||
"name": "checkout_flow",
|
||||
"url": "/billing/checkout",
|
||||
"frequency": "5m",
|
||||
"locations": ["us-east", "eu-west"]
|
||||
}
|
||||
],
|
||||
"rum": {
|
||||
"enabled": true,
|
||||
"sampling_rate": 0.1
|
||||
}
|
||||
},
|
||||
"compliance_requirements": [
|
||||
"GDPR",
|
||||
"CCPA"
|
||||
],
|
||||
"tags": [
|
||||
"frontend",
|
||||
"customer-facing",
|
||||
"billing",
|
||||
"high-traffic"
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user