---
# Prometheus alerting rules for the takeoutsaas application.
# One rule group, evaluated every 30s, containing three alerts:
#   - HighErrorRate   (critical): share of 5xx responses above 5%
#   - HighP95Latency  (warning):  95th-percentile request duration above 1s
#   - InstanceDown    (critical): a scrape target of the listed jobs is down
groups:
  - name: takeoutsaas-app
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
      # Ratio of 5xx request rate to total request rate, both computed over a
      # 5-minute window. The alert must hold for 5 minutes before it fires.
      # NOTE(review): both sums aggregate over all series, so this is a single
      # global ratio, whereas HighP95Latency below is evaluated per
      # service_name. Confirm that the difference is intentional.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[5m]))
          /
          sum(rate(http_server_request_duration_seconds_count[5m]))
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx 错误率过高"
          description: "过去 5 分钟 5xx 占比超过 5%,请检查依赖或发布"

      # 95th-percentile request duration estimated from the histogram buckets,
      # computed separately for each service_name over a 5-minute rate window.
      # The metric is in seconds, so the threshold of 1 means one second.
      - alert: HighP95Latency
        expr: |
          histogram_quantile(
            0.95,
            sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service_name)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API P95 延迟过高"
          description: "过去 5 分钟 P95 超过 1s,请排查热点接口或依赖"

      # `up` is 0 when Prometheus fails to scrape a target. Only targets whose
      # job label is admin-api, mini-api or user-api are watched, and the
      # failure must persist for 2 minutes before the alert fires.
      - alert: InstanceDown
        expr: up{job=~"admin-api|mini-api|user-api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "实例不可达"
          description: "Prometheus 抓取失败,实例处于 down 状态"