# Original listing metadata: 35 lines, 1.1 KiB, YAML
# Prometheus alerting rules for the takeoutsaas application services.
groups:
  - name: takeoutsaas-app
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
      # Ratio of 5xx responses to all responses, by request-count rate over a
      # 5-minute window, summed across all series (no per-service grouping).
      # Fires once the ratio has stayed above 0.05 for 5 minutes.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[5m]))
          / sum(rate(http_server_request_duration_seconds_count[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx 错误率过高"
          description: "过去 5 分钟 5xx 占比超过 5%,请检查依赖或发布"

      # 95th-percentile request duration computed per service_name from the
      # histogram buckets over a 5-minute window.
      # Fires once the quantile has stayed above 1 for 5 minutes.
      - alert: HighP95Latency
        expr: |
          histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service_name))
          > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API P95 延迟过高"
          description: "过去 5 分钟 P95 超过 1s,请排查热点接口或依赖"

      # Scrape-health check limited to the admin-api, mini-api and user-api jobs.
      # Fires once `up` has been 0 for 2 minutes.
      - alert: InstanceDown
        expr: up{job=~"admin-api|mini-api|user-api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "实例不可达"
          description: "Prometheus 抓取失败,实例处于 down 状态"