feat: add tracing enrichment and prometheus exporter
This commit is contained in:
34
deploy/prometheus/alert.rules.yml
Normal file
34
deploy/prometheus/alert.rules.yml
Normal file
@@ -0,0 +1,34 @@
|
||||
# Prometheus alerting rules for the takeoutsaas-app rule group.
# Rules in this group are evaluated every 30s (the group-level `interval`).
groups:
  - name: takeoutsaas-app
    interval: 30s
    rules:
      # Share of requests answered with a 5xx status code, measured over a
      # 5m rate window. Both sums have no `by` clause, so this is one ratio
      # across every scraped series. Fires once the ratio has stayed above
      # 0.05 (5%) for 5m.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[5m]))
          / sum(rate(http_server_request_duration_seconds_count[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx 错误率过高"
          description: "过去 5 分钟 5xx 占比超过 5%,请检查依赖或发布"

      # 95th-percentile request duration, computed from the histogram
      # buckets and grouped by `service_name`. Fires once the value has
      # stayed above 1 (seconds, per the metric name) for 5m.
      - alert: HighP95Latency
        expr: |
          histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service_name))
          > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API P95 延迟过高"
          description: "过去 5 分钟 P95 超过 1s,请排查热点接口或依赖"

      # Scrape health for the admin-api, mini-api and user-api jobs.
      # Fires once `up` has been 0 for a target of those jobs for 2m.
      - alert: InstanceDown
        expr: up{job=~"admin-api|mini-api|user-api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "实例不可达"
          description: "Prometheus 抓取失败,实例处于 down 状态"
|
||||
Reference in New Issue
Block a user