feat: add tracing enrichment and prometheus exporter

This commit is contained in:
msumshk
2025-12-02 22:29:38 +08:00
parent 0d2ad0aecb
commit 2121432d5d
13 changed files with 163 additions and 41 deletions

View File

@@ -709,35 +709,37 @@ scrape_configs:
- targets: ['node-exporter:9100']
```
### 8.2 应用监控指标
### 8.2 应用监控指标(OpenTelemetry + Prometheus Exporter)
```csharp
// Program.cs - 添加Prometheus监控
builder.Services.AddPrometheusMetrics();
app.UseMetricServer(); // /metrics端点
app.UseHttpMetrics(); // HTTP请求指标
// 自定义指标
public class MetricsService
// Program.cs - 指标与探针
builder.Services.AddHealthChecks();
builder.Services.AddOpenTelemetry()
.WithMetrics(metrics =>
{
private static readonly Counter OrderCreatedCounter = Metrics
.CreateCounter("orders_created_total", "Total orders created");
metrics
.AddAspNetCoreInstrumentation()
.AddHttpClientInstrumentation()
.AddRuntimeInstrumentation()
.AddPrometheusExporter(); // /metrics
});
private static readonly Histogram OrderProcessingDuration = Metrics
.CreateHistogram("order_processing_duration_seconds", "Order processing duration");
var app = builder.Build();
app.MapHealthChecks("/healthz"); // 存活/就绪探针
app.MapPrometheusScrapingEndpoint(); // 默认 /metrics
```
public void RecordOrderCreated()
自定义业务指标(使用 `System.Diagnostics.Metrics`,由 Prometheus Exporter 暴露):
```csharp
internal static class BusinessMetrics
{
OrderCreatedCounter.Inc();
}
public IDisposable MeasureOrderProcessing()
{
return OrderProcessingDuration.NewTimer();
}
private static readonly Meter Meter = new("TakeoutSaaS.App", "1.0.0");
public static readonly Counter<long> OrdersCreated = Meter.CreateCounter<long>("orders_created_total", "个", "订单创建计数");
public static readonly Histogram<double> OrderProcessingSeconds = Meter.CreateHistogram<double>("order_processing_duration_seconds", "s", "订单处理耗时");
}
```
Prometheus 抓取示例:见 `deploy/prometheus/prometheus.yml`,默认拉取 `/metrics`,告警规则见 `deploy/prometheus/alert.rules.yml`
### 8.3 Grafana仪表板
```json
{
@@ -1007,4 +1009,3 @@ docker-compose up -d --force-recreate --no-deps api
docker pull takeout-saas-api:previous-version
docker-compose up -d
```

View File

@@ -28,7 +28,7 @@
## 4. 安全与合规
- [x] RBAC 权限、租户隔离、用户/权限洞察 API 完整演示并在 Swagger 中提供示例。
- [ ] 现状梳理:租户解析/过滤已具备TenantResolutionMiddleware、TenantAwareDbContextJWT 已写入 roles/permissions/tenant_idJwtTokenServicePermissionAuthorize 已在 Admin API 使用CurrentUserProfile 含角色/权限/租户;但仅有内嵌 string[] 权限存储,无角色/权限表与洞察查询Swagger 缺少示例与多租户示例。
- [x] 现状梳理:租户解析/过滤已具备(TenantResolutionMiddleware、TenantAwareDbContext),JWT 已写入 roles/permissions/tenant_id(JwtTokenService),PermissionAuthorize 已在 Admin API 使用,CurrentUserProfile 含角色/权限/租户;但仅有内嵌 string[] 权限存储,无角色/权限表与洞察查询,Swagger 缺少示例与多租户示例。
- [x] 差距与步骤:
- [x] 增加权限/租户洞察查询(按用户、按租户分页)并确保带 tenant 过滤(TenantAwareDbContext 或 Dapper 参数化)。
- [x] 输出可读的角色/权限列表(基于现有种子/配置的只读查询)。【已落地:RBAC1 模型 + 角色/权限管理 API,Swagger 示例后续补充】
@@ -41,8 +41,8 @@
- [ ] Secret Store/KeyVault/KMS 管理敏感配置,禁止密钥写入 Git/数据库明文。
## 5. 观测与运维
- [ ] TraceId 贯通,并在 Serilog 输出 Console/File/ELK 三种目标
- [ ] Prometheus exporter 暴露关键指标,/health 探针与告警规则同步推送。
- [x] TraceId 贯通,Serilog 输出 Console/File(ELK 待后续配置)
- [x] Prometheus exporter 暴露关键指标,/health 探针与告警规则同步推送。
- [ ] PostgreSQL 全量/增量备份脚本及一次真实恢复演练报告。
## 6. 业务能力补全

View File

@@ -0,0 +1,34 @@
# Alerting rules for the TakeoutSaaS HTTP APIs.
# All expressions are built on the http_server_request_duration_seconds histogram
# scraped from each API's /metrics endpoint.
groups:
  - name: takeoutsaas-app
    # Evaluate this group every 30 seconds.
    interval: 30s
    rules:
      # Fires when more than 5% of requests over the last 5 minutes returned a 5xx status.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[5m]))
          / sum(rate(http_server_request_duration_seconds_count[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx 错误率过高"
          description: "过去 5 分钟 5xx 占比超过 5%,请检查依赖或发布"
      # Fires when the 95th-percentile request duration stays above 1 second.
      # Buckets are grouped by `service`, the label attached to every target in
      # prometheus.yml's static_configs, so each API gets its own P95 series.
      # (Grouping by a label that is absent from the samples would merge all
      # services into a single quantile.)
      - alert: HighP95Latency
        expr: |
          histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service))
          > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API P95 延迟过高"
          description: "过去 5 分钟 P95 超过 1s请排查热点接口或依赖"
      # Fires when Prometheus has failed to scrape one of the API jobs for 2 minutes.
      - alert: InstanceDown
        expr: up{job=~"admin-api|mini-api|user-api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "实例不可达"
          description: "Prometheus 抓取失败,实例处于 down 状态"

View File

@@ -0,0 +1,28 @@
# Prometheus server configuration for the three TakeoutSaaS APIs.
global:
  # Pull metrics from every target every 15 seconds.
  scrape_interval: 15s
  # Re-evaluate rules every 30 seconds.
  evaluation_interval: 30s

# Rule file path as written here (no directory component).
rule_files:
  - alert.rules.yml

# One job per API; each target carries a static `service` label.
scrape_configs:
  - job_name: admin-api
    metrics_path: /metrics
    static_configs:
      - targets:
          - "admin-api:8080"
        labels:
          service: admin-api

  - job_name: mini-api
    metrics_path: /metrics
    static_configs:
      - targets:
          - "mini-api:8080"
        labels:
          service: mini-api

  - job_name: user-api
    metrics_path: /metrics
    static_configs:
      - targets:
          - "user-api:8080"
        labels:
          service: user-api

View File

@@ -27,6 +27,7 @@ using TakeoutSaaS.Shared.Web.Extensions;
using TakeoutSaaS.Shared.Web.Swagger;
var builder = WebApplication.CreateBuilder(args);
const string logTemplate = "[{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} {Level:u3}] [TraceId:{TraceId}] [SpanId:{SpanId}] [Service:{Service}] {SourceContext} {Message:lj}{NewLine}{Exception}";
builder.Configuration
.AddJsonFile("appsettings.Seed.json", optional: true, reloadOnChange: true)
@@ -37,12 +38,13 @@ builder.Host.UseSerilog((context, _, configuration) =>
configuration
.Enrich.FromLogContext()
.Enrich.WithProperty("Service", "AdminApi")
.WriteTo.Console()
.WriteTo.Console(outputTemplate: logTemplate)
.WriteTo.File(
"logs/admin-api-.log",
rollingInterval: RollingInterval.Day,
retainedFileCountLimit: 7,
shared: true);
shared: true,
outputTemplate: logTemplate);
});
builder.Services.AddSharedWebCore();
@@ -68,6 +70,7 @@ builder.Services.AddSmsApplication(builder.Configuration);
builder.Services.AddMessagingModule(builder.Configuration);
builder.Services.AddMessagingApplication();
builder.Services.AddSchedulerModule(builder.Configuration);
builder.Services.AddHealthChecks();
var otelSection = builder.Configuration.GetSection("Otel");
var otelEndpoint = otelSection.GetValue<string>("Endpoint");
var useConsoleExporter = otelSection.GetValue<bool?>("UseConsoleExporter") ?? builder.Environment.IsDevelopment();
@@ -102,7 +105,8 @@ builder.Services.AddOpenTelemetry()
metrics
.AddAspNetCoreInstrumentation()
.AddHttpClientInstrumentation()
.AddRuntimeInstrumentation();
.AddRuntimeInstrumentation()
.AddPrometheusExporter();
if (!string.IsNullOrWhiteSpace(otelEndpoint))
{
@@ -137,6 +141,8 @@ app.UseAuthorization();
app.UseSharedSwagger();
app.UseSchedulerDashboard(builder.Configuration);
app.MapHealthChecks("/healthz");
app.MapPrometheusScrapingEndpoint();
app.MapControllers();
app.Run();

View File

@@ -17,18 +17,20 @@ using TakeoutSaaS.Shared.Web.Extensions;
using TakeoutSaaS.Shared.Web.Swagger;
var builder = WebApplication.CreateBuilder(args);
const string logTemplate = "[{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} {Level:u3}] [TraceId:{TraceId}] [SpanId:{SpanId}] [Service:{Service}] {SourceContext} {Message:lj}{NewLine}{Exception}";
builder.Host.UseSerilog((_, _, configuration) =>
{
configuration
.Enrich.FromLogContext()
.Enrich.WithProperty("Service", "MiniApi")
.WriteTo.Console()
.WriteTo.Console(outputTemplate: logTemplate)
.WriteTo.File(
"logs/mini-api-.log",
rollingInterval: RollingInterval.Day,
retainedFileCountLimit: 7,
shared: true);
shared: true,
outputTemplate: logTemplate);
});
builder.Services.AddSharedWebCore();
@@ -45,6 +47,7 @@ builder.Services.AddSmsModule(builder.Configuration);
builder.Services.AddSmsApplication(builder.Configuration);
builder.Services.AddMessagingModule(builder.Configuration);
builder.Services.AddMessagingApplication();
builder.Services.AddHealthChecks();
var otelSection = builder.Configuration.GetSection("Otel");
var otelEndpoint = otelSection.GetValue<string>("Endpoint");
var useConsoleExporter = otelSection.GetValue<bool?>("UseConsoleExporter") ?? builder.Environment.IsDevelopment();
@@ -79,7 +82,8 @@ builder.Services.AddOpenTelemetry()
metrics
.AddAspNetCoreInstrumentation()
.AddHttpClientInstrumentation()
.AddRuntimeInstrumentation();
.AddRuntimeInstrumentation()
.AddPrometheusExporter();
if (!string.IsNullOrWhiteSpace(otelEndpoint))
{
@@ -111,6 +115,8 @@ app.UseTenantResolution();
app.UseSharedWebCore();
app.UseSharedSwagger();
app.MapHealthChecks("/healthz");
app.MapPrometheusScrapingEndpoint();
app.MapControllers();
app.Run();

View File

@@ -11,18 +11,20 @@ using TakeoutSaaS.Shared.Web.Extensions;
using TakeoutSaaS.Shared.Web.Swagger;
var builder = WebApplication.CreateBuilder(args);
const string logTemplate = "[{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} {Level:u3}] [TraceId:{TraceId}] [SpanId:{SpanId}] [Service:{Service}] {SourceContext} {Message:lj}{NewLine}{Exception}";
builder.Host.UseSerilog((_, _, configuration) =>
{
configuration
.Enrich.FromLogContext()
.Enrich.WithProperty("Service", "UserApi")
.WriteTo.Console()
.WriteTo.Console(outputTemplate: logTemplate)
.WriteTo.File(
"logs/user-api-.log",
rollingInterval: RollingInterval.Day,
retainedFileCountLimit: 7,
shared: true);
shared: true,
outputTemplate: logTemplate);
});
builder.Services.AddSharedWebCore();
@@ -33,6 +35,7 @@ builder.Services.AddSharedSwagger(options =>
options.EnableAuthorization = true;
});
builder.Services.AddTenantResolution(builder.Configuration);
builder.Services.AddHealthChecks();
var otelSection = builder.Configuration.GetSection("Otel");
var otelEndpoint = otelSection.GetValue<string>("Endpoint");
var useConsoleExporter = otelSection.GetValue<bool?>("UseConsoleExporter") ?? builder.Environment.IsDevelopment();
@@ -67,7 +70,8 @@ builder.Services.AddOpenTelemetry()
metrics
.AddAspNetCoreInstrumentation()
.AddHttpClientInstrumentation()
.AddRuntimeInstrumentation();
.AddRuntimeInstrumentation()
.AddPrometheusExporter();
if (!string.IsNullOrWhiteSpace(otelEndpoint))
{
@@ -99,6 +103,8 @@ app.UseTenantResolution();
app.UseSharedWebCore();
app.UseSharedSwagger();
app.MapHealthChecks("/healthz");
app.MapPrometheusScrapingEndpoint();
app.MapControllers();
app.Run();

View File

@@ -3,11 +3,12 @@ using System.Threading;
namespace TakeoutSaaS.Shared.Abstractions.Diagnostics;
/// <summary>
/// 轻量级 TraceId 上下文,便于跨层访问当前请求的追踪标识。
/// 轻量级 TraceId/SpanId 上下文,便于跨层访问当前请求的追踪标识。
/// </summary>
public static class TraceContext
{
private static readonly AsyncLocal<string?> TraceIdHolder = new();
private static readonly AsyncLocal<string?> SpanIdHolder = new();
/// <summary>
/// 当前请求的 TraceId。
@@ -18,8 +19,21 @@ public static class TraceContext
set => TraceIdHolder.Value = value;
}
/// <summary>
/// Gets or sets the SpanId associated with the current asynchronous flow.
/// The value is kept in an <see cref="AsyncLocal{T}"/> slot and is <c>null</c> until assigned.
/// </summary>
public static string? SpanId
{
    get
    {
        return SpanIdHolder.Value;
    }
    set
    {
        SpanIdHolder.Value = value;
    }
}
/// <summary>
/// 清理 TraceId/SpanId,避免 AsyncLocal 污染其它请求。
/// </summary>
public static void Clear() => TraceIdHolder.Value = null;
public static void Clear()
{
    // Reset both ambient slots together so neither identifier outlives the request that set it.
    SpanIdHolder.Value = null;
    TraceIdHolder.Value = null;
}
}

View File

@@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.Logging;
@@ -14,23 +15,43 @@ namespace TakeoutSaaS.Shared.Web.Middleware;
public sealed class CorrelationIdMiddleware(RequestDelegate next, ILogger<CorrelationIdMiddleware> logger, IIdGenerator idGenerator)
{
private const string TraceHeader = "X-Trace-Id";
private const string SpanHeader = "X-Span-Id";
private const string RequestHeader = "X-Request-Id";
public async Task InvokeAsync(HttpContext context)
{
var traceId = ResolveTraceId(context);
var ownsActivity = Activity.Current is null;
var activity = Activity.Current ?? new Activity("TakeoutSaaS.Request");
if (activity.Id is null)
{
activity.SetIdFormat(ActivityIdFormat.W3C);
activity.Start();
}
var traceId = activity.TraceId.ToString();
var spanId = activity.SpanId.ToString();
if (string.IsNullOrWhiteSpace(traceId))
{
traceId = ResolveTraceId(context);
}
context.TraceIdentifier = traceId;
TraceContext.TraceId = traceId;
TraceContext.SpanId = spanId;
context.Response.OnStarting(() =>
{
context.Response.Headers[TraceHeader] = traceId;
context.Response.Headers[SpanHeader] = spanId;
return Task.CompletedTask;
});
using (logger.BeginScope(new Dictionary<string, object>
{
["TraceId"] = traceId
["TraceId"] = traceId,
["SpanId"] = spanId
}))
{
try
@@ -40,6 +61,10 @@ public sealed class CorrelationIdMiddleware(RequestDelegate next, ILogger<Correl
finally
{
TraceContext.Clear();
if (ownsActivity)
{
activity.Stop();
}
}
}
}

View File

@@ -23,13 +23,15 @@ public sealed class RequestLoggingMiddleware(RequestDelegate next, ILogger<Reque
{
stopwatch.Stop();
var traceId = TraceContext.TraceId ?? context.TraceIdentifier;
var spanId = TraceContext.SpanId ?? Activity.Current?.SpanId.ToString() ?? string.Empty;
logger.LogInformation(
"HTTP {Method} {Path} => {StatusCode} ({Elapsed} ms) TraceId:{TraceId}",
"HTTP {Method} {Path} => {StatusCode} ({Elapsed} ms) TraceId:{TraceId} SpanId:{SpanId}",
context.Request.Method,
context.Request.Path,
context.Response.StatusCode,
stopwatch.Elapsed.TotalMilliseconds,
traceId);
traceId,
spanId);
}
}
}