feat: add tracing enrichment and prometheus exporter
This commit is contained in:
@@ -709,35 +709,37 @@ scrape_configs:
|
||||
- targets: ['node-exporter:9100']
|
||||
```
|
||||
|
||||
### 8.2 应用监控指标
|
||||
### 8.2 应用监控指标(OpenTelemetry + Prometheus Exporter)
|
||||
```csharp
|
||||
// Program.cs - 添加Prometheus监控
|
||||
builder.Services.AddPrometheusMetrics();
|
||||
// Program.cs - 指标与探针
|
||||
builder.Services.AddHealthChecks();
|
||||
builder.Services.AddOpenTelemetry()
|
||||
.WithMetrics(metrics =>
|
||||
{
|
||||
metrics
|
||||
.AddAspNetCoreInstrumentation()
|
||||
.AddHttpClientInstrumentation()
|
||||
.AddRuntimeInstrumentation()
|
||||
.AddPrometheusExporter(); // /metrics
|
||||
});
|
||||
|
||||
app.UseMetricServer(); // /metrics端点
|
||||
app.UseHttpMetrics(); // HTTP请求指标
|
||||
var app = builder.Build();
|
||||
app.MapHealthChecks("/healthz"); // 存活/就绪探针
|
||||
app.MapPrometheusScrapingEndpoint(); // 默认 /metrics
|
||||
```
|
||||
|
||||
// 自定义指标
|
||||
public class MetricsService
|
||||
自定义业务指标(使用 `System.Diagnostics.Metrics`,由 Prometheus Exporter 暴露):
|
||||
```csharp
|
||||
internal static class BusinessMetrics
|
||||
{
|
||||
private static readonly Counter OrderCreatedCounter = Metrics
|
||||
.CreateCounter("orders_created_total", "Total orders created");
|
||||
|
||||
private static readonly Histogram OrderProcessingDuration = Metrics
|
||||
.CreateHistogram("order_processing_duration_seconds", "Order processing duration");
|
||||
|
||||
public void RecordOrderCreated()
|
||||
{
|
||||
OrderCreatedCounter.Inc();
|
||||
}
|
||||
|
||||
public IDisposable MeasureOrderProcessing()
|
||||
{
|
||||
return OrderProcessingDuration.NewTimer();
|
||||
}
|
||||
private static readonly Meter Meter = new("TakeoutSaaS.App", "1.0.0");
|
||||
public static readonly Counter<long> OrdersCreated = Meter.CreateCounter<long>("orders_created_total", "个", "订单创建计数");
|
||||
public static readonly Histogram<double> OrderProcessingSeconds = Meter.CreateHistogram<double>("order_processing_duration_seconds", "s", "订单处理耗时");
|
||||
}
|
||||
```
|
||||
|
||||
Prometheus 抓取示例:见 `deploy/prometheus/prometheus.yml`,默认拉取 `/metrics`,告警规则见 `deploy/prometheus/alert.rules.yml`。
|
||||
|
||||
### 8.3 Grafana仪表板
|
||||
```json
|
||||
{
|
||||
@@ -1007,4 +1009,3 @@ docker-compose up -d --force-recreate --no-deps api
|
||||
docker pull takeout-saas-api:previous-version
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
|
||||
## 4. 安全与合规
|
||||
- [x] RBAC 权限、租户隔离、用户/权限洞察 API 完整演示并在 Swagger 中提供示例。
|
||||
- [ ] 现状梳理:租户解析/过滤已具备(TenantResolutionMiddleware、TenantAwareDbContext),JWT 已写入 roles/permissions/tenant_id(JwtTokenService),PermissionAuthorize 已在 Admin API 使用,CurrentUserProfile 含角色/权限/租户;但仅有内嵌 string[] 权限存储,无角色/权限表与洞察查询,Swagger 缺少示例与多租户示例。
|
||||
- [x] 现状梳理:租户解析/过滤已具备(TenantResolutionMiddleware、TenantAwareDbContext),JWT 已写入 roles/permissions/tenant_id(JwtTokenService),PermissionAuthorize 已在 Admin API 使用,CurrentUserProfile 含角色/权限/租户;但仅有内嵌 string[] 权限存储,无角色/权限表与洞察查询,Swagger 缺少示例与多租户示例。
|
||||
- [x] 差距与步骤:
|
||||
- [x] 增加权限/租户洞察查询(按用户、按租户分页)并确保带 tenant 过滤(TenantAwareDbContext 或 Dapper 参数化)。
|
||||
- [x] 输出可读的角色/权限列表(基于现有种子/配置的只读查询)。【已落地:RBAC1 模型 + 角色/权限管理 API;Swagger 示例后续补充】
|
||||
@@ -41,8 +41,8 @@
|
||||
- [ ] Secret Store/KeyVault/KMS 管理敏感配置,禁止密钥写入 Git/数据库明文。
|
||||
|
||||
## 5. 观测与运维
|
||||
- [ ] TraceId 贯通,并在 Serilog 中输出 Console/File/ELK 三种目标。
|
||||
- [ ] Prometheus exporter 暴露关键指标,/health 探针与告警规则同步推送。
|
||||
- [x] TraceId 贯通,Serilog 输出 Console/File(ELK 待后续配置)。
|
||||
- [x] Prometheus exporter 暴露关键指标(`/metrics`),`/healthz` 探针与告警规则同步推送。
|
||||
- [ ] PostgreSQL 全量/增量备份脚本及一次真实恢复演练报告。
|
||||
|
||||
## 6. 业务能力补全
|
||||
|
||||
34
deploy/prometheus/alert.rules.yml
Normal file
34
deploy/prometheus/alert.rules.yml
Normal file
@@ -0,0 +1,34 @@
|
||||
groups:
|
||||
- name: takeoutsaas-app
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HighErrorRate
|
||||
expr: |
|
||||
sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[5m]))
|
||||
/ sum(rate(http_server_request_duration_seconds_count[5m])) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "API 5xx 错误率过高"
|
||||
description: "过去 5 分钟 5xx 占比超过 5%,请检查依赖或发布"
|
||||
|
||||
- alert: HighP95Latency
|
||||
expr: |
|
||||
histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service_name))
|
||||
> 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "API P95 延迟过高"
|
||||
description: "过去 5 分钟 P95 超过 1s,请排查热点接口或依赖"
|
||||
|
||||
- alert: InstanceDown
|
||||
expr: up{job=~"admin-api|mini-api|user-api"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "实例不可达"
|
||||
description: "Prometheus 抓取失败,实例处于 down 状态"
|
||||
28
deploy/prometheus/prometheus.yml
Normal file
28
deploy/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,28 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 30s
|
||||
|
||||
rule_files:
|
||||
- alert.rules.yml
|
||||
|
||||
scrape_configs:
|
||||
- job_name: admin-api
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["admin-api:8080"]
|
||||
labels:
|
||||
service: admin-api
|
||||
|
||||
- job_name: mini-api
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["mini-api:8080"]
|
||||
labels:
|
||||
service: mini-api
|
||||
|
||||
- job_name: user-api
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["user-api:8080"]
|
||||
labels:
|
||||
service: user-api
|
||||
@@ -27,6 +27,7 @@ using TakeoutSaaS.Shared.Web.Extensions;
|
||||
using TakeoutSaaS.Shared.Web.Swagger;
|
||||
|
||||
var builder = WebApplication.CreateBuilder(args);
|
||||
const string logTemplate = "[{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} {Level:u3}] [TraceId:{TraceId}] [SpanId:{SpanId}] [Service:{Service}] {SourceContext} {Message:lj}{NewLine}{Exception}";
|
||||
|
||||
builder.Configuration
|
||||
.AddJsonFile("appsettings.Seed.json", optional: true, reloadOnChange: true)
|
||||
@@ -37,12 +38,13 @@ builder.Host.UseSerilog((context, _, configuration) =>
|
||||
configuration
|
||||
.Enrich.FromLogContext()
|
||||
.Enrich.WithProperty("Service", "AdminApi")
|
||||
.WriteTo.Console()
|
||||
.WriteTo.Console(outputTemplate: logTemplate)
|
||||
.WriteTo.File(
|
||||
"logs/admin-api-.log",
|
||||
rollingInterval: RollingInterval.Day,
|
||||
retainedFileCountLimit: 7,
|
||||
shared: true);
|
||||
shared: true,
|
||||
outputTemplate: logTemplate);
|
||||
});
|
||||
|
||||
builder.Services.AddSharedWebCore();
|
||||
@@ -68,6 +70,7 @@ builder.Services.AddSmsApplication(builder.Configuration);
|
||||
builder.Services.AddMessagingModule(builder.Configuration);
|
||||
builder.Services.AddMessagingApplication();
|
||||
builder.Services.AddSchedulerModule(builder.Configuration);
|
||||
builder.Services.AddHealthChecks();
|
||||
var otelSection = builder.Configuration.GetSection("Otel");
|
||||
var otelEndpoint = otelSection.GetValue<string>("Endpoint");
|
||||
var useConsoleExporter = otelSection.GetValue<bool?>("UseConsoleExporter") ?? builder.Environment.IsDevelopment();
|
||||
@@ -102,7 +105,8 @@ builder.Services.AddOpenTelemetry()
|
||||
metrics
|
||||
.AddAspNetCoreInstrumentation()
|
||||
.AddHttpClientInstrumentation()
|
||||
.AddRuntimeInstrumentation();
|
||||
.AddRuntimeInstrumentation()
|
||||
.AddPrometheusExporter();
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(otelEndpoint))
|
||||
{
|
||||
@@ -137,6 +141,8 @@ app.UseAuthorization();
|
||||
app.UseSharedSwagger();
|
||||
app.UseSchedulerDashboard(builder.Configuration);
|
||||
|
||||
app.MapHealthChecks("/healthz");
|
||||
app.MapPrometheusScrapingEndpoint();
|
||||
app.MapControllers();
|
||||
app.Run();
|
||||
|
||||
|
||||
Binary file not shown.
@@ -17,18 +17,20 @@ using TakeoutSaaS.Shared.Web.Extensions;
|
||||
using TakeoutSaaS.Shared.Web.Swagger;
|
||||
|
||||
var builder = WebApplication.CreateBuilder(args);
|
||||
const string logTemplate = "[{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} {Level:u3}] [TraceId:{TraceId}] [SpanId:{SpanId}] [Service:{Service}] {SourceContext} {Message:lj}{NewLine}{Exception}";
|
||||
|
||||
builder.Host.UseSerilog((_, _, configuration) =>
|
||||
{
|
||||
configuration
|
||||
.Enrich.FromLogContext()
|
||||
.Enrich.WithProperty("Service", "MiniApi")
|
||||
.WriteTo.Console()
|
||||
.WriteTo.Console(outputTemplate: logTemplate)
|
||||
.WriteTo.File(
|
||||
"logs/mini-api-.log",
|
||||
rollingInterval: RollingInterval.Day,
|
||||
retainedFileCountLimit: 7,
|
||||
shared: true);
|
||||
shared: true,
|
||||
outputTemplate: logTemplate);
|
||||
});
|
||||
|
||||
builder.Services.AddSharedWebCore();
|
||||
@@ -45,6 +47,7 @@ builder.Services.AddSmsModule(builder.Configuration);
|
||||
builder.Services.AddSmsApplication(builder.Configuration);
|
||||
builder.Services.AddMessagingModule(builder.Configuration);
|
||||
builder.Services.AddMessagingApplication();
|
||||
builder.Services.AddHealthChecks();
|
||||
var otelSection = builder.Configuration.GetSection("Otel");
|
||||
var otelEndpoint = otelSection.GetValue<string>("Endpoint");
|
||||
var useConsoleExporter = otelSection.GetValue<bool?>("UseConsoleExporter") ?? builder.Environment.IsDevelopment();
|
||||
@@ -79,7 +82,8 @@ builder.Services.AddOpenTelemetry()
|
||||
metrics
|
||||
.AddAspNetCoreInstrumentation()
|
||||
.AddHttpClientInstrumentation()
|
||||
.AddRuntimeInstrumentation();
|
||||
.AddRuntimeInstrumentation()
|
||||
.AddPrometheusExporter();
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(otelEndpoint))
|
||||
{
|
||||
@@ -111,6 +115,8 @@ app.UseTenantResolution();
|
||||
app.UseSharedWebCore();
|
||||
app.UseSharedSwagger();
|
||||
|
||||
app.MapHealthChecks("/healthz");
|
||||
app.MapPrometheusScrapingEndpoint();
|
||||
app.MapControllers();
|
||||
app.Run();
|
||||
|
||||
|
||||
Binary file not shown.
@@ -11,18 +11,20 @@ using TakeoutSaaS.Shared.Web.Extensions;
|
||||
using TakeoutSaaS.Shared.Web.Swagger;
|
||||
|
||||
var builder = WebApplication.CreateBuilder(args);
|
||||
const string logTemplate = "[{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} {Level:u3}] [TraceId:{TraceId}] [SpanId:{SpanId}] [Service:{Service}] {SourceContext} {Message:lj}{NewLine}{Exception}";
|
||||
|
||||
builder.Host.UseSerilog((_, _, configuration) =>
|
||||
{
|
||||
configuration
|
||||
.Enrich.FromLogContext()
|
||||
.Enrich.WithProperty("Service", "UserApi")
|
||||
.WriteTo.Console()
|
||||
.WriteTo.Console(outputTemplate: logTemplate)
|
||||
.WriteTo.File(
|
||||
"logs/user-api-.log",
|
||||
rollingInterval: RollingInterval.Day,
|
||||
retainedFileCountLimit: 7,
|
||||
shared: true);
|
||||
shared: true,
|
||||
outputTemplate: logTemplate);
|
||||
});
|
||||
|
||||
builder.Services.AddSharedWebCore();
|
||||
@@ -33,6 +35,7 @@ builder.Services.AddSharedSwagger(options =>
|
||||
options.EnableAuthorization = true;
|
||||
});
|
||||
builder.Services.AddTenantResolution(builder.Configuration);
|
||||
builder.Services.AddHealthChecks();
|
||||
var otelSection = builder.Configuration.GetSection("Otel");
|
||||
var otelEndpoint = otelSection.GetValue<string>("Endpoint");
|
||||
var useConsoleExporter = otelSection.GetValue<bool?>("UseConsoleExporter") ?? builder.Environment.IsDevelopment();
|
||||
@@ -67,7 +70,8 @@ builder.Services.AddOpenTelemetry()
|
||||
metrics
|
||||
.AddAspNetCoreInstrumentation()
|
||||
.AddHttpClientInstrumentation()
|
||||
.AddRuntimeInstrumentation();
|
||||
.AddRuntimeInstrumentation()
|
||||
.AddPrometheusExporter();
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(otelEndpoint))
|
||||
{
|
||||
@@ -99,6 +103,8 @@ app.UseTenantResolution();
|
||||
app.UseSharedWebCore();
|
||||
app.UseSharedSwagger();
|
||||
|
||||
app.MapHealthChecks("/healthz");
|
||||
app.MapPrometheusScrapingEndpoint();
|
||||
app.MapControllers();
|
||||
app.Run();
|
||||
|
||||
|
||||
Binary file not shown.
@@ -3,11 +3,12 @@ using System.Threading;
|
||||
namespace TakeoutSaaS.Shared.Abstractions.Diagnostics;
|
||||
|
||||
/// <summary>
|
||||
/// 轻量级 TraceId 上下文,便于跨层访问当前请求的追踪标识。
|
||||
/// 轻量级 TraceId/SpanId 上下文,便于跨层访问当前请求的追踪标识。
|
||||
/// </summary>
|
||||
public static class TraceContext
|
||||
{
|
||||
private static readonly AsyncLocal<string?> TraceIdHolder = new();
|
||||
private static readonly AsyncLocal<string?> SpanIdHolder = new();
|
||||
|
||||
/// <summary>
|
||||
/// 当前请求的 TraceId。
|
||||
@@ -18,8 +19,21 @@ public static class TraceContext
|
||||
set => TraceIdHolder.Value = value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// 当前请求的 SpanId。
|
||||
/// </summary>
|
||||
public static string? SpanId
|
||||
{
|
||||
get => SpanIdHolder.Value;
|
||||
set => SpanIdHolder.Value = value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// 清理 TraceId 与 SpanId,避免 AsyncLocal 污染其它请求。
|
||||
/// </summary>
|
||||
public static void Clear() => TraceIdHolder.Value = null;
|
||||
public static void Clear()
|
||||
{
|
||||
TraceIdHolder.Value = null;
|
||||
SpanIdHolder.Value = null;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.Extensions.Logging;
|
||||
@@ -14,23 +15,43 @@ namespace TakeoutSaaS.Shared.Web.Middleware;
|
||||
public sealed class CorrelationIdMiddleware(RequestDelegate next, ILogger<CorrelationIdMiddleware> logger, IIdGenerator idGenerator)
|
||||
{
|
||||
private const string TraceHeader = "X-Trace-Id";
|
||||
private const string SpanHeader = "X-Span-Id";
|
||||
private const string RequestHeader = "X-Request-Id";
|
||||
|
||||
public async Task InvokeAsync(HttpContext context)
|
||||
{
|
||||
var traceId = ResolveTraceId(context);
|
||||
var ownsActivity = Activity.Current is null;
|
||||
var activity = Activity.Current ?? new Activity("TakeoutSaaS.Request");
|
||||
|
||||
if (activity.Id is null)
|
||||
{
|
||||
activity.SetIdFormat(ActivityIdFormat.W3C);
|
||||
activity.Start();
|
||||
}
|
||||
|
||||
var traceId = activity.TraceId.ToString();
|
||||
var spanId = activity.SpanId.ToString();
|
||||
|
||||
if (string.IsNullOrWhiteSpace(traceId))
|
||||
{
|
||||
traceId = ResolveTraceId(context);
|
||||
}
|
||||
|
||||
context.TraceIdentifier = traceId;
|
||||
TraceContext.TraceId = traceId;
|
||||
TraceContext.SpanId = spanId;
|
||||
|
||||
context.Response.OnStarting(() =>
|
||||
{
|
||||
context.Response.Headers[TraceHeader] = traceId;
|
||||
context.Response.Headers[SpanHeader] = spanId;
|
||||
return Task.CompletedTask;
|
||||
});
|
||||
|
||||
using (logger.BeginScope(new Dictionary<string, object>
|
||||
{
|
||||
["TraceId"] = traceId
|
||||
["TraceId"] = traceId,
|
||||
["SpanId"] = spanId
|
||||
}))
|
||||
{
|
||||
try
|
||||
@@ -40,6 +61,10 @@ public sealed class CorrelationIdMiddleware(RequestDelegate next, ILogger<Correl
|
||||
finally
|
||||
{
|
||||
TraceContext.Clear();
|
||||
if (ownsActivity)
|
||||
{
|
||||
activity.Stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,13 +23,15 @@ public sealed class RequestLoggingMiddleware(RequestDelegate next, ILogger<Reque
|
||||
{
|
||||
stopwatch.Stop();
|
||||
var traceId = TraceContext.TraceId ?? context.TraceIdentifier;
|
||||
var spanId = TraceContext.SpanId ?? Activity.Current?.SpanId.ToString() ?? string.Empty;
|
||||
logger.LogInformation(
|
||||
"HTTP {Method} {Path} => {StatusCode} ({Elapsed} ms) TraceId:{TraceId}",
|
||||
"HTTP {Method} {Path} => {StatusCode} ({Elapsed} ms) TraceId:{TraceId} SpanId:{SpanId}",
|
||||
context.Request.Method,
|
||||
context.Request.Path,
|
||||
context.Response.StatusCode,
|
||||
stopwatch.Elapsed.TotalMilliseconds,
|
||||
traceId);
|
||||
traceId,
|
||||
spanId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user