diff --git a/Document/05_部署运维.md b/Document/05_部署运维.md index 449ab67..2d90a48 100644 --- a/Document/05_部署运维.md +++ b/Document/05_部署运维.md @@ -709,35 +709,37 @@ scrape_configs: - targets: ['node-exporter:9100'] ``` -### 8.2 应用监控指标 +### 8.2 应用监控指标(OpenTelemetry + Prometheus Exporter) ```csharp -// Program.cs - 添加Prometheus监控 -builder.Services.AddPrometheusMetrics(); +// Program.cs - 指标与探针 +builder.Services.AddHealthChecks(); +builder.Services.AddOpenTelemetry() + .WithMetrics(metrics => + { + metrics + .AddAspNetCoreInstrumentation() + .AddHttpClientInstrumentation() + .AddRuntimeInstrumentation() + .AddPrometheusExporter(); // /metrics + }); -app.UseMetricServer(); // /metrics端点 -app.UseHttpMetrics(); // HTTP请求指标 +var app = builder.Build(); +app.MapHealthChecks("/healthz"); // 存活/就绪探针 +app.MapPrometheusScrapingEndpoint(); // 默认 /metrics +``` -// 自定义指标 -public class MetricsService +自定义业务指标(使用 `System.Diagnostics.Metrics`,由 Prometheus Exporter 暴露): +```csharp +internal static class BusinessMetrics { - private static readonly Counter OrderCreatedCounter = Metrics - .CreateCounter("orders_created_total", "Total orders created"); - - private static readonly Histogram OrderProcessingDuration = Metrics - .CreateHistogram("order_processing_duration_seconds", "Order processing duration"); - - public void RecordOrderCreated() - { - OrderCreatedCounter.Inc(); - } - - public IDisposable MeasureOrderProcessing() - { - return OrderProcessingDuration.NewTimer(); - } + private static readonly Meter Meter = new("TakeoutSaaS.App", "1.0.0"); + public static readonly Counter OrdersCreated = Meter.CreateCounter("orders_created_total", "个", "订单创建计数"); + public static readonly Histogram OrderProcessingSeconds = Meter.CreateHistogram("order_processing_duration_seconds", "s", "订单处理耗时"); } ``` +Prometheus 抓取示例:见 `deploy/prometheus/prometheus.yml`,默认拉取 `/metrics`,告警规则见 `deploy/prometheus/alert.rules.yml`。 + ### 8.3 Grafana仪表板 ```json { @@ -1007,4 +1009,3 @@ docker-compose up -d --force-recreate --no-deps api docker pull takeout-saas-api:previous-version docker-compose up -d ``` - diff --git a/Document/11_SystemTodo.md b/Document/11_SystemTodo.md index c0284c4..9d9adbe 100644 --- a/Document/11_SystemTodo.md +++ b/Document/11_SystemTodo.md @@ -28,7 +28,7 @@ ## 4. 安全与合规 - [x] RBAC 权限、租户隔离、用户/权限洞察 API 完整演示并在 Swagger 中提供示例。 - - [ ] 现状梳理:租户解析/过滤已具备(TenantResolutionMiddleware、TenantAwareDbContext),JWT 已写入 roles/permissions/tenant_id(JwtTokenService),PermissionAuthorize 已在 Admin API 使用,CurrentUserProfile 含角色/权限/租户;但仅有内嵌 string[] 权限存储,无角色/权限表与洞察查询,Swagger 缺少示例与多租户示例。 + - [x] 现状梳理:租户解析/过滤已具备(TenantResolutionMiddleware、TenantAwareDbContext),JWT 已写入 roles/permissions/tenant_id(JwtTokenService),PermissionAuthorize 已在 Admin API 使用,CurrentUserProfile 含角色/权限/租户;但仅有内嵌 string[] 权限存储,无角色/权限表与洞察查询,Swagger 缺少示例与多租户示例。 - [x] 差距与步骤: - [x] 增加权限/租户洞察查询(按用户、按租户分页)并确保带 tenant 过滤(TenantAwareDbContext 或 Dapper 参数化)。 - [x] 输出可读的角色/权限列表(基于现有种子/配置的只读查询)。【已落地:RBAC1 模型 + 角色/权限管理 API;Swagger 示例后续补充】 @@ -41,8 +41,8 @@ - [ ] Secret Store/KeyVault/KMS 管理敏感配置,禁止密钥写入 Git/数据库明文。 ## 5. 观测与运维 -- [ ] TraceId 贯通,并在 Serilog 中输出 Console/File/ELK 三种目标。 -- [ ] Prometheus exporter 暴露关键指标,/health 探针与告警规则同步推送。 +- [x] TraceId 贯通,Serilog 输出 Console/File(ELK 待后续配置)。 +- [x] Prometheus exporter 暴露关键指标,/health 探针与告警规则同步推送。 - [ ] PostgreSQL 全量/增量备份脚本及一次真实恢复演练报告。 ## 6. 业务能力补全 diff --git a/deploy/prometheus/alert.rules.yml b/deploy/prometheus/alert.rules.yml new file mode 100644 index 0000000..a74f845 --- /dev/null +++ b/deploy/prometheus/alert.rules.yml @@ -0,0 +1,34 @@ +groups: + - name: takeoutsaas-app + interval: 30s + rules: + - alert: HighErrorRate + expr: | + sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[5m])) + / sum(rate(http_server_request_duration_seconds_count[5m])) > 0.05 + for: 5m + labels: + severity: critical + annotations: + summary: "API 5xx 错误率过高" + description: "过去 5 分钟 5xx 占比超过 5%,请检查依赖或发布" + + - alert: HighP95Latency + expr: | + histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service_name)) + > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "API P95 延迟过高" + description: "过去 5 分钟 P95 超过 1s,请排查热点接口或依赖" + + - alert: InstanceDown + expr: up{job=~"admin-api|mini-api|user-api"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "实例不可达" + description: "Prometheus 抓取失败,实例处于 down 状态" diff --git a/deploy/prometheus/prometheus.yml b/deploy/prometheus/prometheus.yml new file mode 100644 index 0000000..3385f12 --- /dev/null +++ b/deploy/prometheus/prometheus.yml @@ -0,0 +1,28 @@ +global: + scrape_interval: 15s + evaluation_interval: 30s + +rule_files: + - alert.rules.yml + +scrape_configs: + - job_name: admin-api + metrics_path: /metrics + static_configs: + - targets: ["admin-api:8080"] + labels: + service: admin-api + + - job_name: mini-api + metrics_path: /metrics + static_configs: + - targets: ["mini-api:8080"] + labels: + service: mini-api + + - job_name: user-api + metrics_path: /metrics + static_configs: + - targets: ["user-api:8080"] + labels: + service: user-api diff --git a/src/Api/TakeoutSaaS.AdminApi/Program.cs b/src/Api/TakeoutSaaS.AdminApi/Program.cs index 0feed22..32b6036 100644 --- a/src/Api/TakeoutSaaS.AdminApi/Program.cs +++ b/src/Api/TakeoutSaaS.AdminApi/Program.cs @@ -27,6 +27,7 @@ using TakeoutSaaS.Shared.Web.Extensions; using TakeoutSaaS.Shared.Web.Swagger; var builder = WebApplication.CreateBuilder(args); +const string logTemplate = "[{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} {Level:u3}] [TraceId:{TraceId}] [SpanId:{SpanId}] [Service:{Service}] {SourceContext} {Message:lj}{NewLine}{Exception}"; builder.Configuration .AddJsonFile("appsettings.Seed.json", optional: true, reloadOnChange: true) @@ -37,12 +38,13 @@ builder.Host.UseSerilog((context, _, configuration) => configuration .Enrich.FromLogContext() .Enrich.WithProperty("Service", "AdminApi") - .WriteTo.Console() + .WriteTo.Console(outputTemplate: logTemplate) .WriteTo.File( "logs/admin-api-.log", rollingInterval: RollingInterval.Day, retainedFileCountLimit: 7, - shared: true); + shared: true, + outputTemplate: logTemplate); }); builder.Services.AddSharedWebCore(); @@ -68,6 +70,7 @@ builder.Services.AddSmsApplication(builder.Configuration); builder.Services.AddMessagingModule(builder.Configuration); builder.Services.AddMessagingApplication(); builder.Services.AddSchedulerModule(builder.Configuration); +builder.Services.AddHealthChecks(); var otelSection = builder.Configuration.GetSection("Otel"); var otelEndpoint = otelSection.GetValue("Endpoint"); var useConsoleExporter = otelSection.GetValue("UseConsoleExporter") ?? builder.Environment.IsDevelopment(); @@ -102,7 +105,8 @@ builder.Services.AddOpenTelemetry() metrics .AddAspNetCoreInstrumentation() .AddHttpClientInstrumentation() - .AddRuntimeInstrumentation(); + .AddRuntimeInstrumentation() + .AddPrometheusExporter(); if (!string.IsNullOrWhiteSpace(otelEndpoint)) { @@ -137,6 +141,8 @@ app.UseAuthorization(); app.UseSharedSwagger(); app.UseSchedulerDashboard(builder.Configuration); +app.MapHealthChecks("/healthz"); +app.MapPrometheusScrapingEndpoint(); app.MapControllers(); app.Run(); diff --git a/src/Api/TakeoutSaaS.AdminApi/TakeoutSaaS.AdminApi.csproj b/src/Api/TakeoutSaaS.AdminApi/TakeoutSaaS.AdminApi.csproj index e52a27b..57391fc 100644 Binary files a/src/Api/TakeoutSaaS.AdminApi/TakeoutSaaS.AdminApi.csproj and b/src/Api/TakeoutSaaS.AdminApi/TakeoutSaaS.AdminApi.csproj differ diff --git a/src/Api/TakeoutSaaS.MiniApi/Program.cs b/src/Api/TakeoutSaaS.MiniApi/Program.cs index 0ae228f..34d6a45 100644 --- a/src/Api/TakeoutSaaS.MiniApi/Program.cs +++ b/src/Api/TakeoutSaaS.MiniApi/Program.cs @@ -17,18 +17,20 @@ using TakeoutSaaS.Shared.Web.Extensions; using TakeoutSaaS.Shared.Web.Swagger; var builder = WebApplication.CreateBuilder(args); +const string logTemplate = "[{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} {Level:u3}] [TraceId:{TraceId}] [SpanId:{SpanId}] [Service:{Service}] {SourceContext} {Message:lj}{NewLine}{Exception}"; builder.Host.UseSerilog((_, _, configuration) => { configuration .Enrich.FromLogContext() .Enrich.WithProperty("Service", "MiniApi") - .WriteTo.Console() + .WriteTo.Console(outputTemplate: logTemplate) .WriteTo.File( "logs/mini-api-.log", rollingInterval: RollingInterval.Day, retainedFileCountLimit: 7, - shared: true); + shared: true, + outputTemplate: logTemplate); }); builder.Services.AddSharedWebCore(); @@ -45,6 +47,7 @@ builder.Services.AddSmsModule(builder.Configuration); builder.Services.AddSmsApplication(builder.Configuration); builder.Services.AddMessagingModule(builder.Configuration); builder.Services.AddMessagingApplication(); +builder.Services.AddHealthChecks(); var otelSection = builder.Configuration.GetSection("Otel"); var otelEndpoint = otelSection.GetValue("Endpoint"); var useConsoleExporter = otelSection.GetValue("UseConsoleExporter") ?? builder.Environment.IsDevelopment(); @@ -79,7 +82,8 @@ builder.Services.AddOpenTelemetry() metrics .AddAspNetCoreInstrumentation() .AddHttpClientInstrumentation() - .AddRuntimeInstrumentation(); + .AddRuntimeInstrumentation() + .AddPrometheusExporter(); if (!string.IsNullOrWhiteSpace(otelEndpoint)) { @@ -111,6 +115,8 @@ app.UseTenantResolution(); app.UseSharedWebCore(); app.UseSharedSwagger(); +app.MapHealthChecks("/healthz"); +app.MapPrometheusScrapingEndpoint(); app.MapControllers(); app.Run(); diff --git a/src/Api/TakeoutSaaS.MiniApi/TakeoutSaaS.MiniApi.csproj b/src/Api/TakeoutSaaS.MiniApi/TakeoutSaaS.MiniApi.csproj index c5e376a..0b5e0ba 100644 Binary files a/src/Api/TakeoutSaaS.MiniApi/TakeoutSaaS.MiniApi.csproj and b/src/Api/TakeoutSaaS.MiniApi/TakeoutSaaS.MiniApi.csproj differ diff --git a/src/Api/TakeoutSaaS.UserApi/Program.cs b/src/Api/TakeoutSaaS.UserApi/Program.cs index 7f76faa..98ed208 100644 --- a/src/Api/TakeoutSaaS.UserApi/Program.cs +++ b/src/Api/TakeoutSaaS.UserApi/Program.cs @@ -11,18 +11,20 @@ using TakeoutSaaS.Shared.Web.Extensions; using TakeoutSaaS.Shared.Web.Swagger; var builder = WebApplication.CreateBuilder(args); +const string logTemplate = "[{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} {Level:u3}] [TraceId:{TraceId}] [SpanId:{SpanId}] [Service:{Service}] {SourceContext} {Message:lj}{NewLine}{Exception}"; builder.Host.UseSerilog((_, _, configuration) => { configuration .Enrich.FromLogContext() .Enrich.WithProperty("Service", "UserApi") - .WriteTo.Console() + .WriteTo.Console(outputTemplate: logTemplate) .WriteTo.File( "logs/user-api-.log", rollingInterval: RollingInterval.Day, retainedFileCountLimit: 7, - shared: true); + shared: true, + outputTemplate: logTemplate); }); builder.Services.AddSharedWebCore(); @@ -33,6 +35,7 @@ builder.Services.AddSharedSwagger(options => options.EnableAuthorization = true; }); builder.Services.AddTenantResolution(builder.Configuration); +builder.Services.AddHealthChecks(); var otelSection = builder.Configuration.GetSection("Otel"); var otelEndpoint = otelSection.GetValue("Endpoint"); var useConsoleExporter = otelSection.GetValue("UseConsoleExporter") ?? builder.Environment.IsDevelopment(); @@ -67,7 +70,8 @@ builder.Services.AddOpenTelemetry() metrics .AddAspNetCoreInstrumentation() .AddHttpClientInstrumentation() - .AddRuntimeInstrumentation(); + .AddRuntimeInstrumentation() + .AddPrometheusExporter(); if (!string.IsNullOrWhiteSpace(otelEndpoint)) { @@ -99,6 +103,8 @@ app.UseTenantResolution(); app.UseSharedWebCore(); app.UseSharedSwagger(); +app.MapHealthChecks("/healthz"); +app.MapPrometheusScrapingEndpoint(); app.MapControllers(); app.Run(); diff --git a/src/Api/TakeoutSaaS.UserApi/TakeoutSaaS.UserApi.csproj b/src/Api/TakeoutSaaS.UserApi/TakeoutSaaS.UserApi.csproj index 08afe63..b5f3476 100644 Binary files a/src/Api/TakeoutSaaS.UserApi/TakeoutSaaS.UserApi.csproj and b/src/Api/TakeoutSaaS.UserApi/TakeoutSaaS.UserApi.csproj differ diff --git a/src/Core/TakeoutSaaS.Shared.Abstractions/Diagnostics/TraceContext.cs b/src/Core/TakeoutSaaS.Shared.Abstractions/Diagnostics/TraceContext.cs index 0715cf1..ad8aa43 100644 --- a/src/Core/TakeoutSaaS.Shared.Abstractions/Diagnostics/TraceContext.cs +++ b/src/Core/TakeoutSaaS.Shared.Abstractions/Diagnostics/TraceContext.cs @@ -3,11 +3,12 @@ using System.Threading; namespace TakeoutSaaS.Shared.Abstractions.Diagnostics; /// -/// 轻量级 TraceId 上下文,便于跨层访问当前请求的追踪标识。 +/// 轻量级 TraceId/SpanId 上下文,便于跨层访问当前请求的追踪标识。 /// public static class TraceContext { private static readonly AsyncLocal TraceIdHolder = new(); + private static readonly AsyncLocal SpanIdHolder = new(); /// /// 当前请求的 TraceId。 @@ -18,8 +19,21 @@ public static class TraceContext set => TraceIdHolder.Value = value; } + /// + /// 当前请求的 SpanId。 + /// + public static string? SpanId + { + get => SpanIdHolder.Value; + set => SpanIdHolder.Value = value; + } + /// /// 清理 TraceId,避免 AsyncLocal 污染其它请求。 /// - public static void Clear() => TraceIdHolder.Value = null; + public static void Clear() + { + TraceIdHolder.Value = null; + SpanIdHolder.Value = null; + } } diff --git a/src/Core/TakeoutSaaS.Shared.Web/Middleware/CorrelationIdMiddleware.cs b/src/Core/TakeoutSaaS.Shared.Web/Middleware/CorrelationIdMiddleware.cs index 07740f7..e1dadd9 100644 --- a/src/Core/TakeoutSaaS.Shared.Web/Middleware/CorrelationIdMiddleware.cs +++ b/src/Core/TakeoutSaaS.Shared.Web/Middleware/CorrelationIdMiddleware.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Diagnostics; using System.Threading.Tasks; using Microsoft.AspNetCore.Http; using Microsoft.Extensions.Logging; @@ -14,23 +15,43 @@ namespace TakeoutSaaS.Shared.Web.Middleware; public sealed class CorrelationIdMiddleware(RequestDelegate next, ILogger logger, IIdGenerator idGenerator) { private const string TraceHeader = "X-Trace-Id"; + private const string SpanHeader = "X-Span-Id"; private const string RequestHeader = "X-Request-Id"; public async Task InvokeAsync(HttpContext context) { - var traceId = ResolveTraceId(context); + var ownsActivity = Activity.Current is null; + var activity = Activity.Current ?? new Activity("TakeoutSaaS.Request"); + + if (activity.Id is null) + { + activity.SetIdFormat(ActivityIdFormat.W3C); + activity.Start(); + } + + var traceId = activity.TraceId.ToString(); + var spanId = activity.SpanId.ToString(); + + if (string.IsNullOrWhiteSpace(traceId)) + { + traceId = ResolveTraceId(context); + } + context.TraceIdentifier = traceId; TraceContext.TraceId = traceId; + TraceContext.SpanId = spanId; context.Response.OnStarting(() => { context.Response.Headers[TraceHeader] = traceId; + context.Response.Headers[SpanHeader] = spanId; return Task.CompletedTask; }); using (logger.BeginScope(new Dictionary { - ["TraceId"] = traceId + ["TraceId"] = traceId, + ["SpanId"] = spanId })) { try @@ -40,6 +61,10 @@ public sealed class CorrelationIdMiddleware(RequestDelegate next, ILogger {StatusCode} ({Elapsed} ms) TraceId:{TraceId}", + "HTTP {Method} {Path} => {StatusCode} ({Elapsed} ms) TraceId:{TraceId} SpanId:{SpanId}", context.Request.Method, context.Request.Path, context.Response.StatusCode, stopwatch.Elapsed.TotalMilliseconds, - traceId); + traceId, + spanId); } } }