KOA技术分享

专注 Koa.js 框架的编程知识分享

Koa.js云原生可观测性实践与监控告警

引言

在云原生环境下,系统的可观测性是保障服务稳定运行的关键。通过日志、指标、追踪三位一体的可观测性体系,可以快速定位问题、洞察系统行为、预防潜在风险。

可观测性三大支柱

云原生应用的可观测性包含三个核心维度:

类型 工具 用途
日志 ELK、Loki 问题排查、审计追踪
指标 Prometheus、Grafana 性能监控、容量规划
追踪 Jaeger、Zipkin 请求链路、性能分析

结构化日志实现

在 Koa.js 中实现结构化日志:

// 结构化日志服务
const pino = require('pino');

class LoggerService {
  constructor(options = {}) {
    this.logger = pino({
      level: process.env.LOG_LEVEL || 'info',
      formatters: {
        level: (label) => ({ level: label })
      },
      timestamp: () => `,"timestamp":"${new Date().toISOString()}"`,
      ...options
    });

    // 请求日志中间件
    this.requestLogger = this.createRequestLogger();
  }

  // 请求日志中间件
  createRequestLogger() {
    return async (ctx, next) => {
      const start = Date.now();
      const requestId = ctx.get('X-Request-ID') || this.generateId();

      ctx.state.requestId = requestId;

      // 等待请求完成
      await next();

      const duration = Date.now() - start;

      // 结构化日志输出
      this.logger.info({
        type: 'request',
        requestId,
        method: ctx.method,
        url: ctx.url,
        status: ctx.status,
        duration,
        ip: ctx.ip,
        userAgent: ctx.get('user-agent')
      }, 'HTTP Request');
    };
  }

  // 错误日志
  error(err, context = {}) {
    this.logger.error({
      type: 'error',
      requestId: context.requestId,
      stack: err.stack,
      message: err.message,
      ...context
    }, 'Application Error');
  }

  // 业务日志
  business(operation, data) {
    this.logger.info({
      type: 'business',
      operation,
      requestId: this.getRequestId(),
      ...data
    }, `Business: ${operation}`);
  }

  generateId() {
    return `req_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
  }

  getRequestId() {
    return require('async_hooks').executionAsyncId();
  }
}

// Koa 中间件使用
const logger = new LoggerService();

app.use(logger.requestLogger);

// 全局错误处理
app.use(async (ctx, next) => {
  try {
    await next();
  } catch (err) {
    logger.error(err, {
      requestId: ctx.state.requestId,
      url: ctx.url,
      method: ctx.method
    });
    ctx.status = err.status || 500;
    ctx.body = { error: err.message };
  }
});

Prometheus 指标采集

集成 Prometheus 实现应用指标监控:

// Prometheus 指标服务
const promClient = require('prom-client');

class MetricsService {
  constructor() {
    // 创建注册表
    this.register = new promClient.Registry();

    // 添加默认指标
    promClient.collectDefaultMetrics({ register: this.register });

    // 自定义指标
    this.setupCustomMetrics();
  }

  setupCustomMetrics() {
    // HTTP 请求计数器
    this.httpRequestsTotal = new promClient.Counter({
      name: 'http_requests_total',
      help: 'Total number of HTTP requests',
      labelNames: ['method', 'route', 'status'],
      registers: [this.register]
    });

    // HTTP 请求延迟
    this.httpRequestDuration = new promClient.Histogram({
      name: 'http_request_duration_seconds',
      help: 'Duration of HTTP requests in seconds',
      labelNames: ['method', 'route', 'status'],
      buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
      registers: [this.register]
    });

    // 业务指标
    this.businessOperations = new promClient.Counter({
      name: 'business_operations_total',
      help: 'Total number of business operations',
      labelNames: ['operation', 'status'],
      registers: [this.register]
    });

    // 活跃连接数
    this.activeConnections = new promClient.Gauge({
      name: 'active_connections',
      help: 'Number of active connections',
      registers: [this.register]
    });

    // 队列长度
    this.queueLength = new promClient.Gauge({
      name: 'queue_length',
      help: 'Current queue length',
      labelNames: ['queue_name'],
      registers: [this.register]
    });
  }

  // 指标中间件
  metricsMiddleware() {
    return async (ctx, next) => {
      const start = Date.now();
      const route = ctx.routePath || ctx.path;

      await next();

      const duration = (Date.now() - start) / 1000;

      // 记录指标
      this.httpRequestsTotal.inc({
        method: ctx.method,
        route: route,
        status: ctx.status
      });

      this.httpRequestDuration.observe({
        method: ctx.method,
        route: route,
        status: ctx.status
      }, duration);
    };
  }

  // 记录业务指标
  recordBusinessOperation(operation, status) {
    this.businessOperations.inc({ operation, status });
  }

  // 获取指标数据
  async getMetrics() {
    return this.register.metrics();
  }
}

// 指标自动采集端点
const metricsService = new MetricsService();

app.get('/metrics', async (ctx) => {
  ctx.set('Content-Type', metricsService.register.contentType);
  ctx.body = await metricsService.getMetrics();
});

// 使用中间件
app.use(metricsService.metricsMiddleware());

分布式追踪集成

使用 OpenTelemetry 实现分布式追踪:

// OpenTelemetry 追踪服务
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');
const { KoaInstrumentation } = require('@opentelemetry/instrumentation-koa');
const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express');
const { resource from } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');

class TracingService {
  constructor(serviceName) {
    this.sdk = new NodeSDK({
      resource: new resource({
        [SemanticResourceAttributes.SERVICE_NAME]: serviceName,
        [SemanticResourceAttributes.SERVICE_VERSION]: process.env.APP_VERSION || '1.0.0'
      }),
      instrumentations: [
        new HttpInstrumentation(),
        new KoaInstrumentation(),
        new ExpressInstrumentation()
      ],
      exporter: new JaegerExporter({
        endpoint: process.env.JAEGER_ENDPOINT || 'http://localhost:14268/api/traces'
      })
    });
  }

  start() {
    this.sdk.start();

    // 优雅关闭
    process.on('SIGTERM', () => {
      this.sdk.shutdown().catch(console.error);
    });
  }

  // 获取当前追踪上下文
  getSpanContext() {
    const { trace } = require('@opentelemetry/api');
    return trace.getSpanContext();
  }

  // 创建自定义 span
  createSpan(name, attributes = {}) {
    const tracer = require('@opentelemetry/api').trace.getTracer('koa-app');
    return tracer.startSpan(name, { attributes });
  }
}

// 追踪中间件
class TracingMiddleware {
  constructor(tracingService) {
    this.tracing = tracingService;
  }

  middleware() {
    return async (ctx, next) => {
      // 确保有追踪上下文
      const parentSpan = this.tracing.getSpanContext();
      const span = this.tracing.createSpan(`${ctx.method} ${ctx.path}`, {
        'http.method': ctx.method,
        'http.url': ctx.url,
        'http.route': ctx.routePath || ctx.path
      });

      try {
        await next();
        span.setAttribute('http.status_code', ctx.status);
      } catch (error) {
        span.setAttribute('error', true);
        span.setAttribute('error.message', error.message);
        throw error;
      } finally {
        span.end();
      }
    };
  }
}

告警通知系统

构建多渠道告警通知:

// 告警服务
class AlertService {
  constructor() {
    this.channels = {
      email: new EmailChannel(),
      webhook: new WebhookChannel(),
      sms: new SMSChannel(),
      dingtalk: new DingtalkChannel()
    };

    this.rules = this.loadAlertRules();
  }

  // 告警规则配置
  loadAlertRules() {
    return [
      {
        name: 'high_error_rate',
        condition: (metrics) => metrics.errorRate > 0.05,
        severity: 'critical',
        channels: ['email', 'dingtalk'],
        message: '错误率超过5%'
      },
      {
        name: 'high_latency',
        condition: (metrics) => metrics.p99Latency > 2000,
        severity: 'warning',
        channels: ['webhook', 'dingtalk'],
        message: 'P99延迟超过2秒'
      },
      {
        name: 'low_availability',
        condition: (metrics) => metrics.availability < 0.99,
        severity: 'critical',
        channels: ['email', 'sms', 'dingtalk'],
        message: '可用性低于99%'
      },
      {
        name: 'high_memory',
        condition: (metrics) => metrics.memoryUsage > 0.85,
        severity: 'warning',
        channels: ['dingtalk'],
        message: '内存使用率超过85%'
      }
    ];
  }

  // 检查告警
  async checkAlerts(metrics) {
    for (const rule of this.rules) {
      if (rule.condition(metrics)) {
        await this.sendAlert(rule, metrics);
      }
    }
  }

  // 发送告警
  async sendAlert(rule, metrics) {
    const alert = {
      name: rule.name,
      severity: rule.severity,
      message: rule.message,
      timestamp: new Date().toISOString(),
      metrics: metrics,
      environment: process.env.NODE_ENV
    };

    // 发送到各个渠道
    for (const channelName of rule.channels) {
      const channel = this.channels[channelName];
      if (channel) {
        try {
          await channel.send(alert);
        } catch (error) {
          console.error(`Failed to send alert via ${channelName}:`, error);
        }
      }
    }
  }
}

// 告警检查定时任务
const alertService = new AlertService();

// 每分钟检查一次
setInterval(async () => {
  const metrics = await collectCurrentMetrics();
  await alertService.checkAlerts(metrics);
}, 60000);

Grafana 仪表盘配置

可观测性仪表盘配置示例:

# Grafana Dashboard JSON
{
  "dashboard": {
    "title": "Koa.js Application Overview",
    "panels": [
      {
        "title": "Requests Per Second",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{method}} {{route}} - {{status}}"
          }
        ]
      },
      {
        "title": "Response Time (P50/P95/P99)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "P50"
          },
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "P95"
          },
          {
            "expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "P99"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(http_requests_total{status=~'5..'}[5m]) / rate(http_requests_total[5m]) * 100",
            "legendFormat": "Error Rate %"
          }
        ]
      },
      {
        "title": "Active Connections",
        "type": "graph",
        "targets": [
          {
            "expr": "active_connections",
            "legendFormat": "Connections"
          }
        ]
      },
      {
        "title": "Business Operations",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(business_operations_total[5m])",
            "legendFormat": "{{operation}} - {{status}}"
          }
        ]
      }
    ]
  }
}

总结

Koa.js 云原生可观测性实践的核心价值:

通过构建完善的的可观测性体系,可以实现从被动响应到主动预防的转变,大幅提升系统的稳定性和运维效率。

← 下一篇:Koa.js微服务架构与容器化部署实战