Koa.js云原生可观测性实践与监控告警
引言
在云原生环境下,系统的可观测性是保障服务稳定运行的关键。通过日志、指标、追踪三位一体的可观测性体系,可以快速定位问题、洞察系统行为、预防潜在风险。
可观测性三大支柱
云原生应用的可观测性包含三个核心维度:
| 类型 | 工具 | 用途 |
|---|---|---|
| 日志 | ELK、Loki | 问题排查、审计追踪 |
| 指标 | Prometheus、Grafana | 性能监控、容量规划 |
| 追踪 | Jaeger、Zipkin | 请求链路、性能分析 |
结构化日志实现
在 Koa.js 中实现结构化日志:
// 结构化日志服务
const pino = require('pino');
/**
 * Structured logging service built on pino.
 *
 * Exposes a Koa request-logging middleware plus helpers for error and
 * business logs. The request id of the in-flight request is kept in an
 * AsyncLocalStorage store so that `business()` can attach it without the
 * caller having to pass `ctx` around.
 */
class LoggerService {
  /**
   * @param {object} [options] - Extra pino options; spread last, so they
   *   override the defaults set here.
   */
  constructor(options = {}) {
    this.logger = pino({
      level: process.env.LOG_LEVEL || 'info',
      formatters: {
        // Write the level label under the `level` key.
        level: (label) => ({ level: label }),
      },
      timestamp: () => `,"timestamp":"${new Date().toISOString()}"`,
      ...options,
    });
    // Per-request store; populated by the request logger middleware.
    const { AsyncLocalStorage } = require('async_hooks');
    this.requestContext = new AsyncLocalStorage();
    // Ready-to-use request logging middleware.
    this.requestLogger = this.createRequestLogger();
  }

  /**
   * Builds the Koa middleware that assigns a request id and logs one
   * structured entry per completed request.
   *
   * @returns {Function} Koa middleware `(ctx, next) => Promise<void>`.
   */
  createRequestLogger() {
    return async (ctx, next) => {
      const start = Date.now();
      // Reuse the caller-supplied id when present, otherwise generate one.
      const requestId = ctx.get('X-Request-ID') || this.generateId();
      ctx.state.requestId = requestId;
      // Run downstream middleware inside the store so getRequestId() can
      // resolve the id from any async continuation of this request.
      await this.requestContext.run({ requestId }, () => next());
      const duration = Date.now() - start;
      this.logger.info({
        type: 'request',
        requestId,
        method: ctx.method,
        url: ctx.url,
        status: ctx.status,
        duration,
        ip: ctx.ip,
        userAgent: ctx.get('user-agent'),
      }, 'HTTP Request');
    };
  }

  /**
   * Logs an application error.
   *
   * @param {Error} err - The error to log (its stack and message are recorded).
   * @param {object} [context] - Extra fields; merged last, so they override
   *   the fields set here.
   */
  error(err, context = {}) {
    this.logger.error({
      type: 'error',
      requestId: context.requestId,
      stack: err.stack,
      message: err.message,
      ...context,
    }, 'Application Error');
  }

  /**
   * Logs a business event tagged with the current request id.
   *
   * @param {string} operation - Operation name.
   * @param {object} [data] - Extra fields; merged last.
   */
  business(operation, data) {
    this.logger.info({
      type: 'business',
      operation,
      requestId: this.getRequestId(),
      ...data,
    }, `Business: ${operation}`);
  }

  /**
   * Generates a request id of the form `req_<epoch ms>_<random base36>`.
   * Not cryptographically secure; intended for log correlation only.
   *
   * @returns {string}
   */
  generateId() {
    return `req_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * Returns the id of the request currently being handled.
   *
   * @returns {string|undefined} The request id, or undefined when called
   *   outside of a request handled by the request logger middleware.
   */
  getRequestId() {
    const store = this.requestContext.getStore();
    return store ? store.requestId : undefined;
  }
}
// Register the request logger as Koa middleware.
const logger = new LoggerService();
app.use(logger.requestLogger);

// Global error handler: logs the failure and turns it into a JSON response.
app.use(async (ctx, next) => {
  try {
    await next();
  } catch (err) {
    const status = err.status || 500;
    logger.error(err, {
      requestId: ctx.state.requestId,
      url: ctx.url,
      method: ctx.method,
    });
    ctx.status = status;
    // Server-side failures may carry internal details (SQL, file paths, ...);
    // only client errors (4xx) have their message echoed back.
    ctx.body = { error: status >= 500 ? 'Internal Server Error' : err.message };
  }
});
Prometheus 指标采集
集成 Prometheus 实现应用指标监控:
// Prometheus 指标服务
const promClient = require('prom-client');
/**
 * Prometheus metrics service backed by prom-client.
 *
 * Owns a dedicated registry holding the default process metrics plus the
 * custom HTTP / business / gauge metrics defined in setupCustomMetrics().
 */
class MetricsService {
  constructor() {
    // Dedicated registry, so only metrics registered here are exported.
    this.register = new promClient.Registry();
    promClient.collectDefaultMetrics({ register: this.register });
    this.setupCustomMetrics();
  }

  /** Creates the custom metrics and registers them on this.register. */
  setupCustomMetrics() {
    // HTTP request counter.
    this.httpRequestsTotal = new promClient.Counter({
      name: 'http_requests_total',
      help: 'Total number of HTTP requests',
      labelNames: ['method', 'route', 'status'],
      registers: [this.register],
    });
    // HTTP request latency.
    this.httpRequestDuration = new promClient.Histogram({
      name: 'http_request_duration_seconds',
      help: 'Duration of HTTP requests in seconds',
      labelNames: ['method', 'route', 'status'],
      buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
      registers: [this.register],
    });
    // Business operation counter.
    this.businessOperations = new promClient.Counter({
      name: 'business_operations_total',
      help: 'Total number of business operations',
      labelNames: ['operation', 'status'],
      registers: [this.register],
    });
    // Active connection gauge.
    this.activeConnections = new promClient.Gauge({
      name: 'active_connections',
      help: 'Number of active connections',
      registers: [this.register],
    });
    // Queue length gauge.
    this.queueLength = new promClient.Gauge({
      name: 'queue_length',
      help: 'Current queue length',
      labelNames: ['queue_name'],
      registers: [this.register],
    });
  }

  /**
   * Builds the Koa middleware that records the request counter and the
   * latency histogram for every request, including ones that fail.
   *
   * @returns {Function} Koa middleware `(ctx, next) => Promise<void>`.
   */
  metricsMiddleware() {
    return async (ctx, next) => {
      const start = Date.now();
      let status;
      try {
        await next();
        status = ctx.status;
      } catch (err) {
        // The response status is not final yet when downstream throws; use
        // the status carried by the error, defaulting to 500.
        status = (err && err.status) || 500;
        throw err;
      } finally {
        const labels = {
          method: ctx.method,
          // Resolved after next(): the matched route pattern only exists once
          // the router has run. Using the pattern instead of the raw path
          // keeps label cardinality bounded.
          route: ctx._matchedRoute || ctx.routePath || ctx.path,
          status,
        };
        // Date.now() is in milliseconds; the histogram is in seconds.
        const duration = (Date.now() - start) / 1000;
        this.httpRequestsTotal.inc(labels);
        this.httpRequestDuration.observe(labels, duration);
      }
    };
  }

  /**
   * Increments the business operation counter.
   *
   * @param {string} operation - Value for the `operation` label.
   * @param {string} status - Value for the `status` label.
   */
  recordBusinessOperation(operation, status) {
    this.businessOperations.inc({ operation, status });
  }

  /**
   * Returns whatever `this.register.metrics()` produces (the payload served
   * to the scraper).
   */
  async getMetrics() {
    return this.register.metrics();
  }
}
// Metrics scrape endpoint.
const metricsService = new MetricsService();
// A Koa application has no `get()` routing method, so GET /metrics is
// matched by hand; every other request falls through to the next middleware.
app.use(async (ctx, next) => {
  if (ctx.method !== 'GET' || ctx.path !== '/metrics') {
    await next();
    return;
  }
  ctx.set('Content-Type', metricsService.register.contentType);
  ctx.body = await metricsService.getMetrics();
});
// Record request metrics for everything registered after this point.
app.use(metricsService.metricsMiddleware());
分布式追踪集成
使用 OpenTelemetry 实现分布式追踪:
// OpenTelemetry 追踪服务
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');
const { KoaInstrumentation } = require('@opentelemetry/instrumentation-koa');
const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express');
const { Resource } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');
/**
 * Distributed tracing service wrapping the OpenTelemetry Node SDK.
 */
class TracingService {
  /**
   * @param {string} serviceName - Reported as the service name resource attribute.
   */
  constructor(serviceName) {
    this.sdk = new NodeSDK({
      resource: new Resource({
        [SemanticResourceAttributes.SERVICE_NAME]: serviceName,
        [SemanticResourceAttributes.SERVICE_VERSION]: process.env.APP_VERSION || '1.0.0',
      }),
      instrumentations: [
        new HttpInstrumentation(),
        new KoaInstrumentation(),
        new ExpressInstrumentation(),
      ],
      // NodeSDK reads the span exporter from `traceExporter`.
      traceExporter: new JaegerExporter({
        endpoint: process.env.JAEGER_ENDPOINT || 'http://localhost:14268/api/traces',
      }),
    });
  }

  /** Starts the SDK and shuts it down when the process receives SIGTERM. */
  start() {
    this.sdk.start();
    // Graceful shutdown; a failed shutdown is logged rather than thrown.
    process.on('SIGTERM', () => {
      this.sdk.shutdown().catch(console.error);
    });
  }

  /**
   * Returns the span context of the currently active span.
   *
   * @returns {object|undefined} The active span context, or undefined when
   *   no span is active.
   */
  getSpanContext() {
    const { context, trace } = require('@opentelemetry/api');
    // getSpanContext() needs the context to inspect; use the active one.
    return trace.getSpanContext(context.active());
  }

  /**
   * Starts a custom span on the 'koa-app' tracer. The caller must end it.
   *
   * @param {string} name - Span name.
   * @param {object} [attributes] - Initial span attributes.
   * @returns {object} The started span.
   */
  createSpan(name, attributes = {}) {
    const tracer = require('@opentelemetry/api').trace.getTracer('koa-app');
    return tracer.startSpan(name, { attributes });
  }
}
// 追踪中间件
/**
 * Koa middleware factory that wraps each request in a custom span.
 */
class TracingMiddleware {
  /**
   * @param {object} tracingService - Must provide `createSpan(name, attributes)`
   *   returning a span with `setAttribute()` and `end()`.
   */
  constructor(tracingService) {
    this.tracing = tracingService;
  }

  /**
   * @returns {Function} Koa middleware `(ctx, next) => Promise<void>`.
   */
  middleware() {
    return async (ctx, next) => {
      const span = this.tracing.createSpan(`${ctx.method} ${ctx.path}`, {
        'http.method': ctx.method,
        'http.url': ctx.url,
      });
      try {
        await next();
        span.setAttribute('http.status_code', ctx.status);
      } catch (error) {
        span.setAttribute('error', true);
        span.setAttribute('error.message', error.message);
        throw error;
      } finally {
        // The matched route pattern is only known once routing has run, so it
        // is attached here rather than at span creation.
        span.setAttribute('http.route', ctx._matchedRoute || ctx.routePath || ctx.path);
        // Always end the span, including on failure.
        span.end();
      }
    };
  }
}
告警通知系统
构建多渠道告警通知:
// 告警服务
/**
 * Multi-channel alerting service.
 *
 * Evaluates a fixed set of rules against a metrics snapshot and sends an
 * alert through each channel configured on every rule that fires.
 */
class AlertService {
  constructor() {
    // Channel name -> sender; each sender is called as `send(alert)`.
    this.channels = {
      email: new EmailChannel(),
      webhook: new WebhookChannel(),
      sms: new SMSChannel(),
      dingtalk: new DingtalkChannel(),
    };
    this.rules = this.loadAlertRules();
  }

  /**
   * Returns the built-in alert rules.
   *
   * @returns {Array<object>} Rules with `name`, `condition(metrics)`,
   *   `severity`, `channels` and `message`.
   */
  loadAlertRules() {
    return [
      {
        name: 'high_error_rate',
        condition: (metrics) => metrics.errorRate > 0.05,
        severity: 'critical',
        channels: ['email', 'dingtalk'],
        message: '错误率超过5%',
      },
      {
        name: 'high_latency',
        // NOTE(review): the message implies p99Latency is in milliseconds — confirm.
        condition: (metrics) => metrics.p99Latency > 2000,
        severity: 'warning',
        channels: ['webhook', 'dingtalk'],
        message: 'P99延迟超过2秒',
      },
      {
        name: 'low_availability',
        condition: (metrics) => metrics.availability < 0.99,
        severity: 'critical',
        channels: ['email', 'sms', 'dingtalk'],
        message: '可用性低于99%',
      },
      {
        name: 'high_memory',
        condition: (metrics) => metrics.memoryUsage > 0.85,
        severity: 'warning',
        channels: ['dingtalk'],
        message: '内存使用率超过85%',
      },
    ];
  }

  /**
   * Evaluates every rule and sends an alert for each one that fires.
   *
   * @param {object} metrics - Snapshot read by the rule conditions
   *   (`errorRate`, `p99Latency`, `availability`, `memoryUsage`).
   */
  async checkAlerts(metrics) {
    for (const rule of this.rules) {
      if (rule.condition(metrics)) {
        await this.sendAlert(rule, metrics);
      }
    }
  }

  /**
   * Sends one alert through all channels of the rule.
   *
   * Channels are notified concurrently so a slow channel does not delay the
   * others. A failing or unknown channel is logged and never rejects.
   *
   * @param {object} rule - The rule that fired.
   * @param {object} metrics - The metrics snapshot, attached to the alert.
   */
  async sendAlert(rule, metrics) {
    const alert = {
      name: rule.name,
      severity: rule.severity,
      message: rule.message,
      timestamp: new Date().toISOString(),
      metrics,
      environment: process.env.NODE_ENV,
    };
    await Promise.all(rule.channels.map(async (channelName) => {
      const channel = this.channels[channelName];
      if (!channel) {
        // A misconfigured rule should be visible, not silently ignored.
        console.error(`Unknown alert channel "${channelName}" in rule ${rule.name}`);
        return;
      }
      try {
        await channel.send(alert);
      } catch (error) {
        console.error(`Failed to send alert via ${channelName}:`, error);
      }
    }));
  }
}
// Periodic alert check.
const alertService = new AlertService();
// Run once per minute.
setInterval(async () => {
  try {
    const metrics = await collectCurrentMetrics();
    await alertService.checkAlerts(metrics);
  } catch (error) {
    // Nothing awaits a timer callback, so a rejection here would surface as
    // an unhandled rejection; log it and let the next tick retry.
    console.error('Alert check failed:', error);
  }
}, 60000);
Grafana 仪表盘配置
可观测性仪表盘配置示例:
# Grafana Dashboard JSON
{
"dashboard": {
"title": "Koa.js Application Overview",
"panels": [
{
"title": "Requests Per Second",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{route}} - {{status}}"
}
]
},
{
"title": "Response Time (P50/P95/P99)",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "P50"
},
{
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "P95"
},
{
"expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "P99"
}
]
},
{
"title": "Error Rate",
"type": "stat",
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~'5..'}[5m])) / sum(rate(http_requests_total[5m])) * 100",
"legendFormat": "Error Rate %"
}
]
},
{
"title": "Active Connections",
"type": "graph",
"targets": [
{
"expr": "active_connections",
"legendFormat": "Connections"
}
]
},
{
"title": "Business Operations",
"type": "graph",
"targets": [
{
"expr": "rate(business_operations_total[5m])",
"legendFormat": "{{operation}} - {{status}}"
}
]
}
]
}
}
总结
Koa.js 云原生可观测性实践的核心价值:
- 快速定位:结构化日志和分布式追踪快速定位问题根因
- 性能优化:指标监控帮助识别性能瓶颈
- 主动预防:告警系统提前发现潜在问题
- 容量规划:历史数据支持容量规划和决策
- 服务保障:全链路可观测性提升服务可靠性
通过构建完善的可观测性体系,可以实现从被动响应到主动预防的转变,大幅提升系统的稳定性和运维效率。