从零实现一个全链路监控平台:Metrics与Alerting
2026/6/27 1:27:34 网站建设 项目流程

前言

你有没有想过:当系统出现故障时,你怎么知道是哪个服务、哪个接口出了问题?CPU飙高、内存泄漏、接口变慢——这些怎么才能提前发现?

全链路监控平台建立在可观测性的三大支柱(Metrics、Logging、Tracing)之上,本文聚焦其中的 Metrics 与告警。今天我们从零实现:

· 指标采集(Counter、Gauge、Histogram)
· 指标聚合与存储
· 告警规则引擎
· 告警通知
· 可视化Dashboard

---

一、监控平台核心原理

1. 架构图

```
┌──────────────────────────────────────────────────────────┐
│                         应用服务                         │
│        ┌──────────┐   ┌──────────┐   ┌──────────┐        │
│        │ 指标采集 │ → │ 指标聚合 │ → │ 指标上报 │        │
│        └──────────┘   └──────────┘   └──────────┘        │
└──────────────────────────────────────────────────────────┘
                              │
                              ▼
┌──────────────────────────────────────────────────────────┐
│                         监控中心                         │
│   ┌──────────────┐  ┌──────────────┐  ┌──────────────┐   │
│   │   指标存储   │  │   规则引擎   │  │   告警通知   │   │
│   │  (时序数据)  │  │  (阈值判断)  │  │ (钉钉/邮件)  │   │
│   └──────────────┘  └──────────────┘  └──────────────┘   │
└──────────────────────────────────────────────────────────┘
                              │
                              ▼
                       ┌─────────────┐
                       │  Dashboard  │
                       │  (可视化)   │
                       └─────────────┘
```

2. 核心概念

| 概念 | 说明 | 示例 |
| --- | --- | --- |
| Counter | 只增不减的计数器 | 请求总数 |
| Gauge | 可增可减的测量值 | CPU使用率、内存使用量 |
| Histogram | 分布统计 | 请求延迟(P50/P95/P99) |
| Label | 维度标签 | 服务名、接口名、状态码 |

---

二、完整代码实现

1. 基础数据结构

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
#include <errno.h>
#include <math.h>

// Capacity limits for the fixed-size buffers in the structures below.
// Size in bytes of a metric name buffer (including the terminating NUL).
#define MAX_METRIC_NAME 128
// Number of label slots stored per metric.
#define MAX_LABELS 8
// Size in bytes of a label key buffer.
#define MAX_LABEL_KEY 32
// Size in bytes of a label value buffer.
#define MAX_LABEL_VALUE 64
// Number of bucket slots available in a histogram metric.
#define MAX_HISTOGRAM_BUCKETS 20
// NOTE(review): not referenced by any code visible in this file.
#define MAX_ALERTS 100

/*
 * Kind of a stored metric series.
 *
 * The numeric values are spelled out so they stay stable if enumerators are
 * ever reordered. METRIC_SUMMARY is declared, but no function visible in
 * this file creates or updates a summary series.
 */
typedef enum {
    METRIC_COUNTER   = 0,
    METRIC_GAUGE     = 1,
    METRIC_HISTOGRAM = 2,
    METRIC_SUMMARY   = 3
} metric_type_t;

// One key/value label (dimension) attached to a metric series.
// Both fields are fixed-size character buffers that the rest of the file
// handles with strcpy/strcmp, i.e. as NUL-terminated strings.
typedef struct label {
char key[MAX_LABEL_KEY];
char value[MAX_LABEL_VALUE];
} label_t;

// One metric series: a (type, name, label set) combination and its current
// state. Series are chained into a singly linked list through `next`.
typedef struct metric_value {
// Which kind of metric this series is.
metric_type_t type;
// Metric name, NUL-terminated.
char name[MAX_METRIC_NAME];
// Label set; the first `label_count` entries are meaningful.
label_t labels[MAX_LABELS];
int label_count;
// Current value: accumulated by metric_counter_add, overwritten by
// metric_gauge_set. metric_histogram_observe does not write it.
double value;
// Histogram only: sum of all observed values.
double sum;
// Histogram only: number of observations.
double count;
// Histogram only: per-bucket observation counts.
double buckets[MAX_HISTOGRAM_BUCKETS];
// Histogram only: upper bound of each bucket.
double bucket_upper[MAX_HISTOGRAM_BUCKETS];
// Histogram only: number of entries in use in the two arrays above.
int bucket_count;
// Set to time(NULL) whenever the series is created or updated.
time_t timestamp;
// Next series in the platform's list, or NULL.
struct metric_value *next;
} metric_value_t;

// One alerting rule: compare a metric's value against a threshold.
// Rules are chained into a singly linked list through `next`.
typedef struct alert_rule {
// Rule name; alert events refer back to the rule by this name.
char name[64];
// Name of the metric this rule is evaluated against.
char metric_name[MAX_METRIC_NAME];
char condition[16]; // ">", "<", ">=", "<=", "=="
// Value the metric is compared with.
double threshold;
// NOTE(review): stored by monitor_add_alert_rule but not read by
// monitor_evaluate_alerts in this file.
int for_seconds; // how long the condition should hold
char severity[16]; // "critical", "warning", "info"
// Human-readable text used as the prefix of the alert event message.
char message[256];
// Next rule in the platform's list, or NULL.
struct alert_rule *next;
} alert_rule_t;

// One alert occurrence produced by monitor_evaluate_alerts.
// Events are chained into a singly linked list through `next`.
typedef struct alert_event {
// Name of the rule that raised this event.
char rule_name[64];
// Name of the metric the rule was evaluated against.
char metric_name[MAX_METRIC_NAME];
// Severity copied from the rule.
char severity[16];
// Rule message followed by the metric value at the time of firing.
char message[256];
// Metric value observed when the event was created.
double current_value;
// Time the event was created.
time_t start_time;
// Time the event was resolved; 0 while the event is still active.
time_t end_time;
// 1 while the alert is firing, 0 once it has been resolved.
int active;
// Next event in the platform's list, or NULL.
struct alert_event *next;
} alert_event_t;

// Top-level state of the monitoring platform.
// The metric, rule and event functions in this file lock `mutex` while
// they walk or modify the three lists.
typedef struct monitor_platform {
// Head of the metric series list.
metric_value_t *metrics;
// Head of the alert rule list.
alert_rule_t *alert_rules;
// Head of the alert event list.
alert_event_t *alert_events;
pthread_mutex_t mutex;
// Set to 7 by monitor_create.
// NOTE(review): not read by any code visible in this file.
int retention_days;
// Loop flag polled by monitor_eval_thread; 0 asks the thread to stop.
int running;
} monitor_platform_t;
```

2. 指标采集

```c
/**
 * Allocate and initialise a monitoring platform.
 *
 * The new platform starts with empty metric, rule and event lists,
 * retention_days set to 7 and running set to 1.
 *
 * @return the new platform, owned by the caller, or NULL when memory
 *         allocation or mutex initialisation fails.
 */
monitor_platform_t *monitor_create(void) {
    /* calloc zero-fills the object, so every list head starts out NULL. */
    monitor_platform_t *mp = calloc(1, sizeof *mp);
    if (mp == NULL) {
        fprintf(stderr, "monitor_create: out of memory\n");
        return NULL;
    }

    mp->retention_days = 7;
    mp->running = 1;

    /* pthread functions return the error number instead of setting errno. */
    int rc = pthread_mutex_init(&mp->mutex, NULL);
    if (rc != 0) {
        fprintf(stderr, "monitor_create: pthread_mutex_init failed: %s\n",
                strerror(rc));
        free(mp);
        return NULL;
    }

    printf("监控平台启动\n");
    return mp;
}

// 创建Counter
void metric_counter_add(monitor_platform_t *mp, const char *name,
label_t *labels, int label_count, double delta) {
pthread_mutex_lock(&mp->mutex);

metric_value_t *m = mp->metrics;
while (m) {
if (strcmp(m->name, name) == 0 && m->type == METRIC_COUNTER) {
// 检查标签匹配
int match = 1;
if (m->label_count == label_count) {
for (int i = 0; i < label_count; i++) {
if (strcmp(m->labels[i].key, labels[i].key) != 0 ||
strcmp(m->labels[i].value, labels[i].value) != 0) {
match = 0;
break;
}
}
} else {
match = 0;
}
if (match) {
m->value += delta;
m->timestamp = time(NULL);
pthread_mutex_unlock(&mp->mutex);
return;
}
}
m = m->next;
}

// 创建新指标
m = malloc(sizeof(metric_value_t));
m->type = METRIC_COUNTER;
strcpy(m->name, name);
m->label_count = label_count;
for (int i = 0; i < label_count && i < MAX_LABELS; i++) {
strcpy(m->labels[i].key, labels[i].key);
strcpy(m->labels[i].value, labels[i].value);
}
m->value = delta;
m->timestamp = time(NULL);
m->next = mp->metrics;
mp->metrics = m;

pthread_mutex_unlock(&mp->mutex);
}

// 设置Gauge
void metric_gauge_set(monitor_platform_t *mp, const char *name,
label_t *labels, int label_count, double value) {
pthread_mutex_lock(&mp->mutex);

metric_value_t *m = mp->metrics;
while (m) {
if (strcmp(m->name, name) == 0 && m->type == METRIC_GAUGE) {
int match = 1;
if (m->label_count == label_count) {
for (int i = 0; i < label_count; i++) {
if (strcmp(m->labels[i].key, labels[i].key) != 0 ||
strcmp(m->labels[i].value, labels[i].value) != 0) {
match = 0;
break;
}
}
} else {
match = 0;
}
if (match) {
m->value = value;
m->timestamp = time(NULL);
pthread_mutex_unlock(&mp->mutex);
return;
}
}
m = m->next;
}

m = malloc(sizeof(metric_value_t));
m->type = METRIC_GAUGE;
strcpy(m->name, name);
m->label_count = label_count;
for (int i = 0; i < label_count && i < MAX_LABELS; i++) {
strcpy(m->labels[i].key, labels[i].key);
strcpy(m->labels[i].value, labels[i].value);
}
m->value = value;
m->timestamp = time(NULL);
m->next = mp->metrics;
mp->metrics = m;

pthread_mutex_unlock(&mp->mutex);
}

// 记录Histogram观测值
void metric_histogram_observe(monitor_platform_t *mp, const char *name,
label_t *labels, int label_count, double value) {
pthread_mutex_lock(&mp->mutex);

metric_value_t *m = mp->metrics;
while (m) {
if (strcmp(m->name, name) == 0 && m->type == METRIC_HISTOGRAM) {
int match = 1;
if (m->label_count == label_count) {
for (int i = 0; i < label_count; i++) {
if (strcmp(m->labels[i].key, labels[i].key) != 0 ||
strcmp(m->labels[i].value, labels[i].value) != 0) {
match = 0;
break;
}
}
} else {
match = 0;
}
if (match) {
m->sum += value;
m->count++;
// 分配桶
for (int i = 0; i < m->bucket_count; i++) {
if (value <= m->bucket_upper[i]) {
m->buckets[i]++;
break;
}
}
m->timestamp = time(NULL);
pthread_mutex_unlock(&mp->mutex);
return;
}
}
m = m->next;
}

m = malloc(sizeof(metric_value_t));
m->type = METRIC_HISTOGRAM;
strcpy(m->name, name);
m->label_count = label_count;
for (int i = 0; i < label_count && i < MAX_LABELS; i++) {
strcpy(m->labels[i].key, labels[i].key);
strcpy(m->labels[i].value, labels[i].value);
}
m->sum = value;
m->count = 1;
// 默认桶分布:0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10
double default_buckets[] = {0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10};
m->bucket_count = 11;
for (int i = 0; i < m->bucket_count; i++) {
m->bucket_upper[i] = default_buckets[i];
m->buckets[i] = (value <= default_buckets[i]) ? 1 : 0;
}
m->timestamp = time(NULL);
m->next = mp->metrics;
mp->metrics = m;

pthread_mutex_unlock(&mp->mutex);
}
```

3. 告警规则引擎

```c
/**
 * Register an alert rule on the platform.
 *
 * @param mp          platform to add the rule to.
 * @param name        rule name; alert events are matched to rules by it.
 * @param metric_name name of the metric the rule watches.
 * @param condition   comparison operator: ">", "<", ">=", "<=" or "==".
 * @param threshold   value the metric is compared with.
 * @param for_seconds duration stored with the rule.
 * @param severity    "critical", "warning" or "info".
 * @param message     human-readable description of the alert.
 *
 * Strings longer than the corresponding alert_rule_t buffer are truncated
 * rather than written past its end. The rule is not added when any pointer
 * argument is NULL or when memory cannot be allocated.
 */
void monitor_add_alert_rule(monitor_platform_t *mp, const char *name,
                            const char *metric_name, const char *condition,
                            double threshold, int for_seconds,
                            const char *severity, const char *message) {
    if (mp == NULL || name == NULL || metric_name == NULL ||
        condition == NULL || severity == NULL || message == NULL) {
        fprintf(stderr, "monitor_add_alert_rule: NULL argument, rule ignored\n");
        return;
    }

    /* Build the rule completely before publishing it under the lock. */
    alert_rule_t *rule = calloc(1, sizeof *rule);
    if (rule == NULL) {
        fprintf(stderr, "monitor_add_alert_rule: out of memory, rule '%s' ignored\n",
                name);
        return;
    }
    snprintf(rule->name, sizeof rule->name, "%s", name);
    snprintf(rule->metric_name, sizeof rule->metric_name, "%s", metric_name);
    snprintf(rule->condition, sizeof rule->condition, "%s", condition);
    rule->threshold = threshold;
    rule->for_seconds = for_seconds;
    snprintf(rule->severity, sizeof rule->severity, "%s", severity);
    snprintf(rule->message, sizeof rule->message, "%s", message);

    pthread_mutex_lock(&mp->mutex);
    rule->next = mp->alert_rules;
    mp->alert_rules = rule;
    pthread_mutex_unlock(&mp->mutex);

    printf("[告警] 添加规则: %s (%s %s %.2f)\n",
           name, metric_name, condition, threshold);
}

/**
 * Evaluate `value <condition> threshold`.
 *
 * @param value     left-hand operand.
 * @param condition operator string: ">", "<", ">=", "<=" or "==".
 * @param threshold right-hand operand.
 * @return non-zero when the comparison holds, 0 when it does not or when
 *         the operator is not one of the five listed. "==" is an
 *         approximate comparison with an absolute tolerance of 0.0001.
 */
int check_condition(double value, const char *condition, double threshold) {
    const char op = condition[0];
    /* Exactly one character, e.g. ">". */
    const int single = (op != '\0' && condition[1] == '\0');
    /* Exactly two characters with '=' second, e.g. ">=". */
    const int with_eq =
        (op != '\0' && condition[1] == '=' && condition[2] == '\0');

    switch (op) {
    case '>':
        if (single) {
            return value > threshold;
        }
        if (with_eq) {
            return value >= threshold;
        }
        break;
    case '<':
        if (single) {
            return value < threshold;
        }
        if (with_eq) {
            return value <= threshold;
        }
        break;
    case '=':
        if (with_eq) {
            return fabs(value - threshold) < 0.0001;
        }
        break;
    default:
        break;
    }
    return 0;
}

// 评估告警规则
void monitor_evaluate_alerts(monitor_platform_t *mp) {
pthread_mutex_lock(&mp->mutex);

time_t now = time(NULL);
alert_rule_t *rule = mp->alert_rules;

while (rule) {
// 查找对应的指标
metric_value_t *m = mp->metrics;
while (m) {
if (strcmp(m->name, rule->metric_name) == 0) {
int triggered = check_condition(m->value, rule->condition, rule->threshold);

if (triggered) {
// 检查是否已存在告警
alert_event_t *evt = mp->alert_events;
int found = 0;
while (evt) {
if (strcmp(evt->rule_name, rule->name) == 0 && evt->active) {
found = 1;
break;
}
evt = evt->next;
}

if (!found) {
// 创建新告警
alert_event_t *new_evt = malloc(sizeof(alert_event_t));
strcpy(new_evt->rule_name, rule->name);
strcpy(new_evt->metric_name, rule->metric_name);
strcpy(new_evt->severity, rule->severity);
snprintf(new_evt->message, sizeof(new_evt->message),
"%s (当前值: %.2f)", rule->message, m->value);
new_evt->current_value = m->value;
new_evt->start_time = now;
new_evt->end_time = 0;
new_evt->active = 1;
new_evt->next = mp->alert_events;
mp->alert_events = new_evt;

printf("[告警] %s: %s (%.2f %s %.2f)\n",
rule->severity, rule->name, m->value,
rule->condition, rule->threshold);
}
} else {
// 关闭告警
alert_event_t *evt = mp->alert_events;
while (evt) {
if (strcmp(evt->rule_name, rule->name) == 0 && evt->active) {
evt->active = 0;
evt->end_time = now;
printf("[告警] %s 已恢复\n", rule->name);
}
evt = evt->next;
}
}
}
m = m->next;
}
rule = rule->next;
}

pthread_mutex_unlock(&mp->mutex);
}
```

4. 监控线程

```c
/* Read mp->running under the platform mutex. */
static int monitor_should_run(monitor_platform_t *mp) {
    pthread_mutex_lock(&mp->mutex);
    int running = mp->running;
    pthread_mutex_unlock(&mp->mutex);
    return running;
}

/**
 * Thread entry point: evaluate the alert rules every 10 seconds until
 * mp->running becomes 0.
 *
 * The wait is split into one-second slices so a stop request is noticed
 * within about a second instead of after a full interval, and no
 * evaluation is started once a stop has been requested.
 *
 * @param arg the monitor_platform_t to evaluate.
 * @return always NULL.
 */
void *monitor_eval_thread(void *arg) {
    enum { EVAL_INTERVAL_SECONDS = 10 };

    monitor_platform_t *mp = arg;
    if (mp == NULL) {
        return NULL;
    }

    int elapsed = 0;
    while (monitor_should_run(mp)) {
        sleep(1);
        if (++elapsed < EVAL_INTERVAL_SECONDS) {
            continue;
        }
        elapsed = 0;
        if (!monitor_should_run(mp)) {
            break;
        }
        monitor_evaluate_alerts(mp);
    }
    return NULL;
}
```

5. Dashboard生成

```c
/* Write `text` to `fp`, replacing the characters that are special in HTML. */
static void dashboard_write_escaped(FILE *fp, const char *text) {
    for (const char *p = text; *p != '\0'; p++) {
        switch (*p) {
        case '&':  fputs("&amp;", fp);  break;
        case '<':  fputs("&lt;", fp);   break;
        case '>':  fputs("&gt;", fp);   break;
        case '"':  fputs("&quot;", fp); break;
        case '\'': fputs("&#39;", fp);  break;
        default:   fputc(*p, fp);       break;
        }
    }
}

/**
 * Write a static HTML dashboard listing every metric series and every
 * active alert.
 *
 * The page declares UTF-8 so the non-ASCII text in it renders correctly.
 * Metric names, label keys/values and alert texts are HTML-escaped before
 * being written. The platform mutex is held while the lists are walked.
 *
 * @param mp       platform to render; the call is ignored when NULL.
 * @param filename path of the HTML file to create or overwrite.
 *
 * Failures to open or write the file are reported on stderr; the success
 * message is printed only when the file was written completely.
 */
void monitor_generate_dashboard(monitor_platform_t *mp, const char *filename) {
    if (mp == NULL || filename == NULL) {
        return;
    }

    FILE *fp = fopen(filename, "w");
    if (fp == NULL) {
        fprintf(stderr, "[Dashboard] cannot open %s: %s\n",
                filename, strerror(errno));
        return;
    }

    fputs("<!DOCTYPE html>\n"
          "<html><head><meta charset='utf-8'><title>监控Dashboard</title>\n"
          "<style>\n"
          "body{font-family:monospace;padding:20px;background:#1a1a2e;color:#eee}\n"
          ".metric{background:#16213e;padding:15px;margin:10px 0;border-radius:8px}\n"
          ".metric-name{color:#e94560;font-weight:bold}\n"
          ".metric-value{color:#0f3460;font-size:24px}\n"
          ".label{color:#aaa;font-size:12px}\n"
          ".critical{color:#ff6b6b}\n"
          ".warning{color:#ffd93d}\n"
          ".info{color:#6bcb77}\n"
          "</style></head><body>\n",
          fp);

    fputs("<h1>📊 监控Dashboard</h1>\n", fp);

    pthread_mutex_lock(&mp->mutex);

    /* Metric series */
    for (const metric_value_t *m = mp->metrics; m != NULL; m = m->next) {
        fputs("<div class='metric'>\n", fp);
        fputs(" <span class='metric-name'>", fp);
        dashboard_write_escaped(fp, m->name);
        fputs("</span>\n", fp);

        /* Never index past the label array, whatever label_count says. */
        int shown = m->label_count > MAX_LABELS ? MAX_LABELS : m->label_count;
        if (shown > 0) {
            fputs(" <span class='label'>", fp);
            for (int i = 0; i < shown; i++) {
                dashboard_write_escaped(fp, m->labels[i].key);
                fputc('=', fp);
                dashboard_write_escaped(fp, m->labels[i].value);
                fputc(' ', fp);
            }
            fputs("</span>\n", fp);
        }

        switch (m->type) {
        case METRIC_COUNTER:
            fprintf(fp, " <div class='metric-value'>%.0f</div>\n", m->value);
            break;
        case METRIC_GAUGE:
            fprintf(fp, " <div class='metric-value'>%.2f</div>\n", m->value);
            break;
        case METRIC_HISTOGRAM:
            fprintf(fp, " <div class='metric-value'>count=%.0f, sum=%.2f</div>\n",
                    m->count, m->sum);
            break;
        default:
            /* Other types have no value line. */
            break;
        }
        fputs("</div>\n", fp);
    }

    /* Active alerts */
    fputs("<h2>🚨 活跃告警</h2>\n", fp);
    for (const alert_event_t *evt = mp->alert_events; evt != NULL;
         evt = evt->next) {
        if (!evt->active) {
            continue;
        }
        /* Map the severity onto one of the three CSS classes defined above. */
        const char *cls = "info";
        if (strcmp(evt->severity, "critical") == 0) {
            cls = "critical";
        } else if (strcmp(evt->severity, "warning") == 0) {
            cls = "warning";
        }
        fprintf(fp, "<div class='metric %s'>", cls);
        dashboard_write_escaped(fp, evt->severity);
        fputs(": ", fp);
        dashboard_write_escaped(fp, evt->message);
        fputs("</div>\n", fp);
    }

    pthread_mutex_unlock(&mp->mutex);

    fputs("</body></html>\n", fp);

    /* fclose flushes buffered data, so its result matters for a written file. */
    int failed = ferror(fp);
    if (fclose(fp) != 0) {
        failed = 1;
    }
    if (failed) {
        fprintf(stderr, "[Dashboard] failed to write %s\n", filename);
        return;
    }
    printf("[Dashboard] 已生成: %s\n", filename);
}
```

6. 测试代码

```c
void test_monitor() {
printf("=== 全链路监控平台测试 ===\n\n");

monitor_platform_t *mp = monitor_create();

// 添加告警规则
monitor_add_alert_rule(mp, "cpu_high", "cpu_usage", ">", 80.0, 30,
"critical", "CPU使用率过高");
monitor_add_alert_rule(mp, "error_rate_high", "error_rate", ">", 5.0, 60,
"warning", "错误率过高");
monitor_add_alert_rule(mp, "memory_low", "memory_free", "<", 1024.0, 60,
"critical", "可用内存不足");

// 启动监控线程
pthread_t eval_tid;
pthread_create(&eval_tid, NULL, monitor_eval_thread, mp);

// 模拟指标采集
printf("[模拟] 开始采集指标...\n");

for (int i = 0; i < 50; i++) {
label_t labels[2];
strcpy(labels[0].key, "service");
strcpy(labels[0].value, "order-service");
strcpy(labels[1].key, "env");
strcpy(labels[1].value, "prod");

// 模拟Counter
metric_counter_add(mp, "http_requests_total", labels, 2, 10 + rand() % 50);

// 模拟Gauge
double cpu = 30 + (rand() % 80);
metric_gauge_set(mp, "cpu_usage", labels, 2, cpu);

// 模拟Histogram
double latency = (rand() % 1000) / 100.0;
metric_histogram_observe(mp, "request_duration_ms", labels, 2, latency);

// 模拟错误率
double error_rate = (rand() % 10) / 100.0;
metric_gauge_set(mp, "error_rate", labels, 2, error_rate * 100);

// 模拟内存
double memory_free = 500 + (rand() % 4000);
metric_gauge_set(mp, "memory_free", labels, 2, memory_free);

usleep(100000);

if (i % 10 == 0) {
printf("[模拟] 采集 %d/50\n", i);
}
}

sleep(2);

// 生成Dashboard
monitor_generate_dashboard(mp, "dashboard.html");

mp->running = 0;
pthread_join(eval_tid, NULL);

printf("\n✅ 测试完成,打开 dashboard.html 查看监控面板\n");
free(mp);
}

/* Program entry point: seed the PRNG from the wall clock, then run the demo. */
int main(void) {
    time_t seed = time(NULL);

    srand((unsigned int)seed);
    test_monitor();
    return EXIT_SUCCESS;
}
```

---

三、编译和运行

```bash
gcc -o monitor monitor.c -lpthread -lm
./monitor
```

---

四、Prometheus vs 本实现

| 特性 | 本实现 | Prometheus |
| --- | --- | --- |
| 指标采集 | ✅ | ✅ |
| 多类型 | Counter/Gauge/Histogram | Counter/Gauge/Histogram/Summary |
| 标签支持 | ✅ | ✅ |
| 告警规则 | ✅ | ✅ (Alertmanager) |
| 持久化 | ❌ | ✅ (TSDB) |
| 查询语言 | ❌ | ✅ (PromQL) |
| 可视化 | ✅ (简单) | ✅ (Grafana) |
| 服务发现 | ❌ | ✅ |

---

五、总结

通过这篇文章,你学会了:

· 监控平台的核心原理(指标采集、告警、可视化)
· 三种指标类型(Counter、Gauge、Histogram)
· 告警规则引擎
· 标签(维度)的使用
· Dashboard生成

全链路监控是可观测性的核心。掌握它,你就拥有了提前发现系统问题的能力。

下一篇预告:《从零实现一个分布式任务调度平台:XXL-JOB的核心设计》

---

评论区分享一下你用监控系统发现过什么问题~

需要专业的网站建设服务?

联系我们获取免费的网站建设咨询和方案报价,让我们帮助您实现业务目标

立即咨询