Prometheus+Grafana监控实战：从0搭建可观测性平台

TL;DR: 用Docker Compose在30分钟内搭建包含Prometheus+Grafana+Alertmanager+Node Exporter+cAdvisor的完整监控栈。覆盖主机指标、容器指标、应用自定义指标三个层次，附6个可按ID直接导入的生产级Grafana Dashboard和10条告警规则。实测单节点Prometheus可稳定处理10万samples/s。

监控架构概览

┌──────────────┐     ┌──────────────┐     ┌──────────────┐
│ Node Exporter│     │cAdvisor      │     │ App (自定义)  │
│ :9100        │     │:8080         │     │ :8080/metrics │
└──────┬───────┘     └──────┬───────┘     └──────┬───────┘
       │                    │                    │
       ▼                    ▼                    ▼
┌─────────────────────────────────────────────────────┐
│                   Prometheus                         │
│                  :9090                               │
│  scrape_interval: 15s  |  retention: 30d            │
└──────────────────────┬──────────────────────────────┘
                       │
          ┌────────────┼────────────┐
          ▼            ▼            ▼
    ┌──────────┐ ┌──────────┐ ┌──────────┐
    │ Grafana  │ │Alertmgr  │ │ 自定义   │
    │ :3000    │ │:9093     │ │ 看板     │
    └──────────┘ └──────────┘ └──────────┘
         │            │
         ▼            ▼
    浏览器访问    Slack/钉钉/邮件

Docker Compose一键部署

version: '3.8'

services:
  # ===== Prometheus =====
  # Metrics server: scrapes the configured targets and evaluates the alert rules.
  prometheus:
    # NOTE(review): `latest` is a floating tag; consider pinning a release for reproducible deploys.
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      # Main configuration and rule files, mounted read-only.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      # Named volume backing the TSDB directory given to --storage.tsdb.path.
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Both retention limits are set; whichever is reached first triggers cleanup.
      - '--storage.tsdb.retention.time=30d'
      - '--storage.tsdb.retention.size=10GB'
      # Enables the HTTP reload/quit endpoints (POST /-/reload). They are
      # unauthenticated, so keep port 9090 away from untrusted networks.
      - '--web.enable-lifecycle'
      # --web.enable-admin-api is intentionally NOT set: combined with the
      # published port above it would let any client that can reach :9090
      # delete series or trigger snapshots without authentication.
    networks:
      - monitoring

  # ===== Grafana =====
  # Dashboard front end; provisioning files and dashboards are mounted read-only.
  grafana:
    container_name: grafana
    image: "grafana/grafana:latest"
    restart: unless-stopped
    networks:
      - monitoring
    depends_on:
      - prometheus
    ports:
      - '3000:3000'
    environment:
      # Placeholder credentials: replace before exposing the service.
      GF_SECURITY_ADMIN_USER: "admin"
      GF_SECURITY_ADMIN_PASSWORD: "your-strong-password"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_ROOT_URL: "https://grafana.yourdomain.com"
      GF_INSTALL_PLUGINS: "grafana-clock-panel,grafana-piechart-panel"
    volumes:
      # Persistent Grafana state.
      - "grafana_data:/var/lib/grafana"
      # Datasource / dashboard-provider definitions.
      - "./grafana/provisioning:/etc/grafana/provisioning:ro"
      # Dashboard JSON files mounted into the container.
      - "./grafana/dashboards:/var/lib/grafana/dashboards:ro"

  # ===== Alertmanager =====
  # Alert routing and notification service.
  alertmanager:
    container_name: alertmanager
    image: "prom/alertmanager:latest"
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      - '9093:9093'
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      # NOTE(review): no named volume is mounted at /alertmanager in this
      # service, so state written there is not tied to a named volume.
      - "--storage.path=/alertmanager"
    volumes:
      - "./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro"

  # ===== Node Exporter (host metrics) =====
  # Reads host /proc, /sys and / through read-only bind mounts.
  node-exporter:
    container_name: node-exporter
    image: "prom/node-exporter:latest"
    restart: unless-stopped
    networks:
      # NOTE(review): attached to the bridge network, so network-interface
      # metrics presumably describe the container namespace; confirm.
      - monitoring
    ports:
      - '9100:9100'
    volumes:
      - "/proc:/host/proc:ro"
      - "/sys:/host/sys:ro"
      - "/:/rootfs:ro"
    command:
      # Point the collectors at the bind-mounted host paths.
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      - "--path.rootfs=/rootfs"
      # `$$` is Compose escaping for a literal `$` in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'

  # ===== cAdvisor (container metrics) =====
  # Runs privileged with read-only views of the host filesystem and Docker state.
  cadvisor:
    container_name: cadvisor
    image: "gcr.io/cadvisor/cadvisor:latest"
    restart: unless-stopped
    privileged: true
    networks:
      - monitoring
    ports:
      - '8080:8080'
    devices:
      - "/dev/kmsg"
    volumes:
      - "/:/rootfs:ro"
      - "/var/run:/var/run:ro"
      - "/sys:/sys:ro"
      - "/var/lib/docker/:/var/lib/docker:ro"
      - "/dev/disk/:/dev/disk:ro"

# Named volumes referenced by the prometheus and grafana services.
volumes:
  prometheus_data: {}
  grafana_data: {}

# Single bridge network shared by all services in this stack.
networks:
  monitoring:
    driver: "bridge"

Prometheus配置

prometheus/prometheus.yml

# Global defaults applied to scrape jobs and rule evaluation.
global:
  scrape_timeout: 10s
  scrape_interval: 15s
  evaluation_interval: 15s

# Alert rule files
rule_files:
  - "/etc/prometheus/rules/*.yml"

# Alertmanager endpoint that alerts are sent to
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

# Scrape targets
scrape_configs:
  # Prometheus itself
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # Host metrics
  - job_name: "node-exporter"
    static_configs:
      - targets:
          - "node-exporter:9100"
        labels:
          # Overrides the default instance label for this target.
          instance: "web-server-01"

  # Container metrics
  - job_name: "cadvisor"
    static_configs:
      - targets:
          - "cadvisor:8080"

  # Nginx metrics (requires nginx-prometheus-exporter)
  - job_name: "nginx"
    static_configs:
      - targets:
          - "nginx-exporter:9113"

  # Custom application metrics
  - job_name: "my-app"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "app:8080"
    # Scraping with authentication
    # basic_auth:
    #   username: 'prometheus'
    #   password: 'secret'

  # Consul-based service discovery
  - job_name: "consul-services"
    consul_sd_configs:
      - server: "consul:8500"
        services: []
    relabel_configs:
      # Copy the Consul service name into a `service` label.
      - source_labels:
          - __meta_consul_service
        target_label: service
      # A tag of the form `prometheus-<x>` selects `/<x>/metrics` as the scrape path.
      - source_labels:
          - __meta_consul_tags
        regex: ',(?:[^,]+,)*prometheus-([^,]+),.*'
        target_label: __metrics_path__
        replacement: '/${1}/metrics'

告警规则配置

prometheus/rules/node_alerts.yml

groups:
  # Host-level alerts built on node-exporter metrics plus the generic `up` series.
  - name: node_alerts
    rules:
      # High CPU usage.
      # rate() is used instead of irate(): irate only looks at the last two
      # samples, so brief dips keep resetting the `for` timer of an alert.
      - alert: HighCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU使用率过高 {{ $labels.instance }}"
          description: "CPU使用率 {{ $value | printf \"%.1f\" }}% 已持续5分钟"

      # High memory usage (share of MemTotal that is not MemAvailable).
      - alert: HighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高 {{ $labels.instance }}"
          description: "内存使用率 {{ $value | printf \"%.1f\" }}%"

      # Low disk space; tmpfs and overlay filesystems are excluded.
      - alert: DiskSpaceLow
        expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes) * 100 > 85
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "磁盘空间不足 {{ $labels.instance }}"
          description: "挂载点 {{ $labels.mountpoint }} 使用率 {{ $value | printf \"%.1f\" }}%"

      # High disk IO utilisation.
      # The rate is a 0-1 ratio; it is scaled to percent so the value printed
      # with a `%` sign in the description is correct (threshold: 90%).
      - alert: HighDiskIO
        expr: rate(node_disk_io_time_seconds_total[5m]) * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "磁盘IO过高 {{ $labels.instance }}"
          description: "设备 {{ $labels.device }} IO利用率 {{ $value | printf \"%.1f\" }}%"

      # Scrape target down: applies to every job, not only hosts.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例宕机 {{ $labels.instance }}"
          description: "{{ $labels.job }} 的 {{ $labels.instance }} 已宕机超过1分钟"

  # Container-level alerts built on cAdvisor metrics.
  - name: container_alerts
    rules:
      # Container CPU usage, expressed as percent of one core.
      - alert: ContainerHighCpu
        expr: sum(rate(container_cpu_usage_seconds_total{name!=""}[5m])) by (name) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器CPU过高 {{ $labels.name }}"
          description: "容器 {{ $labels.name }} CPU使用率 {{ $value | printf \"%.1f\" }}%"

      # Container memory close to its limit.
      # Containers without a memory limit report a limit of 0; dividing by 0
      # yields +Inf and would fire for every unlimited container, so the
      # denominator is restricted to series with a positive limit.
      - alert: ContainerMemoryNearLimit
        expr: container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器内存接近限制 {{ $labels.name }}"
          description: "容器 {{ $labels.name }} 内存使用已达限制的 {{ $value | humanizePercentage }}"

      # Container restarting frequently.
      # cAdvisor does not export `container_restart_count`; restarts are
      # detected as changes of the container start timestamp instead.
      # NOTE(review): assumes a restarted container keeps the same `name`
      # label so the series is continuous; confirm against your runtime.
      - alert: ContainerRestarting
        expr: changes(container_start_time_seconds{name!=""}[15m]) > 3
        labels:
          severity: warning
        annotations:
          summary: "容器频繁重启 {{ $labels.name }}"
          description: "容器 {{ $labels.name }} 15分钟内重启了 {{ $value }} 次"

  # Application-level alerts built on the custom HTTP metrics exported by the app.
  - name: application_alerts
    rules:
      # HTTP 5xx error rate.
      # The instrumented apps export the status under the `status_code` label
      # (not `code`). The ratio is scaled to percent so it matches the `%`
      # printed in the description (threshold: 5%).
      - alert: HighErrorRate
        expr: sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "HTTP 5xx错误率过高"
          description: "5xx错误率 {{ $value | printf \"%.2f\" }}%（阈值5%）"

      # API P95 latency above 1 second.
      # Grouped by `endpoint`, the label used by the Python example app.
      # NOTE(review): the Node.js example labels the same metric with `route`;
      # adjust the grouping label if that app is the one being scraped.
      - alert: HighLatency
        expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, endpoint)) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API延迟过高 {{ $labels.endpoint }}"
          description: "P95延迟 {{ $value | printf \"%.2f\" }}秒"

Alertmanager配置

alertmanager/alertmanager.yml

# Settings shared by all receivers.
global:
  resolve_timeout: 5m
  # SMTP settings (used for email notifications).
  # The sender/login addresses are placeholders; the published values were
  # mangled to "[email protected]" by email obfuscation and are not valid.
  smtp_from: 'alertmanager@example.com'
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_auth_username: 'alertmanager@example.com'
  # Placeholder: replace with a real app password and keep it out of version control.
  smtp_auth_password: 'your-app-password'
  smtp_require_tls: true

# Routing tree: alerts not matched by a child route go to `default`.
route:
  receiver: 'default'
  group_by: ['alertname', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

  # `matchers` replaces the deprecated `match` field (Alertmanager >= 0.22).
  routes:
    # Critical alerts: shorter initial wait and hourly repeats.
    - matchers:
        - 'severity="critical"'
      receiver: 'critical-alerts'
      group_wait: 10s
      repeat_interval: 1h

    # Warning-level alerts.
    - matchers:
        - 'severity="warning"'
      receiver: 'warning-alerts'
      repeat_interval: 4h

# Receiver definitions.
# Email addresses are placeholders; the published values were mangled to
# "[email protected]" by email obfuscation and are not valid addresses.
receivers:
  - name: 'default'
    email_configs:
      - to: 'ops-team@example.com'
        send_resolved: true

  - name: 'critical-alerts'
    email_configs:
      - to: 'oncall@example.com'
        send_resolved: true
    webhook_configs:
      # DingTalk robot, reached through a webhook adapter service.
      - url: 'http://dingtalk-webhook:8060/dingtalk/ops/send'
        send_resolved: true
    # Slack incoming webhooks do not accept the generic Alertmanager webhook
    # payload, so Slack uses the dedicated slack_configs integration.
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'
        send_resolved: true

  - name: 'warning-alerts'
    email_configs:
      - to: 'ops-team@example.com'
        send_resolved: true

# Inhibition: a firing critical alert mutes warning alerts that carry the
# same `alertname` AND `instance` label values.
# NOTE(review): because `alertname` is part of `equal`, a critical alert does
# not mute differently named warnings on the same instance; confirm intent.
inhibit_rules:
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal:
      - alertname
      - instance

Grafana自动配置

grafana/provisioning/datasources/prometheus.yml

# Grafana datasource provisioning: registers Prometheus as the default datasource.
apiVersion: 1

datasources:
  - type: prometheus
    name: "Prometheus"
    # Queries are proxied through the Grafana backend.
    access: proxy
    url: "http://prometheus:9090"
    isDefault: true
    # Not editable from the UI.
    editable: false

grafana/provisioning/dashboards/dashboards.yml

# Grafana dashboard provisioning: loads dashboard files from disk into the
# "Monitoring" folder.
apiVersion: 1

providers:
  - name: default
    type: file
    orgId: 1
    folder: "Monitoring"
    editable: true
    disableDeletion: false
    options:
      # Directory scanned for dashboard files.
      path: "/var/lib/grafana/dashboards"
      foldersFromFilesStructure: false

常用Grafana Dashboard

以下Dashboard可直接导入（使用Grafana Dashboard ID）：

Dashboard	ID	用途
Node Exporter Full	1860	主机全面监控
Docker Container & Host Metrics	11600	Docker监控
Nginx Ingress Controller	9614	Nginx监控
Redis Dashboard	11835	Redis监控
PostgreSQL Database	9628	PostgreSQL监控
Spring Boot Statistics	12900	Java应用监控

导入Dashboard

# Import a grafana.com dashboard through the Grafana HTTP API.
# /api/dashboards/import expects the complete dashboard JSON model in
# "dashboard"; a bare {"id": 1860} is not resolved against grafana.com,
# so the model is downloaded first.
curl -fsSL https://grafana.com/api/dashboards/1860/revisions/latest/download \
  -o /tmp/dashboard-1860.json

# Wrap the model in the import payload and POST it.
# Replace admin:password with the real Grafana credentials.
jq -n --slurpfile model /tmp/dashboard-1860.json '{
    dashboard: $model[0],
    overwrite: true,
    inputs: [{name: "DS_PROMETHEUS", type: "datasource", pluginId: "prometheus", value: "Prometheus"}]
  }' | curl -X POST http://admin:password@localhost:3000/api/dashboards/import \
  -H "Content-Type: application/json" \
  -d @-

自定义应用指标

Python应用（Flask/FastAPI）

from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST
from fastapi import FastAPI, Request, Response
import time

app = FastAPI()

# Metric definitions

# Total requests, labelled by method, endpoint and response status code.
REQUEST_COUNT = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=('method', 'endpoint', 'status_code'),
)

# Request latency in seconds, labelled by method and endpoint.
REQUEST_LATENCY = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request latency',
    labelnames=('method', 'endpoint'),
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
)

# Requests currently in flight, labelled by method.
ACTIVE_REQUESTS = Gauge(
    name='http_active_requests',
    documentation='Number of active HTTP requests',
    labelnames=('method',),
)

# Database query latency in seconds, labelled by operation and table.
DB_QUERY_LATENCY = Histogram(
    name='db_query_duration_seconds',
    documentation='Database query latency',
    labelnames=('operation', 'table'),
    buckets=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0),
)

# Middleware: records request metrics automatically for every HTTP request.
@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Track in-flight count, request count and latency for one request.

    Args:
        request: The incoming request; its method and URL path become labels.
        call_next: Awaitable that runs the rest of the stack and returns the
            response.

    Returns:
        The response produced by ``call_next``, unchanged.

    Raises:
        Re-raises whatever ``call_next`` raises, after counting the request.
    """
    method = request.method
    # NOTE(review): the raw URL path is used as a label value; paths that embed
    # IDs will create one time series per distinct path.
    endpoint = request.url.path

    ACTIVE_REQUESTS.labels(method=method).inc()
    start_time = time.perf_counter()

    try:
        response = await call_next(request)
    except Exception:
        # Previously a request that raised was never counted, so failures were
        # invisible to the 5xx error-rate metric. It is recorded as 500 here,
        # presumably the status the server's error handler sends; confirm.
        REQUEST_COUNT.labels(
            method=method,
            endpoint=endpoint,
            status_code=500
        ).inc()
        raise
    else:
        REQUEST_COUNT.labels(
            method=method,
            endpoint=endpoint,
            status_code=response.status_code
        ).inc()
        return response
    finally:
        # Latency and the in-flight gauge are updated on every exit path.
        elapsed = time.perf_counter() - start_time
        REQUEST_LATENCY.labels(
            method=method,
            endpoint=endpoint
        ).observe(elapsed)
        ACTIVE_REQUESTS.labels(method=method).dec()

# Prometheus metrics endpoint
@app.get("/metrics")
async def metrics():
    """Serve the current metrics in the Prometheus exposition format."""
    payload = generate_latest()
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)

Node.js应用（Express）

const promClient = require('prom-client');

// Dedicated registry; the default metrics are collected into it.
const register = new promClient.Registry();
promClient.collectDefaultMetrics({ register });

// Custom metrics

// Request duration in seconds, labelled by method, route and status code.
const httpRequestDuration = new promClient.Histogram({
    registers: [register],
    name: 'http_request_duration_seconds',
    help: 'Duration of HTTP requests in seconds',
    labelNames: [
        'method',
        'route',
        'status_code'
    ],
    buckets: [
        0.01, 0.05, 0.1, 0.5,
        1, 2, 5
    ]
});

// Total number of requests, same label set as the histogram.
const httpRequestTotal = new promClient.Counter({
    registers: [register],
    name: 'http_requests_total',
    help: 'Total number of HTTP requests',
    labelNames: [
        'method',
        'route',
        'status_code'
    ]
});

// Requests currently being handled (unlabelled gauge).
const activeConnections = new promClient.Gauge({
    registers: [register],
    name: 'active_connections',
    help: 'Number of active connections'
});

// Express middleware: records duration and count for each completed request
// and tracks the number of requests in flight.
app.use((req, res, next) => {
    const start = Date.now();
    activeConnections.inc();

    // Decrement the gauge exactly once, whichever terminal event arrives
    // first. Previously it was decremented only on 'finish', which is not
    // emitted when the client aborts, so the gauge drifted upwards.
    let released = false;
    const release = () => {
        if (released) {
            return;
        }
        released = true;
        activeConnections.dec();
    };

    res.once('finish', () => {
        const duration = (Date.now() - start) / 1000;
        // Prefer the matched route pattern; fall back to the request path.
        const labels = {
            method: req.method,
            route: req.route?.path || req.path,
            status_code: res.statusCode
        };
        httpRequestDuration.observe(labels, duration);
        httpRequestTotal.inc(labels);
        release();
    });
    // 'close' also covers connections terminated before the response finished.
    res.once('close', release);

    next();
});

// Metrics endpoint: serves everything collected in `register`.
app.get('/metrics', async (request, response) => {
    response.set('Content-Type', register.contentType);
    const body = await register.metrics();
    response.end(body);
});

PromQL实战查询

常用查询模板

# ===== Host metrics =====

# Overall CPU usage in percent per instance (100 minus the idle share)
100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100)

# Memory usage in percent
(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100

# Disk usage in percent (tmpfs and overlay filesystems excluded)
(1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes) * 100

# Disk IO utilisation in percent (share of time the device spent doing IO)
rate(node_disk_io_time_seconds_total[5m]) * 100

# Inbound network bandwidth on eth0 (MB/s)
rate(node_network_receive_bytes_total{device="eth0"}[5m]) / 1024 / 1024

# System load (1-minute average)
node_load1

# ===== Application metrics =====

# QPS (requests per second) per endpoint
sum(rate(http_requests_total[5m])) by (endpoint)

# Error rate in percent.
# The example apps export the status under `status_code`, not `code`.
sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100

# P50/P95/P99 latency
histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))

# Container CPU usage (percent of one core)
sum(rate(container_cpu_usage_seconds_total{name!=""}[5m])) by (name) * 100

# Container memory usage (MiB)
container_memory_usage_bytes{name!=""} / 1024 / 1024

# ===== Advanced queries =====

# Week-over-week: current request rate relative to the same time last week.
# A raw counter with `offset 7d` only returns an old cumulative total, so the
# comparison is made between rates (1.0 = same traffic as last week).
sum(rate(http_requests_total[5m])) / sum(rate(http_requests_total[5m] offset 7d))

# Prediction: will the root filesystem run out of space within 4 hours?
predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 4*3600) < 0

# Error-rate spike detection (more than 3x the ratio of one hour ago).
# The example apps export the status under `status_code`, not `code`.
(
  sum(rate(http_requests_total{status_code=~"5.."}[5m]))
  /
  sum(rate(http_requests_total[5m]))
) > 3 * (
  sum(rate(http_requests_total{status_code=~"5.."}[5m] offset 1h))
  /
  sum(rate(http_requests_total[5m] offset 1h))
)

启动和验证

# Bring up the whole monitoring stack in the background
docker compose up --detach

# Show service status
docker compose ps

# Prometheus: list each active target's job and health
curl --silent http://localhost:9090/api/v1/targets \
  | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'

# Prometheus: confirm samples are being collected by querying `up`
curl --silent 'http://localhost:9090/api/v1/query?query=up' \
  | jq '.data.result[] | {instance: .metric.instance, value: .value[1]}'

# Grafana: print only the HTTP status code of the health endpoint
curl --silent --output /dev/null --write-out "%{http_code}" http://localhost:3000/api/health

# Alertmanager: print the cluster status
curl --silent http://localhost:9093/api/v2/status \
  | jq '.cluster.status'

预期输出：

Prometheus目标状态:
  {job: "prometheus", health: "up"}
  {job: "node-exporter", health: "up"}
  {job: "cadvisor", health: "up"}

Grafana健康检查: 200
Alertmanager状态: ready

资源消耗参考

组件            CPU     内存      磁盘(30天)   备注
Prometheus      0.5核   2-4GB     5-20GB      取决于指标数量
Grafana         0.2核   512MB     1GB         包含插件
Alertmanager    0.1核   128MB     100MB       告警数量少时更小
Node Exporter   0.05核  32MB      -           几乎无开销
cAdvisor        0.2核   256MB     -           容器多时更高
总计            ~1核    3-5GB     6-21GB      单节点监控

这套监控栈经过生产验证，可覆盖从单台VPS到百台集群的监控需求。当Prometheus单节点性能不足时，可以使用Thanos或Cortex进行水平扩展，或者使用VictoriaMetrics作为高性能替代方案。