Deploying Prometheus + Grafana on a single machine with docker-compose

Deploying Prometheus + Grafana on a single machine with docker-compose

Collect node status through node-exporter and process-exporter, store it in the Prometheus TSDB, and report it in a Grafana dashboard.

  • Set up and install the related components
 # Update the system and install the Docker engine from Docker's CentOS repo.
 yum update -y
 yum install -y yum-utils
 yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
 yum install -y docker-ce docker-ce-cli containerd.io
 systemctl enable docker
 systemctl start docker

 # Create the host directories that docker-compose.yml bind-mounts into the containers.
 mkdir -p /data/monitor/{alertmanager,prometheus/{config,data},consul/data,grafana/data}

 # Hand the tree to UID/GID 1000 (the command is `chown`; `chmown` does not exist).
 # NOTE(review): confirm 1000:1000 matches the user each container runs as;
 # otherwise the Prometheus data directory may not be writable.
 chown -R 1000:1000 /data/monitor
 # Grafana's directories are made world-writable so the container user can write to them.
 chmod -R 777 /data/monitor/grafana
  • Pull the monitoring-related Docker images
# Pre-pull every image referenced by docker-compose.yml.
docker pull prom/prometheus:latest
docker pull grafana/grafana:latest
docker pull prom/alertmanager:latest
docker pull prom/pushgateway:latest
# NOTE(review): confirm `consul:latest` still resolves on Docker Hub; newer
# Consul releases may only be published as `hashicorp/consul`.
docker pull consul:latest
  • Prepare the docker-compose.yml service configuration file
 # docker-compose.yml: single-host monitoring stack (Consul, Prometheus,
 # Alertmanager, Grafana, Pushgateway). All services share one bridge network
 # and persist their state under /data/monitor on the host.
 version: '2.1'

 networks:
   monitor-net:
     driver: bridge

 services:
   # Consul agent in dev mode, listening on all interfaces; exporters register
   # themselves here and Prometheus discovers them through it.
   consul:
     image: consul:latest
     container_name: consul
     command: agent -dev -bind=0.0.0.0 -client=0.0.0.0
     restart: unless-stopped
     volumes:
       - /data/monitor/consul/data:/consul/data
     ports:
       - '8500:8500'
     networks:
       - monitor-net
     labels:
       org.label-schema.group: "monitoring"

   # Prometheus reads its configuration and rule files from the host config
   # directory and stores the TSDB in the host data directory.
   prometheus:
     image: prom/prometheus:latest
     container_name: prometheus
     volumes:
       - /data/monitor/prometheus/config:/etc/prometheus
       - /data/monitor/prometheus/data:/prometheus:rw
     command:
       - '--config.file=/etc/prometheus/prometheus.yml'
       - '--storage.tsdb.path=/prometheus'
       - '--web.console.libraries=/etc/prometheus/console_libraries'
       - '--web.console.templates=/etc/prometheus/consoles'
       - '--storage.tsdb.retention.time=200h'
       # Enables the HTTP endpoints for reloading the config and shutting down.
       - '--web.enable-lifecycle'
     restart: unless-stopped
     links:
       - 'consul:consul'
     ports:
       - '9090:9090'
     networks:
       - monitor-net
     labels:
       org.label-schema.group: "monitoring"

   # Alertmanager loads /data/monitor/alertmanager/config.yml from the host.
   alertmanager:
     image: prom/alertmanager:latest
     container_name: alertmanager
     volumes:
       - /data/monitor/alertmanager:/etc/alertmanager
     command:
       - '--config.file=/etc/alertmanager/config.yml'
       - '--storage.path=/alertmanager'
     restart: unless-stopped
     ports:
       - '9093:9093'
     networks:
       - monitor-net
     labels:
       org.label-schema.group: "monitoring"

   # Grafana; the admin credentials are substituted from the ADMIN_USER and
   # ADMIN_PASSWORD variables of the shell that runs docker-compose.
   grafana:
     image: grafana/grafana:latest
     container_name: grafana
     volumes:
       - /data/monitor/grafana/data:/var/lib/grafana:rw
       - /data/monitor/grafana/provisioning:/etc/grafana/provisioning
     environment:
       - GF_SECURITY_ADMIN_USER=${ADMIN_USER}
       - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD}
       - GF_USERS_ALLOW_SIGN_UP=false
     restart: unless-stopped
     ports:
       - '3000:3000'
     networks:
       - monitor-net
     labels:
       org.label-schema.group: "monitoring"

   pushgateway:
     image: prom/pushgateway:latest
     container_name: pushgateway
     restart: unless-stopped
     ports:
       # Pushgateway listens on 9091 inside the container; the container side
       # of the mapping must be 9091 (it was mistyped as 0991).
       - '9091:9091'
     networks:
       - monitor-net
     labels:
       org.label-schema.group: "monitoring"
  • Configure Prometheus to scrape the metrics. The configuration directory is /data/monitor/prometheus/config; it contains prometheus.yml and the node_down.yml alert rules file.
 # prometheus.yml: global timing, Alertmanager endpoint, rule files and scrape jobs.
 global:
   scrape_interval: 15s  # Scrape targets every 15 seconds. Default is every 1 minute.
   evaluation_interval: 15s  # Evaluate rules every 15 seconds. Default is every 1 minute.
   # scrape_timeout is set to the global default (10s).

 # Alertmanager configuration: alerts are sent to the `alertmanager` compose service.
 alerting:
   alertmanagers:
     - scheme: http
       static_configs:
         - targets:
             - 'alertmanager:9093'

 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
   - "node_down.yml"
   # - "first_rules.yml"
   # - "second_rules.yml"

 scrape_configs:
   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
   # Prometheus scraping its own metrics endpoint.
   - job_name: 'prometheus'
     scrape_interval: 10s
     static_configs:
       - targets: ['localhost:9090']

   # NOTE(review): no service named `node_exporter` is defined in the compose
   # file shown in this guide; confirm this hostname resolves from the
   # Prometheus container.
   - job_name: 'node'
     scrape_interval: 5s
     static_configs:
       - targets: ['node_exporter:9100']

   # Discover targets from Consul (`services: []` means all services), then keep
   # only those whose Consul tags contain 'exporter'; everything else, including
   # Consul's own default service, is dropped.
   - job_name: 'consul-prometheus'
     consul_sd_configs:
       - server: 'consul:8500'
         services: []
     relabel_configs:
       - source_labels: [__meta_consul_tags]
         regex: '.*exporter.*'
         action: keep

The alert rules file config/node_down.yml:

 # node_down.yml: alert rules loaded by Prometheus through `rule_files`.
 groups:
   - name: node_down
     rules:
       # Fires when a scrape target has reported `up == 0` for a full minute.
       - alert: InstanceDown
         expr: up == 0
         for: 1m
         labels:
           user: test
         annotations:
           summary: "Instance {{ $labels.instance }} down"
           description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
  • The /data/monitor/alertmanager/config.yml file looks like the following
 # Alertmanager config.yml: SMTP settings plus a single e-mail receiver.
 # The host names, addresses and password below are placeholders.
 global:
   smtp_smarthost: 'smtp.xxxx.com:25'
   smtp_from: 'xxxx@test.com'
   smtp_auth_username: 'xxx@test.com'
   smtp_auth_password: 'TPP***'
   smtp_require_tls: false

 # All alerts are grouped by alert name and routed to one receiver.
 route:
   group_by: ['alertname']
   group_wait: 10s
   group_interval: 10s
   repeat_interval: 10m
   receiver: live-monitoring

 receivers:
   - name: 'live-monitoring'
     email_configs:
       - to: 'receives@test.com'
  • Define the launch script, which sets the default Grafana admin credentials
#!/usr/bin/env bash
# Start the monitoring stack in the background.
# ADMIN_USER and ADMIN_PASSWORD are exported so docker-compose can substitute
# them into the Grafana service's GF_SECURITY_ADMIN_* settings.

export ADMIN_USER=test
export ADMIN_PASSWORD='xxxxxxx'
docker-compose up -d
  • Configure the exporters to expose their metrics and register them with Consul
 # The service config file `node_exporter.service`: runs node_exporter under systemd.
 [Unit]
 Description=node_exporter
 Documentation=https://prometheus.io/
 After=network.target

 [Service]
 Type=simple
 ExecStart=/usr/local/node_exporter-1.2.2.linux-amd64/node_exporter
 Restart=on-failure

 [Install]
 # `mulser.target` was a typo; `multi-user.target` is the target that makes
 # `systemctl enable` start the service on a normal boot.
 WantedBy=multi-user.target

 # Register the exporter endpoints with Consul (shell commands, not part of the
 # unit file). Both carry the "exporter" tag that the Prometheus
 # `consul-prometheus` job keeps, and each has an HTTP health check every 5s.
 curl -X PUT -d '{"id": "node-exporter","name": "node-exporter-test","address": "test.node-exporter.com","port": 9100,"tags": ["exporter","node"],"checks": [{"http": "http://test.node-exporter.com:9100/metrics", "interval": "5s"}]}'  http://test.consul.com:8500/v1/agent/service/register
 # The URL previously had three slashes (`http:///test.consul.com`), which is not a valid host URL.
 curl -X PUT -d '{"id": "process-exporter","name": "process-exporter-test","address": "test.process-exporter.com","port": 9256,"tags": ["exporter","process"],"checks": [{"http": "http://test.process-exporter.com:9256/metrics", "interval": "5s"}]}'  http://test.consul.com:8500/v1/agent/service/register