Loading deploy/docker-compose.dev.yml +5 −0 Original line number Diff line number Diff line Loading @@ -3,3 +3,8 @@ version: '3.5' volumes: prometheus: name: ${PROMETHEUS_VOL_NAME:-prometheus-vol} configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.dev.yml deploy/docker-compose.prod.yml +5 −0 Original line number Diff line number Diff line Loading @@ -15,3 +15,8 @@ volumes: backing: relocatable size: ${PROMETHEUS_VOL_SIZE:-10} ebstype: ${PROMETHEUS_VOL_EBS_TYPE:-gp2} configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.prod.yml deploy/docker-compose.tmpl.yml +2 −6 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ services: - source: service_rules target: /etc/prometheus/swarm_service.rules.yml healthcheck: test: wget --spider http://localhost:9090 test: wget --spider -q http://localhost:9090 interval: 30s timeout: 10s retries: 3 Loading @@ -41,7 +41,7 @@ services: traefik.backend: prometheus traefik.port: '9090' restart_policy: delay: 30s delay: 10s window: 2m resources: limits: Loading @@ -61,10 +61,6 @@ networks: external: true configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.yml task_rules: name: ${TASK_RULES_NAME:-task_rules} file: ./rules/swarm_task.rules.yml Loading deploy/rules/swarm_node.rules.dev.yml 0 → 100644 +65 −0 Original line number Diff line number Diff line groups: - name: swarm_node rules: - alert: node_cpu_usage expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 80 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%. summary: CPU alert for Swarm node '{{ $labels.node_name }}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/mnt/data"} - node_filesystem_free_bytes{mountpoint="/mnt/data"}) * 100 / node_filesystem_size_bytes{mountpoint="/mnt/data"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/mnt/data"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' No newline at end of file deploy/rules/swarm_node.rules.yml→deploy/rules/swarm_node.rules.prod.yml +3 −3 Original line number Diff line number Diff line Loading @@ -22,8 +22,8 @@ groups: $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name) expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: Loading @@ -33,7 +33,7 @@ groups: $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 6 * 3600) * ON(instance) expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: Loading Loading
deploy/docker-compose.dev.yml +5 −0 Original line number Diff line number Diff line Loading @@ -3,3 +3,8 @@ version: '3.5' volumes: prometheus: name: ${PROMETHEUS_VOL_NAME:-prometheus-vol} configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.dev.yml
deploy/docker-compose.prod.yml +5 −0 Original line number Diff line number Diff line Loading @@ -15,3 +15,8 @@ volumes: backing: relocatable size: ${PROMETHEUS_VOL_SIZE:-10} ebstype: ${PROMETHEUS_VOL_EBS_TYPE:-gp2} configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.prod.yml
deploy/docker-compose.tmpl.yml +2 −6 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ services: - source: service_rules target: /etc/prometheus/swarm_service.rules.yml healthcheck: test: wget --spider http://localhost:9090 test: wget --spider -q http://localhost:9090 interval: 30s timeout: 10s retries: 3 Loading @@ -41,7 +41,7 @@ services: traefik.backend: prometheus traefik.port: '9090' restart_policy: delay: 30s delay: 10s window: 2m resources: limits: Loading @@ -61,10 +61,6 @@ networks: external: true configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.yml task_rules: name: ${TASK_RULES_NAME:-task_rules} file: ./rules/swarm_task.rules.yml Loading
deploy/rules/swarm_node.rules.dev.yml 0 → 100644 +65 −0 Original line number Diff line number Diff line groups: - name: swarm_node rules: - alert: node_cpu_usage expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 80 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%. summary: CPU alert for Swarm node '{{ $labels.node_name }}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/mnt/data"} - node_filesystem_free_bytes{mountpoint="/mnt/data"}) * 100 / node_filesystem_size_bytes{mountpoint="/mnt/data"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/mnt/data"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' No newline at end of file
deploy/rules/swarm_node.rules.yml→deploy/rules/swarm_node.rules.prod.yml +3 −3 Original line number Diff line number Diff line Loading @@ -22,8 +22,8 @@ groups: $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name) expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: Loading @@ -33,7 +33,7 @@ groups: $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 6 * 3600) * ON(instance) expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: Loading