Commit 32076a5c authored by Pedro Eduardo Trujillo
Browse files

Corrige métricas de disco y diferencia por entorno

Actualiza otros valores menores de despliegue.
parent 3f8a3ad2
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -3,3 +3,8 @@ version: '3.5'
# Declares the 'prometheus' volume. Its name is taken from PROMETHEUS_VOL_NAME
# and falls back to 'prometheus-vol' when that variable is not set.
volumes:
  prometheus:
    name: '${PROMETHEUS_VOL_NAME:-prometheus-vol}'

# Declares the 'node_rules' config from the dev variant of the node rules file.
# The config name falls back to 'node_rules' when NODE_RULES_NAME is not set.
configs:
  node_rules:
    file: './rules/swarm_node.rules.dev.yml'
    name: '${NODE_RULES_NAME:-node_rules}'
+5 −0
Original line number Diff line number Diff line
@@ -15,3 +15,8 @@ volumes:
      backing: relocatable
      size: ${PROMETHEUS_VOL_SIZE:-10}
      ebstype: ${PROMETHEUS_VOL_EBS_TYPE:-gp2}

# Declares the 'node_rules' config from the prod variant of the node rules file.
# The config name falls back to 'node_rules' when NODE_RULES_NAME is not set.
configs:
  node_rules:
    file: './rules/swarm_node.rules.prod.yml'
    name: '${NODE_RULES_NAME:-node_rules}'
+2 −6
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@ services:
      - source: service_rules
        target: /etc/prometheus/swarm_service.rules.yml
    healthcheck:
      test: wget --spider http://localhost:9090
      test: wget --spider -q http://localhost:9090
      interval: 30s
      timeout: 10s
      retries: 3
@@ -41,7 +41,7 @@ services:
        traefik.backend: prometheus
        traefik.port: '9090'
      restart_policy:
        delay: 30s
        delay: 10s
        window: 2m
      resources:
        limits:
@@ -61,10 +61,6 @@ networks:
    external: true

configs:
  node_rules:
    name: ${NODE_RULES_NAME:-node_rules}
    file: ./rules/swarm_node.rules.yml

  task_rules:
    name: ${TASK_RULES_NAME:-task_rules}
    file: ./rules/swarm_task.rules.yml
+65 −0
Original line number Diff line number Diff line
# Prometheus alerting rules for the 'swarm_node' group.
#
# Every expression ends with the join `* ON(instance) GROUP_LEFT(node_name)
# node_meta`, which copies the node_name label from the node_meta series onto
# the result so annotations can reference {{ $labels.node_name }}.
# NOTE(review): this presumably relies on node_meta having the value 1, so the
# multiplication leaves the measured number unchanged - confirm.
#
# NOTE(review): the disk rules use node_filesystem_free_bytes; confirm whether
# node_filesystem_avail_bytes would be a better fit for these thresholds.
groups:
- name: swarm_node
  rules:
  # Fires when 100 minus the per-node average idle CPU percentage (irate over
  # 1m) stays above 80 for 1 minute.
  - alert: node_cpu_usage
    expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
      node_meta * 100) BY (node_name)) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
        $value}}%.
      summary: CPU alert for Swarm node '{{ $labels.node_name }}'
  # Fires when (MemTotal - MemAvailable) / MemTotal, as a percentage summed per
  # node_name, stays above 80 for 1 minute.
  - alert: node_memory_usage
    expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
        $value}}%.
      summary: Memory alert for Swarm node '{{ $labels.node_name }}'
  # Fires when the used percentage of the filesystem mounted at /rootfs stays
  # above 85 for 1 minute.
  # NOTE(review): /rootfs is presumably the host root filesystem as seen by the
  # exporter - confirm against the exporter's mounts.
  - alert: node_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
        $value}}%.
      summary: Disk alert for Swarm node '{{ $labels.node_name }}'
  # Fires when a linear extrapolation of /rootfs free bytes (based on the last
  # 1h) predicts a value below zero 6 hours ahead, sustained for 1 hour.
  - alert: node_disk_fill_rate_6h
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
    for: 1h
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk is going to fill up in
        6h.
      summary: Disk fill alert for Swarm node '{{ $labels.node_name }}'
  # Same usage check as node_disk_usage, but for the filesystem mounted at
  # /mnt/data. The annotations name that mountpoint so a notification can be
  # told apart from the /rootfs alert.
  - alert: node_docker_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/mnt/data"} - node_filesystem_free_bytes{mountpoint="/mnt/data"})
      * 100 / node_filesystem_size_bytes{mountpoint="/mnt/data"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} docker disk (/mnt/data) usage
        is at {{ humanize $value}}%.
      summary: Docker disk alert for Swarm node '{{ $labels.node_name }}'
  # Same fill-rate prediction as node_disk_fill_rate_6h, but for the filesystem
  # mounted at /mnt/data; annotations name that mountpoint.
  - alert: node_docker_disk_fill_rate_6h
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/mnt/data"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
    for: 1h
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} docker disk (/mnt/data) is
        going to fill up in 6h.
      summary: Docker disk fill alert for Swarm node '{{ $labels.node_name }}'
 No newline at end of file
+3 −3
Original line number Diff line number Diff line
@@ -22,8 +22,8 @@ groups:
        $value}}%.
      summary: Memory alert for Swarm node '{{ $labels.node_name }}'
  - alert: node_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})
      * 100 / node_filesystem_size_bytes{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name)
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    labels:
@@ -33,7 +33,7 @@ groups:
        $value}}%.
      summary: Disk alert for Swarm node '{{ $labels.node_name }}'
  - alert: node_disk_fill_rate_6h
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 6 * 3600) * ON(instance)
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
    for: 1h
    labels: