Commit 09c45737 authored by Pedro Eduardo Trujillo
Browse files

Actualiza y corrige reglas de alerta

parent b497f3fa
Loading
Loading
Loading
Loading
+7 −7
Original line number Diff line number Diff line
@@ -2,8 +2,8 @@ groups:
- name: swarm_node
  rules:
  - alert: node_cpu_usage
    expr: 100 - (avg(irate(node_cpu{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
      node_meta * 100) BY (node_name)) > 70
    expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
      node_meta * 100) BY (node_name)) > 50
    for: 1m
    labels:
      severity: warning
@@ -12,7 +12,7 @@ groups:
        $value}}%.
      summary: CPU alert for Swarm node '{{ $labels.node_name }}'
  - alert: node_memory_usage
    expr: sum(((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal)
    expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
    for: 1m
    labels:
@@ -22,8 +22,8 @@ groups:
        $value}}%.
      summary: Memory alert for Swarm node '{{ $labels.node_name }}'
  - alert: node_disk_usage
    expr: ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
      * 100 / node_filesystem_size{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name)
    expr: ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})
      * 100 / node_filesystem_size_bytes{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    labels:
@@ -33,7 +33,7 @@ groups:
        $value}}%.
      summary: Disk alert for Swarm node '{{ $labels.node_name }}'
  - alert: node_disk_fill_rate_6h
    expr: predict_linear(node_filesystem_free{mountpoint="/"}[1h], 6 * 3600) * ON(instance)
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
    for: 1h
    labels:
+6 −3
Original line number Diff line number Diff line
@@ -13,11 +13,14 @@ groups:
      summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}'
        on '{{ $labels.container_label_com_docker_swarm_node_id }}'

  - alert: task_high_memory_usage_90
    expr: (sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"})
  - record: task_memory_usage_percent
    expr: 100 * (sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"})
      BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) /
      sum(container_spec_memory_limit_bytes{container_label_com_docker_swarm_task_name=~".+"})
      BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)) > 0.9
      BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id))

  - alert: task_high_memory_usage_90
    expr: task_memory_usage_percent > 90
    for: 1m
    annotations:
      description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{