Loading deploy/rules/swarm_node.rules.yml +7 −7 Original line number Diff line number Diff line Loading @@ -2,8 +2,8 @@ groups: - name: swarm_node rules: - alert: node_cpu_usage expr: 100 - (avg(irate(node_cpu{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 70 expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 50 for: 1m labels: severity: warning Loading @@ -12,7 +12,7 @@ groups: $value}}%. summary: CPU alert for Swarm node '{{ $labels.node_name }}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal) expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 for: 1m labels: Loading @@ -22,8 +22,8 @@ groups: $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) * 100 / node_filesystem_size{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name) expr: ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: Loading @@ -33,7 +33,7 @@ groups: $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free{mountpoint="/"}[1h], 6 * 3600) * ON(instance) expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: Loading deploy/rules/swarm_task.rules.yml +6 −3 Original line number Diff line number Diff line Loading @@ -13,11 +13,14 @@ groups: summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' - alert: task_high_memory_usage_90 expr: (sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) - record: task_memory_usage_percent expr: 100 * (sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) / sum(container_spec_memory_limit_bytes{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)) > 0.9 BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)) - alert: task_high_memory_usage_90 expr: task_memory_usage_percent > 90 for: 1m annotations: description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ Loading Loading
deploy/rules/swarm_node.rules.yml +7 −7 Original line number Diff line number Diff line Loading @@ -2,8 +2,8 @@ groups: - name: swarm_node rules: - alert: node_cpu_usage expr: 100 - (avg(irate(node_cpu{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 70 expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 50 for: 1m labels: severity: warning Loading @@ -12,7 +12,7 @@ groups: $value}}%. summary: CPU alert for Swarm node '{{ $labels.node_name }}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal) expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 for: 1m labels: Loading @@ -22,8 +22,8 @@ groups: $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) * 100 / node_filesystem_size{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name) expr: ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: Loading @@ -33,7 +33,7 @@ groups: $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free{mountpoint="/"}[1h], 6 * 3600) * ON(instance) expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: Loading
deploy/rules/swarm_task.rules.yml +6 −3 Original line number Diff line number Diff line Loading @@ -13,11 +13,14 @@ groups: summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' - alert: task_high_memory_usage_90 expr: (sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) - record: task_memory_usage_percent expr: 100 * (sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) / sum(container_spec_memory_limit_bytes{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)) > 0.9 BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)) - alert: task_high_memory_usage_90 expr: task_memory_usage_percent > 90 for: 1m annotations: description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ Loading