Loading deploy/rules/swarm_job.rules.yml +2 −2 Original line number Diff line number Diff line groups: - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_job.rules.yml - name: swarm_job rules: - alert: backup_db_not_created expr: time() - backup_created_date_seconds{} > 86400 Loading @@ -9,7 +9,7 @@ groups: description: Error creating backup for '{{ $labels.label }}' database. summary: Error creating backup for '{{ $labels.label }}' database. - alert: certificate_renew_not_attemped - alert: certificate_renew_not_attempted expr: time() - certificates_valid_date_seconds{} > 604800 labels: severity: warning Loading deploy/rules/swarm_node.rules.yml +2 −2 Original line number Diff line number Diff line groups: - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_node.rules.yml - name: swarm_node rules: - alert: node_cpu_usage expr: 100 - (avg(irate(node_cpu{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 50 node_meta * 100) BY (node_name)) > 70 for: 1m labels: severity: warning Loading deploy/rules/swarm_service.rules.yml +7 −7 Original line number Diff line number Diff line groups: - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_service.rules.yml - name: swarm_service rules: - alert: service_in_restart_loop expr: avg without(instance)(changes(process_start_time_seconds[1h]) > bool 3) > 0.1 for: 10m labels: severity: ticket severity: warning annotations: description: The service '{{ $labels.label }}' has restarted more 3 times in last hour. summary: The service '{{ $labels.label }}' has restarted more 3 times in last hour. description: Service '{{ $labels.label }}' has been restarted 3 times in last hour. summary: Service '{{ $labels.label }}' has been restarted 3 times in last hour. - alert: service_down expr: up == 0 for: 5m labels: severity: severity severity: warning annotations: description: Instance {{ $labels.instance }} down. summary: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." No newline at end of file description: Instance '{{ $labels.instance }}' of job {{ $labels.job }} has been down for more than 5 minutes. summary: Instance '{{ $labels.instance }}' of job {{ $labels.job }} has been down for more than 5 minutes. No newline at end of file deploy/rules/swarm_task.rules.yml +16 −13 Original line number Diff line number Diff line groups: - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_task.rules.yml - name: swarm_task rules: - alert: task_high_cpu_usage_50 - alert: task_high_cpu_usage_80 expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m])) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) * 100 > 50 for: 1m * 100 > 80 for: 5m annotations: description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ $labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize $value}}%.' summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' - alert: task_high_memory_usage_1g expr: sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) > 1e+09 summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' - alert: task_high_memory_usage_90 expr: (sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) / sum(container_spec_memory_limit_bytes{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)) > 0.9 for: 1m annotations: description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ $labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize $value}}.' summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' $labels.container_label_com_docker_swarm_node_id }}'' memory usage is at {{ humanize $value}}%.' summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' Loading
deploy/rules/swarm_job.rules.yml +2 −2 Original line number Diff line number Diff line groups: - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_job.rules.yml - name: swarm_job rules: - alert: backup_db_not_created expr: time() - backup_created_date_seconds{} > 86400 Loading @@ -9,7 +9,7 @@ groups: description: Error creating backup for '{{ $labels.label }}' database. summary: Error creating backup for '{{ $labels.label }}' database. - alert: certificate_renew_not_attemped - alert: certificate_renew_not_attempted expr: time() - certificates_valid_date_seconds{} > 604800 labels: severity: warning Loading
deploy/rules/swarm_node.rules.yml +2 −2 Original line number Diff line number Diff line groups: - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_node.rules.yml - name: swarm_node rules: - alert: node_cpu_usage expr: 100 - (avg(irate(node_cpu{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 50 node_meta * 100) BY (node_name)) > 70 for: 1m labels: severity: warning Loading
deploy/rules/swarm_service.rules.yml +7 −7 Original line number Diff line number Diff line groups: - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_service.rules.yml - name: swarm_service rules: - alert: service_in_restart_loop expr: avg without(instance)(changes(process_start_time_seconds[1h]) > bool 3) > 0.1 for: 10m labels: severity: ticket severity: warning annotations: description: The service '{{ $labels.label }}' has restarted more 3 times in last hour. summary: The service '{{ $labels.label }}' has restarted more 3 times in last hour. description: Service '{{ $labels.label }}' has been restarted 3 times in last hour. summary: Service '{{ $labels.label }}' has been restarted 3 times in last hour. - alert: service_down expr: up == 0 for: 5m labels: severity: severity severity: warning annotations: description: Instance {{ $labels.instance }} down. summary: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." No newline at end of file description: Instance '{{ $labels.instance }}' of job {{ $labels.job }} has been down for more than 5 minutes. summary: Instance '{{ $labels.instance }}' of job {{ $labels.job }} has been down for more than 5 minutes. No newline at end of file
deploy/rules/swarm_task.rules.yml +16 −13 Original line number Diff line number Diff line groups: - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_task.rules.yml - name: swarm_task rules: - alert: task_high_cpu_usage_50 - alert: task_high_cpu_usage_80 expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m])) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) * 100 > 50 for: 1m * 100 > 80 for: 5m annotations: description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ $labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize $value}}%.' summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' - alert: task_high_memory_usage_1g expr: sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) > 1e+09 summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' - alert: task_high_memory_usage_90 expr: (sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) / sum(container_spec_memory_limit_bytes{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)) > 0.9 for: 1m annotations: description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ $labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize $value}}.' summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' $labels.container_label_com_docker_swarm_node_id }}'' memory usage is at {{ humanize $value}}%.' summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'