Commit e41879c3 authored by Pedro Eduardo Trujillo
Browse files

Retoca definición de alertas

parent 23ba9e46
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
groups:
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_job.rules.yml
- name: swarm_job
  rules:
  - alert: backup_db_not_created
    expr: time() - backup_created_date_seconds{} > 86400
@@ -9,7 +9,7 @@ groups:
      description: Error creating backup for '{{ $labels.label }}' database.
      summary: Error creating backup for '{{ $labels.label }}' database.

  - alert: certificate_renew_not_attemped
  - alert: certificate_renew_not_attempted
    expr: time() - certificates_valid_date_seconds{} > 604800
    labels:
      severity: warning
+2 −2
Original line number Diff line number Diff line
groups:
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_node.rules.yml
- name: swarm_node
  rules:
  - alert: node_cpu_usage
    expr: 100 - (avg(irate(node_cpu{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
      node_meta * 100) BY (node_name)) > 50
      node_meta * 100) BY (node_name)) > 70
    for: 1m
    labels:
      severity: warning
+7 −7
Original line number Diff line number Diff line
groups:
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_service.rules.yml
- name: swarm_service
  rules:
  - alert: service_in_restart_loop
    expr: avg without(instance)(changes(process_start_time_seconds[1h]) > bool 3) > 0.1
    for: 10m
    labels:
      severity: ticket
      severity: warning
    annotations:
      description: The service '{{ $labels.label }}' has restarted more 3 times in last hour.
      summary: The service '{{ $labels.label }}' has restarted more 3 times in last hour.
      description: Service '{{ $labels.label }}' has been restarted 3 times in last hour.
      summary: Service '{{ $labels.label }}' has been restarted 3 times in last hour.

  - alert: service_down
    expr: up == 0
    for: 5m
    labels:
      severity: severity
      severity: warning
    annotations:
      description: Instance {{ $labels.instance }} down.
      summary: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
 No newline at end of file
      description: Instance '{{ $labels.instance }}' of job {{ $labels.job }} has been down for more than 5 minutes.
      summary: Instance '{{ $labels.instance }}' of job {{ $labels.job }} has been down for more than 5 minutes.
 No newline at end of file
+16 −13
Original line number Diff line number Diff line
groups:
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_task.rules.yml
- name: swarm_task
  rules:
  - alert: task_high_cpu_usage_50
  - alert: task_high_cpu_usage_80
    expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m]))
      BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)
      * 100 > 50
    for: 1m
      * 100 > 80
    for: 5m
    annotations:
      description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
        $labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize
        $value}}%.'
      summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
        }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'
  - alert: task_high_memory_usage_1g
    expr: sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"})
      BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) > 1e+09
      summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}'
        on '{{ $labels.container_label_com_docker_swarm_node_id }}'

  - alert: task_high_memory_usage_90
    expr: (sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"})
      BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) /
      sum(container_spec_memory_limit_bytes{container_label_com_docker_swarm_task_name=~".+"})
      BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)) > 0.9
    for: 1m
    annotations:
      description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
        $labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize
        $value}}.'
      summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
        }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'
        $labels.container_label_com_docker_swarm_node_id }}'' memory usage is at {{ humanize
        $value}}%.'
      summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}'
        on '{{ $labels.container_label_com_docker_swarm_node_id }}'