Commit 3db64a77 authored by Pedro Eduardo Trujillo
Browse files

Actualiza y refina todas las reglas de alerta

parent 5d828a01
Loading
Loading
Loading
Loading
+11 −12
Original line number Diff line number Diff line
@@ -5,39 +5,38 @@ groups:
    expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
      / elasticsearch_filesystem_data_size_bytes

  # Recording rule: stores the free-space percentage as 100 minus the
  # elasticsearch_filesystem_data_used_percent series.
  - record: elasticsearch_filesystem_data_free_percent
    expr: 100 - elasticsearch_filesystem_data_used_percent

  # Fires (severity critical) when
  # elasticsearch_cluster_health_number_of_nodes has been below 3 for 5m.
  # NOTE(review): this is a diff view without +/- markers; the two
  # `description` lines below look like the old text followed by its
  # replacement — confirm against the repository.
  - alert: ElasticsearchTooFewNodesRunning
    expr: elasticsearch_cluster_health_number_of_nodes < 3
    for: 5m
    labels:
      severity: critical
    annotations:
      description: There are only {{$value}} (< 3) Elasticsearch nodes running
      description: There are only {{$value}} Elasticsearch nodes running in cluster '{{$labels.cluster}}'
      summary: Elasticsearch running on less than 3 nodes

  # Fires (severity critical) when JVM heap usage, computed as used bytes
  # divided by max bytes for area="heap", stays above the threshold for 15m.
  # The first `expr` compares the raw ratio against 0.9; the second scales
  # it by 100 and compares against 80.
  # NOTE(review): this is a diff view without +/- markers; duplicated keys
  # below look like old lines followed by their replacements — confirm
  # against the repository.
  - alert: ElasticsearchHeapTooHigh
    expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9
    expr: 100 * (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) > 80
    for: 15m
    labels:
      severity: critical
    annotations:
      description: The heap usage is over 90% for 15m in node '{{$labels.name}}'
      summary: Elasticsearch node {{$labels.name}} heap usage is too high
      description: The heap usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} heap usage is too high (> 80%)

  # Fires (severity critical) when elasticsearch_filesystem_data_used_percent
  # has been above 80 for 1m.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `description`/`summary` keys below look like old lines followed by their
  # replacements — confirm against the repository.
  - alert: ElasticsearchHighFilesystemDataUsedPercent
    expr: elasticsearch_filesystem_data_used_percent > 80
    for: 1m
    labels:
      severity: critical
    annotations:
      description: Filesystem usage is over 80% ({{$value}}%) in node '{{$labels.name}}'
      summary: Elasticsearch filesystem usage is too high
      description: Filesystem usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} filesystem usage is too high (> 80%)

  # Fires (severity critical) when the irate of
  # elasticsearch_jvm_gc_collection_seconds_sum over a 5m range stays above
  # the threshold for 1m. The first `expr` uses 1, the second 0.5.
  # NOTE(review): irate of a seconds counter is presumably GC seconds per
  # second of wall time, not the duration of a single collection, so the
  # "taking ... to complete" wording may be misleading — confirm.
  # NOTE(review): this is a diff view without +/- markers; duplicated keys
  # below look like old lines followed by their replacements — confirm
  # against the repository.
  - alert: ElasticsearchGarbageCollectionTooSlow
    expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 1
    expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 0.5
    for: 1m
    labels:
      severity: critical
    annotations:
      description: GC is taking more than 1 second to complete in node '{{$labels.name}}' ({{$labels.gc}} - {{$value}}s)
      summary: Elasticsearch memory usage is too heavy and GC is taking too much time
      description: GC ({{$labels.gc}} space) is taking {{humanizeDuration $value}} to complete in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} memory usage is too heavy and GC is taking too much time (> 500 ms)
+24 −6
Original line number Diff line number Diff line
groups:
- name: kafka
  rules:
  # Fires when kafka_consumergroup_lag_sum has been above 1000 for 5m.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `alert`, `severity`, `description` and `summary` keys below look like old
  # lines followed by their replacements (rename to KafkaConsumerHighLag,
  # severity critical -> warning) — confirm against the repository.
  - alert: KafkaConsumerLagTooHigh
  - alert: KafkaConsumerHighLag
    expr: kafka_consumergroup_lag_sum > 1000
    for: 5m
    labels:
      severity: critical
      severity: warning
    annotations:
      description: Consumer '{{$labels.consumergroup}}' has {{$value}} pending messages (> 1000) from '{{$labels.topic}}' topic
      summary: Kafka consumer '{{$labels.consumergroup}}' is not consuming enough messages from '{{$labels.topic}}'
      description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
      summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 1K)

  # Fires (severity warning) when the delta of kafka_consumergroup_lag_sum
  # over a 5m range has been positive for 5m, i.e. the lag keeps growing.
  - alert: KafkaConsumerLagIncreasing
    expr: delta(kafka_consumergroup_lag_sum[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 5 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
      summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'

  # Fires (severity critical) when kafka_consumergroup_lag_sum has been
  # above 10000 for 5m.
  - alert: KafkaConsumerTooHighLag
    expr: kafka_consumergroup_lag_sum > 10000
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
      summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 10K)

  # Fires (severity critical) when the delta of kafka_consumergroup_lag_sum
  # over a 15m range has been positive for 5m.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `description`/`summary` keys below look like old lines followed by their
  # replacements. The first description says "last 5 minutes" although the
  # expr window is 15m; the second one says 15 minutes — confirm against the
  # repository.
  - alert: KafkaConsumerLagIncreasingTooMuch
    expr: delta(kafka_consumergroup_lag_sum[15m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Consumer '{{$labels.consumergroup}}' lag increased in last 5 minutes by {{$value}} for '{{$labels.topic}}' topic
      summary: Kafka consumer '{{$labels.consumergroup}}' consuming ratio is not enough for '{{$labels.topic}}'
      description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 15 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
      summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'
+15 −18
Original line number Diff line number Diff line
groups:
- name: scheduled_job
  rules:
  # Fires (severity warning) when the difference between time() and
  # backup_created_date_seconds exceeds the threshold. The first `expr` uses
  # 86400 s (24 h), the second 129600 s (36 h). No `for:` is set.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `alert`, `expr` and `description` keys below look like old lines followed
  # by their replacements — confirm against the repository.
  - alert: backup_db_not_created
    expr: time() - backup_created_date_seconds{} > 86400
  - alert: BackupDatabaseNotCreated
    expr: time() - backup_created_date_seconds{} > 129600
    labels:
      severity: warning
    annotations:
      description: Error creating backup for '{{ $labels.label }}' database.
      summary: Error creating backup for '{{ $labels.label }}' database.
      description: Backup for '{{$labels.label}}' database delayed for {{humanizeDuration $value}}

  # Fires (severity warning) when the difference between time() and
  # certificates_valid_date_seconds exceeds the threshold. The first `expr`
  # uses 604800 s (7 days), the second 907200 s (10.5 days). No `for:` is set.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `alert`, `expr` and `description` keys below look like old lines followed
  # by their replacements — confirm against the repository.
  - alert: certificate_renew_not_attempted
    expr: time() - certificates_valid_date_seconds{} > 604800
  - alert: CertificateRenewalNotAttempted
    expr: time() - certificates_valid_date_seconds{} > 907200
    labels:
      severity: warning
    annotations:
      description: Error attemping to renew '{{ $labels.label }}' certificate.
      summary: Error attemping to renew '{{ $labels.label }}' certificate.
      description: Renewal attempt for '{{$labels.label}}' certificate delayed for {{humanizeDuration $value}}

  # Fires when the difference between time() and
  # certificates_updated_date_seconds exceeds 5788800 s (67 days). No `for:`
  # is set.
  # NOTE(review): the last description states "expiry date is 3 months";
  # presumably this means certificates are valid for about 3 months, so the
  # 67-day threshold leaves a margin before expiry — confirm the wording.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `alert`, `severity` and `description` keys below look like old lines
  # followed by their replacements (severity warning -> critical) — confirm
  # against the repository.
  - alert: certificate_not_renewed
  - alert: CertificateNotRenewed
    expr: time() - certificates_updated_date_seconds{} > 5788800
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Error, '{{ $labels.label }}' certificate near expiry.
      summary: Error, '{{ $labels.label }}' certificate near expiry.
      description: Certificate of '{{$labels.label}}' not renewed since {{humanizeDuration $value}}, expiry date is 3 months

  # Fires (severity warning) when the difference between time() and
  # elasticsearch_snapshot_stats_snapshot_end_time_timestamp exceeds the
  # threshold. The first `expr` uses 86400 s (24 h), the second 129600 s
  # (36 h). No `for:` is set.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `expr`, `description` and `summary` keys below look like old lines
  # followed by their replacements — confirm against the repository.
  - alert: ElasticsearchSnapshotNotPerformed
    expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 86400
    expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 129600
    labels:
      severity: warning
    annotations:
      description: Last snapshot for '{{ $labels.repository }}' repository is too old.
      summary: Scheduled Elasticsearch snapshot is not working.
      description: Snapshot for '{{$labels.repository}}' repository delayed for {{humanizeDuration $value}}
      summary: Scheduled Elasticsearch snapshot creation is not working

  # Fires (severity warning) when
  # elasticsearch_snapshot_stats_number_of_snapshots exceeds the threshold.
  # The first `expr` uses 64, the second 100. No `for:` is set.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `expr`, `description` and `summary` keys below look like old lines
  # followed by their replacements — confirm against the repository.
  - alert: ElasticsearchSnapshotCleanupNotPerformed
    expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 64
    expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 100
    labels:
      severity: warning
    annotations:
      description: Too many snapshots ({{$value}}) found for '{{ $labels.repository }}' repository.
      summary: Scheduled Elasticsearch snapshot cleanup is not working.
      description: Too many snapshots ({{humanize $value}}) found for '{{$labels.repository}}' repository
      summary: Scheduled Elasticsearch snapshot cleanup is not working
+22 −28
Original line number Diff line number Diff line
@@ -3,36 +3,33 @@ groups:
  rules:
  # Fires (severity warning) on per-node CPU usage: 100 minus the average,
  # by node_name, of the idle-mode irate of node_cpu_seconds_total over 1m,
  # joined with node_meta on `instance` and scaled by 100.
  # NOTE(review): this is a diff view without +/- markers; the two expr
  # continuation lines (> 80 / > 90), the two `for` values (1m / 5m) and the
  # two descriptions below look like old lines followed by their
  # replacements — confirm against the repository.
  - alert: node_cpu_usage
    expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
      node_meta * 100) BY (node_name)) > 80
    for: 1m
      node_meta * 100) BY (node_name)) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
      summary: CPU alert for Swarm node '{{$labels.node_name}}'

  # Fires (severity warning) on per-node memory usage: (MemTotal -
  # MemAvailable) / MemTotal, joined with node_meta on `instance`, scaled by
  # 100 and summed by node_name.
  # NOTE(review): this is a diff view without +/- markers; the two expr
  # continuation lines (> 80 / > 90), the two `for` values (1m / 5m) and the
  # two descriptions below look like old lines followed by their
  # replacements — confirm against the repository.
  - alert: node_memory_usage
    expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
    for: 1m
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
      summary: Memory alert for Swarm node '{{$labels.node_name}}'

  # Fires when the used percentage of the filesystem mounted at "/rootfs"
  # ((size - free) * 100 / size), joined with node_meta on `instance`,
  # exceeds 85.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `for` (1m / 5m), `severity` (warning / critical) and `description` keys
  # below look like old lines followed by their replacements — confirm
  # against the repository.
  - alert: node_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    for: 5m
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
      summary: Disk alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_fill_rate_6h
@@ -42,29 +39,26 @@ groups:
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk is going to fill up in
        6h.
      description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h
      summary: Disk fill alert for Swarm node '{{$labels.node_name}}'

  # Fires when the used percentage of the filesystem mounted at
  # "/rootfs/mnt/data" ((size - free) * 100 / size), joined with node_meta on
  # `instance`, exceeds 85.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `alert`, `for` (1m / 5m), `severity` (warning / critical), `description`
  # and `summary` keys below look like old lines followed by their
  # replacements — confirm against the repository.
  - alert: node_docker_disk_usage
  - alert: SwarmNodeDockerDiskUsage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"} - node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    for: 5m
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) usage is at {{ humanize
        $value}}%.
      summary: Disk (Docker mountpoint) alert for Swarm node '{{ $labels.node_name }}'
      description: Swarm node {{$labels.node_name}} Docker mountpoint disk usage is at {{humanize $value}}%
      summary: Docker mountpoint disk alert for Swarm node '{{$labels.node_name}}'

  # Fires (severity critical) when predict_linear over the last 1h of
  # node_filesystem_free_bytes for mountpoint "/rootfs/mnt/data", projected
  # 6 * 3600 seconds ahead and joined with node_meta on `instance`, has been
  # below 0 for 1h.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `alert`, `description` and `summary` keys below look like old lines
  # followed by their replacements — confirm against the repository.
  - alert: node_docker_disk_fill_rate_6h
  - alert: SwarmNodeDockerDiskFillRate
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
    for: 1h
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) is going to fill up in
        6h.
      summary: Disk (Docker mountpoint) fill alert for Swarm node '{{ $labels.node_name }}'
      description: Swarm node {{$labels.node_name}} Docker mountpoint disk is going to fill up in 6h
      summary: Docker mountpoint disk fill alert for Swarm node '{{$labels.node_name}}'
+22 −28
Original line number Diff line number Diff line
@@ -3,36 +3,33 @@ groups:
  rules:
  # Fires (severity warning) on per-node CPU usage: 100 minus the average,
  # by node_name, of the idle-mode irate of node_cpu_seconds_total over 1m,
  # joined with node_meta on `instance` and scaled by 100.
  # NOTE(review): this is a diff view without +/- markers; the two expr
  # continuation lines (> 80 / > 90), the two `for` values (1m / 5m) and the
  # two descriptions below look like old lines followed by their
  # replacements — confirm against the repository.
  - alert: node_cpu_usage
    expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
      node_meta * 100) BY (node_name)) > 80
    for: 1m
      node_meta * 100) BY (node_name)) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
      summary: CPU alert for Swarm node '{{$labels.node_name}}'

  # Fires (severity warning) on per-node memory usage: (MemTotal -
  # MemAvailable) / MemTotal, joined with node_meta on `instance`, scaled by
  # 100 and summed by node_name.
  # NOTE(review): this is a diff view without +/- markers; the two expr
  # continuation lines (> 80 / > 90), the two `for` values (1m / 5m) and the
  # two descriptions below look like old lines followed by their
  # replacements — confirm against the repository.
  - alert: node_memory_usage
    expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
    for: 1m
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
      summary: Memory alert for Swarm node '{{$labels.node_name}}'

  # Fires when the used percentage of the filesystem mounted at "/rootfs"
  # ((size - free) * 100 / size), joined with node_meta on `instance`,
  # exceeds 85.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `for` (1m / 5m), `severity` (warning / critical) and `description` keys
  # below look like old lines followed by their replacements — confirm
  # against the repository.
  - alert: node_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    for: 5m
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
      summary: Disk alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_fill_rate_6h
@@ -42,29 +39,26 @@ groups:
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk is going to fill up in
        6h.
      description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h
      summary: Disk fill alert for Swarm node '{{$labels.node_name}}'

  # Fires when the used percentage ((size - free) * 100 / size) of any
  # filesystem whose mountpoint matches the regex ".*/ebs/.*", joined with
  # node_meta on `instance`, exceeds 85.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `alert`, `for` (1m / 5m), `severity` (warning / critical) and
  # `description` keys below look like old lines followed by their
  # replacements — confirm against the repository.
  - alert: node_ebs_disk_usage
  - alert: SwarmNodeEbsDiskUsage
    expr: ((node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"} - node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"})
      * 100 / node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    for: 5m
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} EBS disk {{ $labels.mountpoint }} usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' usage is at {{humanize $value}}%
      summary: EBS disk alert for Swarm node '{{$labels.node_name}}'

  # Fires (severity critical) when predict_linear over the last 1h of
  # node_filesystem_free_bytes for mountpoints matching ".*/ebs/.*",
  # projected 6 * 3600 seconds ahead and joined with node_meta on `instance`,
  # has been below 0 for 1h.
  # NOTE(review): this is a diff view without +/- markers; the duplicated
  # `alert` and `description` keys below look like old lines followed by
  # their replacements — confirm against the repository.
  - alert: node_ebs_disk_fill_rate_6h
  - alert: SwarmNodeEbsDiskFillRate
    expr: predict_linear(node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
    for: 1h
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} EBS disk {{ $labels.mountpoint }} is going to fill up in
        6h.
      description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' is going to fill up in 6h
      summary: EBS disk fill alert for Swarm node '{{$labels.node_name}}'
Loading