Commit 111283ab authored by Pedro Eduardo Trujillo
Browse files

Merge branch 'dev' into 'master'

Cambia humanize por precisión, retoca detalles

See merge request redmic-project/metric/prometheus!21
parents c2337010 aef700d5
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -20,7 +20,7 @@ groups:
    labels:
      severity: critical
    annotations:
      description: The heap usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      description: The heap usage is at {{$value | printf "%.1f"}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} heap usage is too high (> 80%)

  - alert: ElasticsearchHighFilesystemDataUsedPercent
@@ -29,11 +29,11 @@ groups:
    labels:
      severity: critical
    annotations:
      description: Filesystem usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      description: Filesystem usage is at {{$value | printf "%.1f"}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} filesystem usage is too high (> 80%)

  - alert: ElasticsearchGarbageCollectionTooSlow
    expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 0.5
    expr: irate(elasticsearch_jvm_gc_collection_seconds_sum[5m]) > 0.5
    for: 1m
    labels:
      severity: critical
+5 −5
Original line number Diff line number Diff line
@@ -2,12 +2,12 @@ groups:
- name: kafka
  rules:
  - alert: KafkaConsumerLagIncreasing
    expr: kafka_consumergroup_lag_sum / delta(kafka_consumergroup_current_offset_sum[5m]) >= 0.6
    expr: kafka_consumergroup_lag_sum / delta(kafka_consumergroup_current_offset_sum[5m]) > 0.6
    for: 3m
    labels:
      severity: warning
    annotations:
      description: Lag ratio of consumer group '{{$labels.consumergroup}}' is at {{humanize $value}} for '{{$labels.topic}}' topic
      description: Lag ratio of consumer group '{{$labels.consumergroup}}' is at {{$value | printf "%.3f"}} for '{{$labels.topic}}' topic
      summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}' (lag ratio > 0.6)

  - alert: KafkaConsumerTooHighLag
@@ -20,10 +20,10 @@ groups:
      summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming messages from '{{$labels.topic}}' (lag > 1M)

  - alert: KafkaConsumerLagIncreasingTooMuch
    expr: kafka_consumergroup_lag_sum / delta(kafka_consumergroup_current_offset_sum[5m]) >= 0.8
    expr: kafka_consumergroup_lag_sum / delta(kafka_consumergroup_current_offset_sum[5m]) >= 1
    for: 3m
    labels:
      severity: critical
    annotations:
      description: Lag ratio of consumer group '{{$labels.consumergroup}}' is at {{humanize $value}} for '{{$labels.topic}}' topic
      summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}' (lag ratio > 0.8)
      description: Lag ratio of consumer group '{{$labels.consumergroup}}' is at {{$value | printf "%.3f"}} for '{{$labels.topic}}' topic
      summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}' (lag ratio >= 1)
+10 −5
Original line number Diff line number Diff line
@@ -2,28 +2,32 @@ groups:
- name: scheduled_job
  rules:
  - alert: BackupDatabaseNotCreated
    expr: time() - backup_created_date_seconds{} > 129600
    expr: time() - backup_created_date_seconds > 129600
    for: 1h
    labels:
      severity: warning
    annotations:
      description: Backup for '{{$labels.label}}' database delayed for {{humanizeDuration $value}}

  - alert: CertificateRenewalNotAttempted
    expr: time() - certificates_valid_date_seconds{} > 907200
    expr: time() - certificates_valid_date_seconds > 907200
    for: 1h
    labels:
      severity: warning
    annotations:
      description: Renewal attempt for '{{$labels.label}}' certificate delayed for {{humanizeDuration $value}}

  - alert: CertificateNotRenewed
    expr: time() - certificates_updated_date_seconds{} > 5788800
    expr: time() - certificates_updated_date_seconds > 5788800
    for: 1h
    labels:
      severity: critical
    annotations:
      description: Certificate of '{{$labels.label}}' not renewed since {{humanizeDuration $value}}, expiry date is 3 months

  - alert: ElasticsearchSnapshotNotPerformed
    expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 129600
    expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp > 129600
    for: 1h
    labels:
      severity: warning
    annotations:
@@ -31,7 +35,8 @@ groups:
      summary: Scheduled Elasticsearch snapshot creation is not working

  - alert: ElasticsearchSnapshotCleanupNotPerformed
    expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 100
    expr: elasticsearch_snapshot_stats_number_of_snapshots > 100
    for: 1h
    labels:
      severity: warning
    annotations:
+4 −4
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ groups:
    labels:
      severity: warning
    annotations:
      description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
      description: Swarm node {{$labels.node_name}} CPU usage is at {{$value | printf "%.1f"}}%
      summary: CPU alert for Swarm node '{{$labels.node_name}}'

  - alert: node_memory_usage
@@ -18,7 +18,7 @@ groups:
    labels:
      severity: warning
    annotations:
      description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
      description: Swarm node {{$labels.node_name}} memory usage is at {{$value | printf "%.1f"}}%
      summary: Memory alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_usage
@@ -29,7 +29,7 @@ groups:
    labels:
      severity: critical
    annotations:
      description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
      description: Swarm node {{$labels.node_name}} disk usage is at {{$value | printf "%.1f"}}%
      summary: Disk alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_fill_rate_6h
@@ -50,7 +50,7 @@ groups:
    labels:
      severity: critical
    annotations:
      description: Swarm node {{$labels.node_name}} Docker mountpoint disk usage is at {{humanize $value}}%
      description: Swarm node {{$labels.node_name}} Docker mountpoint disk usage is at {{$value | printf "%.1f"}}%
      summary: Docker mountpoint disk alert for Swarm node '{{$labels.node_name}}'

  - alert: SwarmNodeDockerDiskFillRate
+4 −4
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ groups:
    labels:
      severity: warning
    annotations:
      description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
      description: Swarm node {{$labels.node_name}} CPU usage is at {{$value | printf "%.1f"}}%
      summary: CPU alert for Swarm node '{{$labels.node_name}}'

  - alert: node_memory_usage
@@ -18,7 +18,7 @@ groups:
    labels:
      severity: warning
    annotations:
      description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
      description: Swarm node {{$labels.node_name}} memory usage is at {{$value | printf "%.1f"}}%
      summary: Memory alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_usage
@@ -29,7 +29,7 @@ groups:
    labels:
      severity: critical
    annotations:
      description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
      description: Swarm node {{$labels.node_name}} disk usage is at {{$value | printf "%.1f"}}%
      summary: Disk alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_fill_rate_6h
@@ -50,7 +50,7 @@ groups:
    labels:
      severity: critical
    annotations:
      description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' usage is at {{humanize $value}}%
      description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' usage is at {{$value | printf "%.1f"}}%
      summary: EBS disk alert for Swarm node '{{$labels.node_name}}'

  - alert: SwarmNodeEbsDiskFillRate
Loading