Commit 3ad408a4 authored by Pedro Eduardo Trujillo's avatar Pedro Eduardo Trujillo
Browse files

Merge branch 'dev' into 'master'

Dev

See merge request redmic-project/metric/prometheus!19
parents 602ab6d2 876aa7d2
Loading
Loading
Loading
Loading

LICENSE

0 → 100644
+21 −0
Original line number Diff line number Diff line
MIT License

Copyright (c) 2019 REDMIC Project / Metric

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+11 −12
Original line number Diff line number Diff line
@@ -5,39 +5,38 @@ groups:
    expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
      / elasticsearch_filesystem_data_size_bytes

  - record: elasticsearch_filesystem_data_free_percent
    expr: 100 - elasticsearch_filesystem_data_used_percent

  - alert: ElasticsearchTooFewNodesRunning
    expr: elasticsearch_cluster_health_number_of_nodes < 3
    for: 5m
    labels:
      severity: critical
    annotations:
      description: There are only {{$value}} (< 3) Elasticsearch nodes running
      description: There are only {{$value}} Elasticsearch nodes running in cluster '{{$labels.cluster}}'
      summary: Elasticsearch running on less than 3 nodes

  - alert: ElasticsearchHeapTooHigh
    expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9
    expr: 100 * (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) > 80
    for: 15m
    labels:
      severity: critical
    annotations:
      description: The heap usage is over 90% for 15m in node '{{$labels.name}}'
      summary: Elasticsearch node {{$labels.name}} heap usage is too high
      description: The heap usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} heap usage is too high (> 80%)

  - alert: ElasticsearchHighFilesystemDataUsedPercent
    expr: elasticsearch_filesystem_data_used_percent > 80
    for: 1m
    labels:
      severity: critical
    annotations:
      description: Filesystem usage is over 80% ({{$value}}%) in node '{{$labels.name}}'
      summary: Elasticsearch filesystem usage is too high
      description: Filesystem usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} filesystem usage is too high (> 80%)

  - alert: ElasticsearchGarbageCollectionTooSlow
    expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 1
    expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 0.5
    for: 1m
    labels:
      severity: critical
    annotations:
      description: GC is taking more than 1 second to complete in node '{{$labels.name}}' ({{$labels.gc}} - {{$value}}s)
      summary: Elasticsearch memory usage is too heavy and GC is taking too much time
      description: GC ({{$labels.gc}} space) is taking {{humanizeDuration $value}} to complete in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} memory usage is too heavy and GC is taking too much time (> 500 ms)
+24 −6
Original line number Diff line number Diff line
groups:
- name: kafka
  rules:
  - alert: KafkaConsumerLagTooHigh
  - alert: KafkaConsumerHighLag
    expr: kafka_consumergroup_lag_sum > 1000
    for: 5m
    labels:
      severity: critical
      severity: warning
    annotations:
      description: Consumer '{{$labels.consumergroup}}' has {{$value}} pending messages (> 1000) from '{{$labels.topic}}' topic
      summary: Kafka consumer '{{$labels.consumergroup}}' is not consuming enough messages from '{{$labels.topic}}'
      description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
      summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 1K)

  - alert: KafkaConsumerLagIncreasing
    expr: delta(kafka_consumergroup_lag_sum[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 5 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
      summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'

  - alert: KafkaConsumerTooHighLag
    expr: kafka_consumergroup_lag_sum > 10000
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
      summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 10K)

  - alert: KafkaConsumerLagIncreasingTooMuch
    expr: delta(kafka_consumergroup_lag_sum[15m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Consumer '{{$labels.consumergroup}}' lag increased in last 5 minutes by {{$value}} for '{{$labels.topic}}' topic
      summary: Kafka consumer '{{$labels.consumergroup}}' consuming ratio is not enough for '{{$labels.topic}}'
      description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 15 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
      summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'
+15 −18
Original line number Diff line number Diff line
groups:
- name: scheduled_job
  rules:
  - alert: backup_db_not_created
    expr: time() - backup_created_date_seconds{} > 86400
  - alert: BackupDatabaseNotCreated
    expr: time() - backup_created_date_seconds{} > 129600
    labels:
      severity: warning
    annotations:
      description: Error creating backup for '{{ $labels.label }}' database.
      summary: Error creating backup for '{{ $labels.label }}' database.
      description: Backup for '{{$labels.label}}' database delayed for {{humanizeDuration $value}}

  - alert: certificate_renew_not_attempted
    expr: time() - certificates_valid_date_seconds{} > 604800
  - alert: CertificateRenewalNotAttempted
    expr: time() - certificates_valid_date_seconds{} > 907200
    labels:
      severity: warning
    annotations:
      description: Error attempting to renew '{{ $labels.label }}' certificate.
      summary: Error attempting to renew '{{ $labels.label }}' certificate.
      description: Renewal attempt for '{{$labels.label}}' certificate delayed for {{humanizeDuration $value}}

  - alert: certificate_not_renewed
  - alert: CertificateNotRenewed
    expr: time() - certificates_updated_date_seconds{} > 5788800
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Error, '{{ $labels.label }}' certificate near expiry.
      summary: Error, '{{ $labels.label }}' certificate near expiry.
      description: Certificate of '{{$labels.label}}' not renewed since {{humanizeDuration $value}}, expiry date is 3 months

  - alert: ElasticsearchSnapshotNotPerformed
    expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 86400
    expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 129600
    labels:
      severity: warning
    annotations:
      description: Last snapshot for '{{ $labels.repository }}' repository is too old.
      summary: Scheduled Elasticsearch snapshot is not working.
      description: Snapshot for '{{$labels.repository}}' repository delayed for {{humanizeDuration $value}}
      summary: Scheduled Elasticsearch snapshot creation is not working

  - alert: ElasticsearchSnapshotCleanupNotPerformed
    expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 64
    expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 100
    labels:
      severity: warning
    annotations:
      description: Too many snapshots ({{$value}}) found for '{{ $labels.repository }}' repository.
      summary: Scheduled Elasticsearch snapshot cleanup is not working.
      description: Too many snapshots ({{humanize $value}}) found for '{{$labels.repository}}' repository
      summary: Scheduled Elasticsearch snapshot cleanup is not working
+22 −28
Original line number Diff line number Diff line
@@ -3,36 +3,33 @@ groups:
  rules:
  - alert: node_cpu_usage
    expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
      node_meta * 100) BY (node_name)) > 80
    for: 1m
      node_meta * 100) BY (node_name)) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
      summary: CPU alert for Swarm node '{{$labels.node_name}}'

  - alert: node_memory_usage
    expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
    for: 1m
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
      summary: Memory alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    for: 5m
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
      summary: Disk alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_fill_rate_6h
@@ -42,29 +39,26 @@ groups:
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk is going to fill up in
        6h.
      description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h
      summary: Disk fill alert for Swarm node '{{$labels.node_name}}'

  - alert: node_docker_disk_usage
  - alert: SwarmNodeDockerDiskUsage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"} - node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    for: 5m
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) usage is at {{ humanize
        $value}}%.
      summary: Disk (Docker mountpoint) alert for Swarm node '{{ $labels.node_name }}'
      description: Swarm node {{$labels.node_name}} Docker mountpoint disk usage is at {{humanize $value}}%
      summary: Docker mountpoint disk alert for Swarm node '{{$labels.node_name}}'

  - alert: node_docker_disk_fill_rate_6h
  - alert: SwarmNodeDockerDiskFillRate
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
    for: 1h
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) is going to fill up in
        6h.
      summary: Disk (Docker mountpoint) fill alert for Swarm node '{{ $labels.node_name }}'
      description: Swarm node {{$labels.node_name}} Docker mountpoint disk is going to fill up in 6h
      summary: Docker mountpoint disk fill alert for Swarm node '{{$labels.node_name}}'
Loading