Loading deploy/rules/elasticsearch.rules.yml +11 −12 Original line number Diff line number Diff line Loading @@ -5,39 +5,38 @@ groups: expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes - record: elasticsearch_filesystem_data_free_percent expr: 100 - elasticsearch_filesystem_data_used_percent - alert: ElasticsearchTooFewNodesRunning expr: elasticsearch_cluster_health_number_of_nodes < 3 for: 5m labels: severity: critical annotations: description: There are only {{$value}} (< 3) Elasticsearch nodes running description: There are only {{$value}} Elasticsearch nodes running in cluster '{{$labels.cluster}}' summary: Elasticsearch running on less than 3 nodes - alert: ElasticsearchHeapTooHigh expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9 expr: 100 * (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) > 80 for: 15m labels: severity: critical annotations: description: The heap usage is over 90% for 15m in node '{{$labels.name}}' summary: Elasticsearch node {{$labels.name}} heap usage is too high description: The heap usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}' summary: Elasticsearch node {{$labels.name}} heap usage is too high (> 80%) - alert: ElasticsearchHighFilesystemDataUsedPercent expr: elasticsearch_filesystem_data_used_percent > 80 for: 1m labels: severity: critical annotations: description: Filesystem usage is over 80% ({{$value}}%) in node '{{$labels.name}}' summary: Elasticsearch filesystem usage is too high description: Filesystem usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}' summary: Elasticsearch node {{$labels.name}} filesystem usage is too high (> 80%) - alert: ElasticsearchGarbageCollectionTooSlow expr: 
irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 1 expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 0.5 for: 1m labels: severity: critical annotations: description: GC is taking more than 1 second to complete in node '{{$labels.name}}' ({{$labels.gc}} - {{$value}}s) summary: Elasticsearch memory usage is too heavy and GC is taking too much time description: GC ({{$labels.gc}} space) is taking {{humanizeDuration $value}} to complete in node '{{$labels.name}}' from cluster '{{$labels.cluster}}' summary: Elasticsearch node {{$labels.name}} memory usage is too heavy and GC is taking too much time (> 500 ms) deploy/rules/kafka.rules.yml +24 −6 Original line number Diff line number Diff line groups: - name: kafka rules: - alert: KafkaConsumerLagTooHigh - alert: KafkaConsumerHighLag expr: kafka_consumergroup_lag_sum > 1000 for: 5m labels: severity: critical severity: warning annotations: description: Consumer '{{$labels.consumergroup}}' has {{$value}} pending messages (> 1000) from '{{$labels.topic}}' topic summary: Kafka consumer '{{$labels.consumergroup}}' is not consuming enough messages from '{{$labels.topic}}' description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 1K) - alert: KafkaConsumerLagIncreasing expr: delta(kafka_consumergroup_lag_sum[5m]) > 0 for: 5m labels: severity: warning annotations: description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 5 minutes by {{humanize $value}} for '{{$labels.topic}}' topic summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}' - alert: KafkaConsumerTooHighLag expr: kafka_consumergroup_lag_sum > 10000 for: 5m labels: severity: critical annotations: description: Consumer group '{{$labels.consumergroup}}' has {{humanize 
$value}} pending messages from '{{$labels.topic}}' topic summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 10K) - alert: KafkaConsumerLagIncreasingTooMuch expr: delta(kafka_consumergroup_lag_sum[15m]) > 0 for: 5m labels: severity: critical annotations: description: Consumer '{{$labels.consumergroup}}' lag increased in last 5 minutes by {{$value}} for '{{$labels.topic}}' topic summary: Kafka consumer '{{$labels.consumergroup}}' consuming ratio is not enough for '{{$labels.topic}}' description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 15 minutes by {{humanize $value}} for '{{$labels.topic}}' topic summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}' deploy/rules/scheduled_job.rules.yml +15 −18 Original line number Diff line number Diff line groups: - name: scheduled_job rules: - alert: backup_db_not_created expr: time() - backup_created_date_seconds{} > 86400 - alert: BackupDatabaseNotCreated expr: time() - backup_created_date_seconds{} > 129600 labels: severity: warning annotations: description: Error creating backup for '{{ $labels.label }}' database. summary: Error creating backup for '{{ $labels.label }}' database. description: Backup for '{{$labels.label}}' database delayed for {{humanizeDuration $value}} - alert: certificate_renew_not_attempted expr: time() - certificates_valid_date_seconds{} > 604800 - alert: CertificateRenewalNotAttempted expr: time() - certificates_valid_date_seconds{} > 907200 labels: severity: warning annotations: description: Error attemping to renew '{{ $labels.label }}' certificate. summary: Error attemping to renew '{{ $labels.label }}' certificate. 
description: Renewal attempt for '{{$labels.label}}' certificate delayed for {{humanizeDuration $value}} - alert: certificate_not_renewed - alert: CertificateNotRenewed expr: time() - certificates_updated_date_seconds{} > 5788800 labels: severity: warning severity: critical annotations: description: Error, '{{ $labels.label }}' certificate near expiry. summary: Error, '{{ $labels.label }}' certificate near expiry. description: Certificate of '{{$labels.label}}' not renewed since {{humanizeDuration $value}}, expiry date is 3 months - alert: ElasticsearchSnapshotNotPerformed expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 86400 expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 129600 labels: severity: warning annotations: description: Last snapshot for '{{ $labels.repository }}' repository is too old. summary: Scheduled Elasticsearch snapshot is not working. description: Snapshot for '{{$labels.repository}}' repository delayed for {{humanizeDuration $value}} summary: Scheduled Elasticsearch snapshot creation is not working - alert: ElasticsearchSnapshotCleanupNotPerformed expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 64 expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 100 labels: severity: warning annotations: description: Too many snapshots ({{$value}}) found for '{{ $labels.repository }}' repository. summary: Scheduled Elasticsearch snapshot cleanup is not working. 
description: Too many snapshots ({{humanize $value}}) found for '{{$labels.repository}}' repository summary: Scheduled Elasticsearch snapshot cleanup is not working deploy/rules/swarm_node.rules.dev.yml +22 −28 Original line number Diff line number Diff line Loading @@ -3,36 +3,33 @@ groups: rules: - alert: node_cpu_usage expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 80 for: 1m node_meta * 100) BY (node_name)) > 90 for: 5m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%. description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}% summary: CPU alert for Swarm node '{{$labels.node_name}}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 for: 1m * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90 for: 5m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%. description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}% summary: Memory alert for Swarm node '{{$labels.node_name}}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m for: 5m labels: severity: warning severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. 
description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}% summary: Disk alert for Swarm node '{{$labels.node_name}}' - alert: node_disk_fill_rate_6h Loading @@ -42,29 +39,26 @@ groups: labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h summary: Disk fill alert for Swarm node '{{$labels.node_name}}' - alert: node_docker_disk_usage - alert: SwarmNodeDockerDiskUsage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"} - node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m for: 5m labels: severity: warning severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) usage is at {{ humanize $value}}%. summary: Disk (Docker mountpoint) alert for Swarm node '{{ $labels.node_name }}' description: Swarm node {{$labels.node_name}} Docker mountpoint disk usage is at {{humanize $value}}% summary: Docker mountpoint disk alert for Swarm node '{{$labels.node_name}}' - alert: node_docker_disk_fill_rate_6h - alert: SwarmNodeDockerDiskFillRate expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) is going to fill up in 6h. 
summary: Disk (Docker mountpoint) fill alert for Swarm node '{{ $labels.node_name }}' description: Swarm node {{$labels.node_name}} Docker mountpoint disk is going to fill up in 6h summary: Docker mountpoint disk fill alert for Swarm node '{{$labels.node_name}}' deploy/rules/swarm_node.rules.prod.yml +22 −28 Original line number Diff line number Diff line Loading @@ -3,36 +3,33 @@ groups: rules: - alert: node_cpu_usage expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 80 for: 1m node_meta * 100) BY (node_name)) > 90 for: 5m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%. description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}% summary: CPU alert for Swarm node '{{$labels.node_name}}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 for: 1m * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90 for: 5m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%. description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}% summary: Memory alert for Swarm node '{{$labels.node_name}}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m for: 5m labels: severity: warning severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. 
description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}% summary: Disk alert for Swarm node '{{$labels.node_name}}' - alert: node_disk_fill_rate_6h Loading @@ -42,29 +39,26 @@ groups: labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h summary: Disk fill alert for Swarm node '{{$labels.node_name}}' - alert: node_ebs_disk_usage - alert: SwarmNodeEbsDiskUsage expr: ((node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"} - node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}) * 100 / node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m for: 5m labels: severity: warning severity: critical annotations: description: Swarm node {{ $labels.node_name }} EBS disk {{ $labels.mountpoint }} usage is at {{ humanize $value}}%. description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' usage is at {{humanize $value}}% summary: EBS disk alert for Swarm node '{{$labels.node_name}}' - alert: node_ebs_disk_fill_rate_6h - alert: SwarmNodeEbsDiskFillRate expr: predict_linear(node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} EBS disk {{ $labels.mountpoint }} is going to fill up in 6h. description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' is going to fill up in 6h summary: EBS disk fill alert for Swarm node '{{$labels.node_name}}' Loading
# deploy/rules/elasticsearch.rules.yml — reconstructed post-diff state.
# NOTE(review): the diff hunk starts at original file line 5, so the file
# header (`groups:` / `- name:` / first record rule name) is not visible
# here — the leading lines above this fragment must be confirmed against
# the repository.
        expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes
      # Derived convenience metric: free % is the complement of used %.
      - record: elasticsearch_filesystem_data_free_percent
        expr: 100 - elasticsearch_filesystem_data_used_percent
      # Fires when the cluster reports fewer than 3 member nodes for 5m.
      - alert: ElasticsearchTooFewNodesRunning
        expr: elasticsearch_cluster_health_number_of_nodes < 3
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There are only {{$value}} Elasticsearch nodes running in cluster '{{$labels.cluster}}'
          summary: Elasticsearch running on less than 3 nodes
      # JVM heap pressure: sustained (15m) heap usage above 80%.
      - alert: ElasticsearchHeapTooHigh
        expr: 100 * (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) > 80
        for: 15m
        labels:
          severity: critical
        annotations:
          description: The heap usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
          summary: Elasticsearch node {{$labels.name}} heap usage is too high (> 80%)
      # Data-path filesystem above 80% used (recorded metric defined above).
      - alert: ElasticsearchHighFilesystemDataUsedPercent
        expr: elasticsearch_filesystem_data_used_percent > 80
        for: 1m
        labels:
          severity: critical
        annotations:
          description: Filesystem usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
          summary: Elasticsearch node {{$labels.name}} filesystem usage is too high (> 80%)
      # GC spending more than 0.5s of collection time per second (irate of
      # the cumulative GC-seconds counter over 5m).
      - alert: ElasticsearchGarbageCollectionTooSlow
        expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 0.5
        for: 1m
        labels:
          severity: critical
        annotations:
          description: GC ({{$labels.gc}} space) is taking {{humanizeDuration $value}} to complete in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
          summary: Elasticsearch node {{$labels.name}} memory usage is too heavy and GC is taking too much time (> 500 ms)
# deploy/rules/kafka.rules.yml — reconstructed post-diff state (complete file).
# Two-tier lag alerting: absolute lag (warning > 1K, critical > 10K) plus
# lag-trend alerts over 5m (warning) and 15m (critical) windows.
groups:
  - name: kafka
    rules:
      # Absolute consumer-group lag above 1000 messages, sustained 5m.
      - alert: KafkaConsumerHighLag
        expr: kafka_consumergroup_lag_sum > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
          summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 1K)
      # Lag grew over the last 5 minutes (consumers falling behind producers).
      - alert: KafkaConsumerLagIncreasing
        expr: delta(kafka_consumergroup_lag_sum[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 5 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
          summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'
      # Absolute consumer-group lag above 10000 messages, sustained 5m.
      - alert: KafkaConsumerTooHighLag
        expr: kafka_consumergroup_lag_sum > 10000
        for: 5m
        labels:
          severity: critical
        annotations:
          description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
          summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 10K)
      # Lag still growing over a longer 15-minute window — sustained regression.
      - alert: KafkaConsumerLagIncreasingTooMuch
        expr: delta(kafka_consumergroup_lag_sum[15m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 15 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
          summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'
# deploy/rules/scheduled_job.rules.yml — reconstructed post-diff state
# (complete file). Watches "freshness" timestamps exported by scheduled
# jobs; thresholds are in seconds (129600 = 36h, 907200 = 10.5d,
# 5788800 = 67d).
groups:
  - name: scheduled_job
    rules:
      # Daily DB backup missing: last backup older than 36 hours.
      - alert: BackupDatabaseNotCreated
        expr: time() - backup_created_date_seconds{} > 129600
        labels:
          severity: warning
        annotations:
          description: Backup for '{{$labels.label}}' database delayed for {{humanizeDuration $value}}
      # Certificate renewal job has not run for more than ~10.5 days.
      - alert: CertificateRenewalNotAttempted
        expr: time() - certificates_valid_date_seconds{} > 907200
        labels:
          severity: warning
        annotations:
          description: Renewal attempt for '{{$labels.label}}' certificate delayed for {{humanizeDuration $value}}
      # Certificate not actually renewed for ~67 days — nearing the
      # 3-month expiry, hence critical.
      - alert: CertificateNotRenewed
        expr: time() - certificates_updated_date_seconds{} > 5788800
        labels:
          severity: critical
        annotations:
          description: Certificate of '{{$labels.label}}' not renewed since {{humanizeDuration $value}}, expiry date is 3 months
      # Daily Elasticsearch snapshot missing: last snapshot older than 36h.
      - alert: ElasticsearchSnapshotNotPerformed
        expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 129600
        labels:
          severity: warning
        annotations:
          description: Snapshot for '{{$labels.repository}}' repository delayed for {{humanizeDuration $value}}
          summary: Scheduled Elasticsearch snapshot creation is not working
      # Snapshot retention/cleanup job broken: repository holds > 100 snapshots.
      - alert: ElasticsearchSnapshotCleanupNotPerformed
        expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 100
        labels:
          severity: warning
        annotations:
          description: Too many snapshots ({{humanize $value}}) found for '{{$labels.repository}}' repository
          summary: Scheduled Elasticsearch snapshot cleanup is not working
# deploy/rules/swarm_node.rules.dev.yml — reconstructed post-diff state.
# NOTE(review): the diff shows two hunks (@@ -3,36 +3,33 @@ and
# @@ -42,29 +39,26 @@); the file header above `rules:` and the
# expr/for of node_disk_fill_rate_6h fall in the hidden gap and must be
# confirmed against the repository.
    rules:
      # Per-node CPU usage > 90% for 5m (idle rate inverted, joined to
      # node_meta for the human-readable node name).
      - alert: node_cpu_usage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
          summary: CPU alert for Swarm node '{{$labels.node_name}}'
      # Per-node memory usage > 90% for 5m (MemTotal vs MemAvailable).
      - alert: node_memory_usage
        expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
          summary: Memory alert for Swarm node '{{$labels.node_name}}'
      # Root filesystem usage > 85% for 5m.
      - alert: node_disk_usage
        expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
          summary: Disk alert for Swarm node '{{$labels.node_name}}'
      # Root filesystem predicted to fill within 6h.
      - alert: node_disk_fill_rate_6h
        # NOTE(review): expr/for for this alert are in the hidden hunk gap
        # (original lines ~7-41) — restore from the repository; only the
        # labels/annotations below are visible in the diff.
        labels:
          severity: critical
        annotations:
          description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h
          summary: Disk fill alert for Swarm node '{{$labels.node_name}}'
      # Docker data mountpoint usage > 85% for 5m.
      - alert: SwarmNodeDockerDiskUsage
        expr: ((node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"} - node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          description: Swarm node {{$labels.node_name}} Docker mountpoint disk usage is at {{humanize $value}}%
          summary: Docker mountpoint disk alert for Swarm node '{{$labels.node_name}}'
      # Docker data mountpoint predicted (linear, 1h window) to hit zero
      # free bytes within 6h.
      - alert: SwarmNodeDockerDiskFillRate
        expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Swarm node {{$labels.node_name}} Docker mountpoint disk is going to fill up in 6h
          summary: Docker mountpoint disk fill alert for Swarm node '{{$labels.node_name}}'
# deploy/rules/swarm_node.rules.prod.yml — reconstructed post-diff state.
# Mirrors the dev rules but targets EBS-backed mountpoints
# (mountpoint=~".*/ebs/.*") instead of the Docker data mountpoint.
# NOTE(review): the diff shows two hunks (@@ -3,36 +3,33 @@ and
# @@ -42,29 +39,26 @@); the file header above `rules:` and the
# expr/for of node_disk_fill_rate_6h fall in the hidden gap and must be
# confirmed against the repository.
    rules:
      # Per-node CPU usage > 90% for 5m.
      - alert: node_cpu_usage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
          summary: CPU alert for Swarm node '{{$labels.node_name}}'
      # Per-node memory usage > 90% for 5m.
      - alert: node_memory_usage
        expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
          summary: Memory alert for Swarm node '{{$labels.node_name}}'
      # Root filesystem usage > 85% for 5m.
      - alert: node_disk_usage
        expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
          summary: Disk alert for Swarm node '{{$labels.node_name}}'
      # Root filesystem predicted to fill within 6h.
      - alert: node_disk_fill_rate_6h
        # NOTE(review): expr/for for this alert are in the hidden hunk gap
        # (original lines ~7-41) — restore from the repository; only the
        # labels/annotations below are visible in the diff.
        labels:
          severity: critical
        annotations:
          description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h
          summary: Disk fill alert for Swarm node '{{$labels.node_name}}'
      # Any EBS-backed mountpoint usage > 85% for 5m.
      - alert: SwarmNodeEbsDiskUsage
        expr: ((node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"} - node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}) * 100 / node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' usage is at {{humanize $value}}%
          summary: EBS disk alert for Swarm node '{{$labels.node_name}}'
      # EBS mountpoint predicted (linear, 1h window) to hit zero free
      # bytes within 6h.
      - alert: SwarmNodeEbsDiskFillRate
        expr: predict_linear(node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' is going to fill up in 6h
          summary: EBS disk fill alert for Swarm node '{{$labels.node_name}}'