Actualiza y refina todas las reglas de alerta (3db64a77) · Commits · REDMIC Project / Metric / Prometheus

deploy/rules/elasticsearch.rules.yml

+11 −12

Original line number	Diff line number	Diff line
		@@ -5,39 +5,38 @@ groups:
		expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
		/ elasticsearch_filesystem_data_size_bytes

		- record: elasticsearch_filesystem_data_free_percent
		expr: 100 - elasticsearch_filesystem_data_used_percent

		- alert: ElasticsearchTooFewNodesRunning
		expr: elasticsearch_cluster_health_number_of_nodes < 3
		for: 5m
		labels:
		severity: critical
		annotations:
		description: There are only {{$value}} (< 3) Elasticsearch nodes running
		description: There are only {{$value}} Elasticsearch nodes running in cluster '{{$labels.cluster}}'
		summary: Elasticsearch running on less than 3 nodes

		- alert: ElasticsearchHeapTooHigh
		expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9
		expr: 100 * (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) > 80
		for: 15m
		labels:
		severity: critical
		annotations:
		description: The heap usage is over 90% for 15m in node '{{$labels.name}}'
		summary: Elasticsearch node {{$labels.name}} heap usage is too high
		description: The heap usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
		summary: Elasticsearch node {{$labels.name}} heap usage is too high (> 80%)

		- alert: ElasticsearchHighFilesystemDataUsedPercent
		expr: elasticsearch_filesystem_data_used_percent > 80
		for: 1m
		labels:
		severity: critical
		annotations:
		description: Filesystem usage is over 80% ({{$value}}%) in node '{{$labels.name}}'
		summary: Elasticsearch filesystem usage is too high
		description: Filesystem usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
		summary: Elasticsearch node {{$labels.name}} filesystem usage is too high (> 80%)

		- alert: ElasticsearchGarbageCollectionTooSlow
		expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 1
		expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 0.5
		for: 1m
		labels:
		severity: critical
		annotations:
		description: GC is taking more than 1 second to complete in node '{{$labels.name}}' ({{$labels.gc}} - {{$value}}s)
		summary: Elasticsearch memory usage is too heavy and GC is taking too much time
		description: GC ({{$labels.gc}} space) is taking {{humanizeDuration $value}} to complete in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
		summary: Elasticsearch node {{$labels.name}} memory usage is too heavy and GC is taking too much time (> 500 ms)

deploy/rules/kafka.rules.yml

+24 −6

Original line number	Diff line number	Diff line
		groups:
		- name: kafka
		rules:
		- alert: KafkaConsumerLagTooHigh
		- alert: KafkaConsumerHighLag
		expr: kafka_consumergroup_lag_sum > 1000
		for: 5m
		labels:
		severity: critical
		severity: warning
		annotations:
		description: Consumer '{{$labels.consumergroup}}' has {{$value}} pending messages (> 1000) from '{{$labels.topic}}' topic
		summary: Kafka consumer '{{$labels.consumergroup}}' is not consuming enough messages from '{{$labels.topic}}'
		description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
		summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 1K)

		- alert: KafkaConsumerLagIncreasing
		expr: delta(kafka_consumergroup_lag_sum[5m]) > 0
		for: 5m
		labels:
		severity: warning
		annotations:
		description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 5 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
		summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'

		- alert: KafkaConsumerTooHighLag
		expr: kafka_consumergroup_lag_sum > 10000
		for: 5m
		labels:
		severity: critical
		annotations:
		description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
		summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 10K)

		- alert: KafkaConsumerLagIncreasingTooMuch
		expr: delta(kafka_consumergroup_lag_sum[15m]) > 0
		for: 5m
		labels:
		severity: critical
		annotations:
		description: Consumer '{{$labels.consumergroup}}' lag increased in last 5 minutes by {{$value}} for '{{$labels.topic}}' topic
		summary: Kafka consumer '{{$labels.consumergroup}}' consuming ratio is not enough for '{{$labels.topic}}'
		description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 15 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
		summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'

deploy/rules/scheduled_job.rules.yml

+15 −18

Original line number	Diff line number	Diff line
		groups:
		- name: scheduled_job
		rules:
		- alert: backup_db_not_created
		expr: time() - backup_created_date_seconds{} > 86400
		- alert: BackupDatabaseNotCreated
		expr: time() - backup_created_date_seconds{} > 129600
		labels:
		severity: warning
		annotations:
		description: Error creating backup for '{{ $labels.label }}' database.
		summary: Error creating backup for '{{ $labels.label }}' database.
		description: Backup for '{{$labels.label}}' database delayed for {{humanizeDuration $value}}

		- alert: certificate_renew_not_attempted
		expr: time() - certificates_valid_date_seconds{} > 604800
		- alert: CertificateRenewalNotAttempted
		expr: time() - certificates_valid_date_seconds{} > 907200
		labels:
		severity: warning
		annotations:
		description: Error attemping to renew '{{ $labels.label }}' certificate.
		summary: Error attemping to renew '{{ $labels.label }}' certificate.
		description: Renewal attempt for '{{$labels.label}}' certificate delayed for {{humanizeDuration $value}}

		- alert: certificate_not_renewed
		- alert: CertificateNotRenewed
		expr: time() - certificates_updated_date_seconds{} > 5788800
		labels:
		severity: warning
		severity: critical
		annotations:
		description: Error, '{{ $labels.label }}' certificate near expiry.
		summary: Error, '{{ $labels.label }}' certificate near expiry.
		description: Certificate of '{{$labels.label}}' not renewed since {{humanizeDuration $value}}, expiry date is 3 months

		- alert: ElasticsearchSnapshotNotPerformed
		expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 86400
		expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 129600
		labels:
		severity: warning
		annotations:
		description: Last snapshot for '{{ $labels.repository }}' repository is too old.
		summary: Scheduled Elasticsearch snapshot is not working.
		description: Snapshot for '{{$labels.repository}}' repository delayed for {{humanizeDuration $value}}
		summary: Scheduled Elasticsearch snapshot creation is not working

		- alert: ElasticsearchSnapshotCleanupNotPerformed
		expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 64
		expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 100
		labels:
		severity: warning
		annotations:
		description: Too many snapshots ({{$value}}) found for '{{ $labels.repository }}' repository.
		summary: Scheduled Elasticsearch snapshot cleanup is not working.
		description: Too many snapshots ({{humanize $value}}) found for '{{$labels.repository}}' repository
		summary: Scheduled Elasticsearch snapshot cleanup is not working

deploy/rules/swarm_node.rules.dev.yml

+22 −28

Original line number	Diff line number	Diff line
		@@ -3,36 +3,33 @@ groups:
		rules:
		- alert: node_cpu_usage
		expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
		node_meta * 100) BY (node_name)) > 80
		for: 1m
		node_meta * 100) BY (node_name)) > 90
		for: 5m
		labels:
		severity: warning
		annotations:
		description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
		$value}}%.
		description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
		summary: CPU alert for Swarm node '{{$labels.node_name}}'

		- alert: node_memory_usage
		expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
		* ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
		for: 1m
		* ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90
		for: 5m
		labels:
		severity: warning
		annotations:
		description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
		$value}}%.
		description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
		summary: Memory alert for Swarm node '{{$labels.node_name}}'

		- alert: node_disk_usage
		expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
		* 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
		node_meta > 85
		for: 1m
		for: 5m
		labels:
		severity: warning
		severity: critical
		annotations:
		description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
		$value}}%.
		description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
		summary: Disk alert for Swarm node '{{$labels.node_name}}'

		- alert: node_disk_fill_rate_6h
		@@ -42,29 +39,26 @@ groups:
		labels:
		severity: critical
		annotations:
		description: Swarm node {{ $labels.node_name }} disk is going to fill up in
		6h.
		description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h
		summary: Disk fill alert for Swarm node '{{$labels.node_name}}'

		- alert: node_docker_disk_usage
		- alert: SwarmNodeDockerDiskUsage
		expr: ((node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"} - node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"})
		* 100 / node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"}) * ON(instance) GROUP_LEFT(node_name)
		node_meta > 85
		for: 1m
		for: 5m
		labels:
		severity: warning
		severity: critical
		annotations:
		description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) usage is at {{ humanize
		$value}}%.
		summary: Disk (Docker mountpoint) alert for Swarm node '{{ $labels.node_name }}'
		description: Swarm node {{$labels.node_name}} Docker mountpoint disk usage is at {{humanize $value}}%
		summary: Docker mountpoint disk alert for Swarm node '{{$labels.node_name}}'

		- alert: node_docker_disk_fill_rate_6h
		- alert: SwarmNodeDockerDiskFillRate
		expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}[1h], 6 * 3600) * ON(instance)
		GROUP_LEFT(node_name) node_meta < 0
		for: 1h
		labels:
		severity: critical
		annotations:
		description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) is going to fill up in
		6h.
		summary: Disk (Docker mountpoint) fill alert for Swarm node '{{ $labels.node_name }}'
		description: Swarm node {{$labels.node_name}} Docker mountpoint disk is going to fill up in 6h
		summary: Docker mountpoint disk fill alert for Swarm node '{{$labels.node_name}}'

deploy/rules/swarm_node.rules.prod.yml

+22 −28

Original line number	Diff line number	Diff line
		@@ -3,36 +3,33 @@ groups:
		rules:
		- alert: node_cpu_usage
		expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
		node_meta * 100) BY (node_name)) > 80
		for: 1m
		node_meta * 100) BY (node_name)) > 90
		for: 5m
		labels:
		severity: warning
		annotations:
		description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
		$value}}%.
		description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
		summary: CPU alert for Swarm node '{{$labels.node_name}}'

		- alert: node_memory_usage
		expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
		* ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
		for: 1m
		* ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90
		for: 5m
		labels:
		severity: warning
		annotations:
		description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
		$value}}%.
		description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
		summary: Memory alert for Swarm node '{{$labels.node_name}}'

		- alert: node_disk_usage
		expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
		* 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
		node_meta > 85
		for: 1m
		for: 5m
		labels:
		severity: warning
		severity: critical
		annotations:
		description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
		$value}}%.
		description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
		summary: Disk alert for Swarm node '{{$labels.node_name}}'

		- alert: node_disk_fill_rate_6h
		@@ -42,29 +39,26 @@ groups:
		labels:
		severity: critical
		annotations:
		description: Swarm node {{ $labels.node_name }} disk is going to fill up in
		6h.
		description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h
		summary: Disk fill alert for Swarm node '{{$labels.node_name}}'

		- alert: node_ebs_disk_usage
		- alert: SwarmNodeEbsDiskUsage
		expr: ((node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"} - node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"})
		* 100 / node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"}) * ON(instance) GROUP_LEFT(node_name)
		node_meta > 85
		for: 1m
		for: 5m
		labels:
		severity: warning
		severity: critical
		annotations:
		description: Swarm node {{ $labels.node_name }} EBS disk {{ $labels.mountpoint }} usage is at {{ humanize
		$value}}%.
		description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' usage is at {{humanize $value}}%
		summary: EBS disk alert for Swarm node '{{$labels.node_name}}'

		- alert: node_ebs_disk_fill_rate_6h
		- alert: SwarmNodeEbsDiskFillRate
		expr: predict_linear(node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}[1h], 6 * 3600) * ON(instance)
		GROUP_LEFT(node_name) node_meta < 0
		for: 1h
		labels:
		severity: critical
		annotations:
		description: Swarm node {{ $labels.node_name }} EBS disk {{ $labels.mountpoint }} is going to fill up in
		6h.
		description: Swarm node {{$labels.node_name}} EBS disk '{{$labels.mountpoint}}' is going to fill up in 6h
		summary: EBS disk fill alert for Swarm node '{{$labels.node_name}}'