Commit 3ad408a4 authored by Pedro Eduardo Trujillo's avatar Pedro Eduardo Trujillo
Browse files

Merge branch 'dev' into 'master'

Dev

See merge request redmic-project/metric/prometheus!19
parents 602ab6d2 876aa7d2
Loading
Loading
Loading
Loading

LICENSE

0 → 100644
+21 −0
Original line number Diff line number Diff line
MIT License

Copyright (c) 2019 REDMIC Project / Metric

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+11 −12
Original line number Diff line number Diff line
@@ -5,39 +5,38 @@ groups:
    expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
      / elasticsearch_filesystem_data_size_bytes

  - record: elasticsearch_filesystem_data_free_percent
    expr: 100 - elasticsearch_filesystem_data_used_percent

  - alert: ElasticsearchTooFewNodesRunning
    expr: elasticsearch_cluster_health_number_of_nodes < 3
    for: 5m
    labels:
      severity: critical
    annotations:
      description: There are only {{$value}} (< 3) Elasticsearch nodes running
      description: There are only {{$value}} Elasticsearch nodes running in cluster '{{$labels.cluster}}'
      summary: Elasticsearch running on less than 3 nodes

  - alert: ElasticsearchHeapTooHigh
    expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9
    expr: 100 * (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) > 80
    for: 15m
    labels:
      severity: critical
    annotations:
      description: The heap usage is over 90% for 15m in node '{{$labels.name}}'
      summary: Elasticsearch node {{$labels.name}} heap usage is too high
      description: The heap usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} heap usage is too high (> 80%)

  - alert: ElasticsearchHighFilesystemDataUsedPercent
    expr: elasticsearch_filesystem_data_used_percent > 80
    for: 1m
    labels:
      severity: critical
    annotations:
      description: Filesystem usage is over 80% ({{$value}}%) in node '{{$labels.name}}'
      summary: Elasticsearch filesystem usage is too high
      description: Filesystem usage is at {{humanize $value}}% in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} filesystem usage is too high (> 80%)

  - alert: ElasticsearchGarbageCollectionTooSlow
    expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 1
    expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 0.5
    for: 1m
    labels:
      severity: critical
    annotations:
      description: GC is taking more than 1 second to complete in node '{{$labels.name}}' ({{$labels.gc}} - {{$value}}s)
      summary: Elasticsearch memory usage is too heavy and GC is taking too much time
      description: GC ({{$labels.gc}} space) is taking {{humanizeDuration $value}} to complete in node '{{$labels.name}}' from cluster '{{$labels.cluster}}'
      summary: Elasticsearch node {{$labels.name}} memory usage is too heavy and GC is taking too much time (> 500 ms)
+24 −6
Original line number Diff line number Diff line
groups:
- name: kafka
  rules:
  - alert: KafkaConsumerLagTooHigh
  - alert: KafkaConsumerHighLag
    expr: kafka_consumergroup_lag_sum > 1000
    for: 5m
    labels:
      severity: critical
      severity: warning
    annotations:
      description: Consumer '{{$labels.consumergroup}}' has {{$value}} pending messages (> 1000) from '{{$labels.topic}}' topic
      summary: Kafka consumer '{{$labels.consumergroup}}' is not consuming enough messages from '{{$labels.topic}}'
      description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
      summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 1K)

  - alert: KafkaConsumerLagIncreasing
    expr: delta(kafka_consumergroup_lag_sum[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 5 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
      summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'

  - alert: KafkaConsumerTooHighLag
    expr: kafka_consumergroup_lag_sum > 10000
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Consumer group '{{$labels.consumergroup}}' has {{humanize $value}} pending messages from '{{$labels.topic}}' topic
      summary: Kafka consumers from '{{$labels.consumergroup}}' group are not consuming enough messages from '{{$labels.topic}}' (lag > 10K)

  - alert: KafkaConsumerLagIncreasingTooMuch
    expr: delta(kafka_consumergroup_lag_sum[15m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Consumer '{{$labels.consumergroup}}' lag increased in last 5 minutes by {{$value}} for '{{$labels.topic}}' topic
      summary: Kafka consumer '{{$labels.consumergroup}}' consuming ratio is not enough for '{{$labels.topic}}'
      description: Lag of consumer group '{{$labels.consumergroup}}' increased in last 15 minutes by {{humanize $value}} for '{{$labels.topic}}' topic
      summary: Consuming ratio of Kafka consumer group '{{$labels.consumergroup}}' is not enough for '{{$labels.topic}}'
+15 −18
Original line number Diff line number Diff line
groups:
- name: scheduled_job
  rules:
  - alert: backup_db_not_created
    expr: time() - backup_created_date_seconds{} > 86400
  - alert: BackupDatabaseNotCreated
    expr: time() - backup_created_date_seconds{} > 129600
    labels:
      severity: warning
    annotations:
      description: Error creating backup for '{{ $labels.label }}' database.
      summary: Error creating backup for '{{ $labels.label }}' database.
      description: Backup for '{{$labels.label}}' database delayed for {{humanizeDuration $value}}

  - alert: certificate_renew_not_attempted
    expr: time() - certificates_valid_date_seconds{} > 604800
  - alert: CertificateRenewalNotAttempted
    expr: time() - certificates_valid_date_seconds{} > 907200
    labels:
      severity: warning
    annotations:
      description: Error attempting to renew '{{ $labels.label }}' certificate.
      summary: Error attempting to renew '{{ $labels.label }}' certificate.
      description: Renewal attempt for '{{$labels.label}}' certificate delayed for {{humanizeDuration $value}}

  - alert: certificate_not_renewed
  - alert: CertificateNotRenewed
    expr: time() - certificates_updated_date_seconds{} > 5788800
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Error, '{{ $labels.label }}' certificate near expiry.
      summary: Error, '{{ $labels.label }}' certificate near expiry.
      description: Certificate of '{{$labels.label}}' not renewed since {{humanizeDuration $value}}, expiry date is 3 months

  - alert: ElasticsearchSnapshotNotPerformed
    expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 86400
    expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 129600
    labels:
      severity: warning
    annotations:
      description: Last snapshot for '{{ $labels.repository }}' repository is too old.
      summary: Scheduled Elasticsearch snapshot is not working.
      description: Snapshot for '{{$labels.repository}}' repository delayed for {{humanizeDuration $value}}
      summary: Scheduled Elasticsearch snapshot creation is not working

  - alert: ElasticsearchSnapshotCleanupNotPerformed
    expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 64
    expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 100
    labels:
      severity: warning
    annotations:
      description: Too many snapshots ({{$value}}) found for '{{ $labels.repository }}' repository.
      summary: Scheduled Elasticsearch snapshot cleanup is not working.
      description: Too many snapshots ({{humanize $value}}) found for '{{$labels.repository}}' repository
      summary: Scheduled Elasticsearch snapshot cleanup is not working
+22 −28
Original line number Diff line number Diff line
@@ -3,36 +3,33 @@ groups:
  rules:
  - alert: node_cpu_usage
    expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
      node_meta * 100) BY (node_name)) > 80
    for: 1m
      node_meta * 100) BY (node_name)) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} CPU usage is at {{humanize $value}}%
      summary: CPU alert for Swarm node '{{$labels.node_name}}'

  - alert: node_memory_usage
    expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
    for: 1m
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} memory usage is at {{humanize $value}}%
      summary: Memory alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    for: 5m
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
        $value}}%.
      description: Swarm node {{$labels.node_name}} disk usage is at {{humanize $value}}%
      summary: Disk alert for Swarm node '{{$labels.node_name}}'

  - alert: node_disk_fill_rate_6h
@@ -42,29 +39,26 @@ groups:
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk is going to fill up in
        6h.
      description: Swarm node {{$labels.node_name}} disk is going to fill up in 6h
      summary: Disk fill alert for Swarm node '{{$labels.node_name}}'

  - alert: node_docker_disk_usage
  - alert: SwarmNodeDockerDiskUsage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"} - node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"}) * ON(instance) GROUP_LEFT(node_name)
      node_meta > 85
    for: 1m
    for: 5m
    labels:
      severity: warning
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) usage is at {{ humanize
        $value}}%.
      summary: Disk (Docker mountpoint) alert for Swarm node '{{ $labels.node_name }}'
      description: Swarm node {{$labels.node_name}} Docker mountpoint disk usage is at {{humanize $value}}%
      summary: Docker mountpoint disk alert for Swarm node '{{$labels.node_name}}'

  - alert: node_docker_disk_fill_rate_6h
  - alert: SwarmNodeDockerDiskFillRate
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
    for: 1h
    labels:
      severity: critical
    annotations:
      description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) is going to fill up in
        6h.
      summary: Disk (Docker mountpoint) fill alert for Swarm node '{{ $labels.node_name }}'
      description: Swarm node {{$labels.node_name}} Docker mountpoint disk is going to fill up in 6h
      summary: Docker mountpoint disk fill alert for Swarm node '{{$labels.node_name}}'
Loading