Loading deploy/docker-compose.tmpl.yml +18 −12 Original line number Diff line number Diff line Loading @@ -12,11 +12,11 @@ services: - '--storage.tsdb.retention.time=${STORAGE_TSDB_RETENTION_TIME}' - '--storage.tsdb.retention.size=${STORAGE_TSDB_RETENTION_SIZE}' user: '0:0' environment: - JOBS networks: - metric-net - traefik-net environment: - JOBS volumes: - prometheus:/prometheus configs: Loading @@ -27,12 +27,14 @@ services: mode: 0744 - source: node_rules target: /etc/prometheus/swarm_node.rules.yml - source: task_rules target: /etc/prometheus/swarm_task.rules.yml - source: job_rules target: /etc/prometheus/swarm_job.rules.yml - source: service_rules target: /etc/prometheus/swarm_service.rules.yml - source: task_rules target: /etc/prometheus/swarm_task.rules.yml - source: scheduled_job_rules target: /etc/prometheus/scheduled_job.rules.yml - source: elasticsearch_rules target: /etc/prometheus/elasticsearch.rules.yml healthcheck: test: wget --spider -q http://localhost:9090 interval: 30s Loading Loading @@ -70,14 +72,18 @@ configs: name: ${PROMETHEUS_ENTRYPOINT_NAME:-prometheus-entrypoint} file: ./conf/entrypoint.sh service_rules: name: ${SERVICE_RULES_NAME:-service_rules} file: ./rules/swarm_service.rules.yml task_rules: name: ${TASK_RULES_NAME:-task_rules} file: ./rules/swarm_task.rules.yml job_rules: name: ${JOB_RULES_NAME:-job_rules} file: ./rules/swarm_job.rules.yml scheduled_job_rules: name: ${SCHEDULED_JOB_RULES_NAME:-scheduled_job_rules} file: ./rules/scheduled_job.rules.yml service_rules: name: ${SERVICE_RULES_NAME:-service_rules} file: ./rules/swarm_service.rules.yml elasticsearch_rules: name: ${ELASTICSEARCH_RULES_NAME:-elasticsearch_rules} file: ./rules/elasticsearch.rules.yml deploy/rules/elasticsearch.rules.yml 0 → 100644 +43 −0 Original line number Diff line number Diff line groups: - name: elasticsearch rules: - record: elasticsearch_filesystem_data_used_percent expr: 100 * (elasticsearch_filesystem_data_size_bytes - 
elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes - record: elasticsearch_filesystem_data_free_percent expr: 100 - elasticsearch_filesystem_data_used_percent - alert: ElasticsearchTooFewNodesRunning expr: elasticsearch_cluster_health_number_of_nodes < 3 for: 5m labels: severity: critical annotations: description: There are only {{$value}} (< 3) Elasticsearch nodes running summary: Elasticsearch running on less than 3 nodes - alert: ElasticsearchHeapTooHigh expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9 for: 15m labels: severity: critical annotations: description: The heap usage is over 90% for 15m in node '{{$labels.name}}' summary: Elasticsearch node {{$labels.name}} heap usage is too high - alert: ElasticsearchHighFilesystemDataUsedPercent expr: elasticsearch_filesystem_data_used_percent > 80 labels: severity: critical annotations: description: Filesystem usage is over 80% ({{$value}}%) in node '{{$labels.name}}' summary: Elasticsearch filesystem usage is too high - alert: ElasticsearchGarbageCollectionTooSlow expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 1 labels: severity: critical annotations: description: GC is taking more than 1 second to complete in node '{{$labels.name}}' ({{$labels.gc}} - {{$value}}s) summary: Elasticsearch memory usage is too heavy and GC is taking too much time deploy/rules/swarm_job.rules.yml→deploy/rules/scheduled_job.rules.yml +17 −1 Original line number Diff line number Diff line groups: - name: swarm_job - name: scheduled_job rules: - alert: backup_db_not_created expr: time() - backup_created_date_seconds{} > 86400 Loading @@ -24,3 +24,19 @@ groups: annotations: description: Error, '{{ $labels.label }}' certificate near expiry. summary: Error, '{{ $labels.label }}' certificate near expiry. 
- alert: ElasticsearchSnapshotNotPerformed expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 86400 labels: severity: warning annotations: description: Last snapshot for '{{ $labels.repository }}' repository is too old. summary: Scheduled Elasticsearch snapshot is not working. - alert: ElasticsearchSnapshotCleanupNotPerformed expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 64 labels: severity: warning annotations: description: Too many snapshots ({{$value}}) found for '{{ $labels.repository }}' repository. summary: Scheduled Elasticsearch snapshot cleanup is not working. deploy/rules/swarm_node.rules.dev.yml +5 −0 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ groups: description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%. summary: CPU alert for Swarm node '{{ $labels.node_name }}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 Loading @@ -21,6 +22,7 @@ groups: description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) Loading @@ -32,6 +34,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 Loading @@ -42,6 +45,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. 
summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"} - node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"}) * ON(instance) GROUP_LEFT(node_name) Loading @@ -53,6 +57,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) usage is at {{ humanize $value}}%. summary: Disk (Docker mountpoint) alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 Loading deploy/rules/swarm_node.rules.prod.yml +5 −0 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ groups: description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%. summary: CPU alert for Swarm node '{{ $labels.node_name }}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 Loading @@ -21,6 +22,7 @@ groups: description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) Loading @@ -32,6 +34,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. 
summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 Loading @@ -42,6 +45,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' - alert: node_ebs_disk_usage expr: ((node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"} - node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}) * 100 / node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"}) * ON(instance) GROUP_LEFT(node_name) Loading @@ -53,6 +57,7 @@ groups: description: Swarm node {{ $labels.node_name }} EBS disk {{ $labels.mountpoint }} usage is at {{ humanize $value}}%. summary: EBS disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_ebs_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 Loading Loading
deploy/docker-compose.tmpl.yml +18 −12 Original line number Diff line number Diff line Loading @@ -12,11 +12,11 @@ services: - '--storage.tsdb.retention.time=${STORAGE_TSDB_RETENTION_TIME}' - '--storage.tsdb.retention.size=${STORAGE_TSDB_RETENTION_SIZE}' user: '0:0' environment: - JOBS networks: - metric-net - traefik-net environment: - JOBS volumes: - prometheus:/prometheus configs: Loading @@ -27,12 +27,14 @@ services: mode: 0744 - source: node_rules target: /etc/prometheus/swarm_node.rules.yml - source: task_rules target: /etc/prometheus/swarm_task.rules.yml - source: job_rules target: /etc/prometheus/swarm_job.rules.yml - source: service_rules target: /etc/prometheus/swarm_service.rules.yml - source: task_rules target: /etc/prometheus/swarm_task.rules.yml - source: scheduled_job_rules target: /etc/prometheus/scheduled_job.rules.yml - source: elasticsearch_rules target: /etc/prometheus/elasticsearch.rules.yml healthcheck: test: wget --spider -q http://localhost:9090 interval: 30s Loading Loading @@ -70,14 +72,18 @@ configs: name: ${PROMETHEUS_ENTRYPOINT_NAME:-prometheus-entrypoint} file: ./conf/entrypoint.sh service_rules: name: ${SERVICE_RULES_NAME:-service_rules} file: ./rules/swarm_service.rules.yml task_rules: name: ${TASK_RULES_NAME:-task_rules} file: ./rules/swarm_task.rules.yml job_rules: name: ${JOB_RULES_NAME:-job_rules} file: ./rules/swarm_job.rules.yml scheduled_job_rules: name: ${SCHEDULED_JOB_RULES_NAME:-scheduled_job_rules} file: ./rules/scheduled_job.rules.yml service_rules: name: ${SERVICE_RULES_NAME:-service_rules} file: ./rules/swarm_service.rules.yml elasticsearch_rules: name: ${ELASTICSEARCH_RULES_NAME:-elasticsearch_rules} file: ./rules/elasticsearch.rules.yml
# deploy/rules/elasticsearch.rules.yml
#
# Prometheus recording and alerting rules for an Elasticsearch cluster.
# NOTE(review): the elasticsearch_* series are presumably exported by the
# Prometheus elasticsearch_exporter -- confirm against the scrape config.
groups:
  - name: elasticsearch
    rules:
      # Share of the data filesystem that is in use, as a percentage (0-100).
      - record: elasticsearch_filesystem_data_used_percent
        expr: >-
          100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
          / elasticsearch_filesystem_data_size_bytes

      # Complement of the series recorded above. Rules in a group are evaluated
      # in order, so this rule must stay after the "used" rule it reads from.
      - record: elasticsearch_filesystem_data_free_percent
        expr: 100 - elasticsearch_filesystem_data_used_percent

      # Fires when the cluster reports fewer than 3 nodes for 5 minutes.
      - alert: ElasticsearchTooFewNodesRunning
        expr: elasticsearch_cluster_health_number_of_nodes < 3
        for: 5m
        labels:
          severity: critical
        annotations:
          description: "There are only {{$value}} (< 3) Elasticsearch nodes running"
          summary: "Elasticsearch running on less than 3 nodes"

      # Fires when used/max JVM heap stays above 0.9 for 15 minutes.
      - alert: ElasticsearchHeapTooHigh
        expr: >-
          elasticsearch_jvm_memory_used_bytes{area="heap"}
          / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9
        for: 15m
        labels:
          severity: critical
        annotations:
          description: "The heap usage is over 90% for 15m in node '{{$labels.name}}'"
          summary: "Elasticsearch node {{$labels.name}} heap usage is too high"

      # Fires as soon as the recorded used-percent series exceeds 80
      # (no "for" clause, so a single evaluation is enough).
      - alert: ElasticsearchHighFilesystemDataUsedPercent
        expr: elasticsearch_filesystem_data_used_percent > 80
        labels:
          severity: critical
        annotations:
          description: "Filesystem usage is over 80% ({{$value}}%) in node '{{$labels.name}}'"
          summary: "Elasticsearch filesystem usage is too high"

      # Average duration of one garbage collection over the last 5 minutes:
      # GC seconds spent divided by the number of collections. The annotation
      # reports "{{$value}}s" per collection, so the expression has to yield
      # seconds per collection; a bare per-second rate of the seconds counter
      # would yield the fraction of wall-clock time spent in GC instead.
      # rate() is used rather than irate() because irate() only looks at the
      # last two samples and is meant for graphing, not alerting.
      # When no collection happened in the window the division is NaN, which
      # never satisfies "> 1", so idle collectors do not fire.
      - alert: ElasticsearchGarbageCollectionTooSlow
        expr: >-
          rate(elasticsearch_jvm_gc_collection_seconds_sum[5m])
          / rate(elasticsearch_jvm_gc_collection_seconds_count[5m]) > 1
        labels:
          severity: critical
        annotations:
          description: "GC is taking more than 1 second to complete in node '{{$labels.name}}' ({{$labels.gc}} - {{$value}}s)"
          summary: "Elasticsearch memory usage is too heavy and GC is taking too much time"
deploy/rules/swarm_job.rules.yml→deploy/rules/scheduled_job.rules.yml +17 −1 Original line number Diff line number Diff line groups: - name: swarm_job - name: scheduled_job rules: - alert: backup_db_not_created expr: time() - backup_created_date_seconds{} > 86400 Loading @@ -24,3 +24,19 @@ groups: annotations: description: Error, '{{ $labels.label }}' certificate near expiry. summary: Error, '{{ $labels.label }}' certificate near expiry. - alert: ElasticsearchSnapshotNotPerformed expr: time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 86400 labels: severity: warning annotations: description: Last snapshot for '{{ $labels.repository }}' repository is too old. summary: Scheduled Elasticsearch snapshot is not working. - alert: ElasticsearchSnapshotCleanupNotPerformed expr: elasticsearch_snapshot_stats_number_of_snapshots{} > 64 labels: severity: warning annotations: description: Too many snapshots ({{$value}}) found for '{{ $labels.repository }}' repository. summary: Scheduled Elasticsearch snapshot cleanup is not working.
deploy/rules/swarm_node.rules.dev.yml +5 −0 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ groups: description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%. summary: CPU alert for Swarm node '{{ $labels.node_name }}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 Loading @@ -21,6 +22,7 @@ groups: description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) Loading @@ -32,6 +34,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 Loading @@ -42,6 +45,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"} - node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"}) * ON(instance) GROUP_LEFT(node_name) Loading @@ -53,6 +57,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) usage is at {{ humanize $value}}%. 
summary: Disk (Docker mountpoint) alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 Loading
deploy/rules/swarm_node.rules.prod.yml +5 −0 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ groups: description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%. summary: CPU alert for Swarm node '{{ $labels.node_name }}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 Loading @@ -21,6 +22,7 @@ groups: description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) Loading @@ -32,6 +34,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 Loading @@ -42,6 +45,7 @@ groups: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' - alert: node_ebs_disk_usage expr: ((node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"} - node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}) * 100 / node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"}) * ON(instance) GROUP_LEFT(node_name) Loading @@ -53,6 +57,7 @@ groups: description: Swarm node {{ $labels.node_name }} EBS disk {{ $labels.mountpoint }} usage is at {{ humanize $value}}%. 
summary: EBS disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_ebs_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 Loading