Commit b987e500 authored by Pedro Eduardo Trujillo
Browse files

Reordena reglas y agrega nuevas para Elasticsearch

parent 4f1d4b34
Loading
Loading
Loading
Loading
+18 −12
Original line number Diff line number Diff line
@@ -12,11 +12,11 @@ services:
      - '--storage.tsdb.retention.time=${STORAGE_TSDB_RETENTION_TIME}'
      - '--storage.tsdb.retention.size=${STORAGE_TSDB_RETENTION_SIZE}'
    user: '0:0'
    environment:
      - JOBS
    networks:
      - metric-net
      - traefik-net
    environment:
      - JOBS
    volumes:
      - prometheus:/prometheus
    configs:
@@ -27,12 +27,14 @@ services:
        mode: 0744
      - source: node_rules
        target: /etc/prometheus/swarm_node.rules.yml
      - source: task_rules
        target: /etc/prometheus/swarm_task.rules.yml
      - source: job_rules
        target: /etc/prometheus/swarm_job.rules.yml
      - source: service_rules
        target: /etc/prometheus/swarm_service.rules.yml
      - source: task_rules
        target: /etc/prometheus/swarm_task.rules.yml
      - source: scheduled_job_rules
        target: /etc/prometheus/scheduled_job.rules.yml
      - source: elasticsearch_rules
        target: /etc/prometheus/elasticsearch.rules.yml
    healthcheck:
      test: wget --spider -q http://localhost:9090
      interval: 30s
@@ -70,14 +72,18 @@ configs:
    name: ${PROMETHEUS_ENTRYPOINT_NAME:-prometheus-entrypoint}
    file: ./conf/entrypoint.sh

  service_rules:
    name: ${SERVICE_RULES_NAME:-service_rules}
    file: ./rules/swarm_service.rules.yml

  task_rules:
    name: ${TASK_RULES_NAME:-task_rules}
    file: ./rules/swarm_task.rules.yml

  job_rules:
    name: ${JOB_RULES_NAME:-job_rules}
    file: ./rules/swarm_job.rules.yml
  scheduled_job_rules:
    name: ${SCHEDULED_JOB_RULES_NAME:-scheduled_job_rules}
    file: ./rules/scheduled_job.rules.yml

  service_rules:
    name: ${SERVICE_RULES_NAME:-service_rules}
    file: ./rules/swarm_service.rules.yml
  elasticsearch_rules:
    name: ${ELASTICSEARCH_RULES_NAME:-elasticsearch_rules}
    file: ./rules/elasticsearch.rules.yml
+43 −0
Original line number Diff line number Diff line
# Recording and alerting rules for Elasticsearch cluster, JVM and filesystem metrics.
# NOTE(review): metric and label names look like those exposed by
# elasticsearch_exporter — confirm against the exporter actually deployed.
groups:
- name: elasticsearch
  rules:
  # Share of the data filesystem in use, scaled to 0-100 (not 0-1).
  - record: elasticsearch_filesystem_data_used_percent
    expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
      / elasticsearch_filesystem_data_size_bytes

  # Complement of the rule above, also on a 0-100 scale. It reads the recorded
  # series, so it must stay after the "used" rule within this group.
  - record: elasticsearch_filesystem_data_free_percent
    expr: 100 - elasticsearch_filesystem_data_used_percent

  # Fewer than 3 nodes reported by cluster health for 5 minutes.
  - alert: ElasticsearchTooFewNodesRunning
    expr: elasticsearch_cluster_health_number_of_nodes < 3
    for: 5m
    labels:
      severity: critical
    annotations:
      description: There are only {{$value}} < 3 Elasticsearch nodes running
      summary: Elasticsearch running on less than 3 nodes

  # Heap used/max ratio (0-1 scale) above 0.9 for 15 minutes.
  - alert: ElasticsearchHeapTooHigh
    expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9
    for: 15m
    labels:
      severity: critical
    annotations:
      description: The heap usage is over 90% for 15m
      # Uses the "name" label, like every other alert in this group.
      summary: Elasticsearch node {{$labels.name}} heap usage is high

  # The recorded series is a percentage (0-100), so the 80% threshold is 80;
  # comparing against 0.8 would fire at 0.8% usage.
  - alert: ElasticsearchHighFilesystemDataUsedPercent
    expr: elasticsearch_filesystem_data_used_percent > 80
    labels:
      severity: critical
    annotations:
      description: Filesystem usage is over 80% in node '{{$labels.name}}'
      summary: Elasticsearch filesystem usage is too high

  # NOTE(review): irate() over a *_seconds_sum counter yields seconds spent in
  # GC per second of wall-clock time, not the duration of a single collection
  # as the description states — confirm which of the two is intended.
  - alert: ElasticsearchGarbageCollectionTooSlow
    expr: irate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m]) > 1
    labels:
      severity: critical
    annotations:
      description: GC is taking more than 1 second to complete in node '{{$labels.name}}'
      summary: Elasticsearch memory usage is too heavy and GC is taking too much time
+17 −1
Original line number Diff line number Diff line
groups:
- name: swarm_job
- name: scheduled_job
  rules:
  - alert: backup_db_not_created
    expr: time() - backup_created_date_seconds{} > 86400
@@ -24,3 +24,19 @@ groups:
    annotations:
      description: Error, '{{ $labels.label }}' certificate near expiry.
      summary: Error, '{{ $labels.label }}' certificate near expiry.

  # Warns when the reported snapshot end timestamp is more than 86400 seconds
  # (24 hours) behind the current evaluation time.
  - alert: ElasticsearchSnapshotNotPerformed
    expr: >-
      time() - elasticsearch_snapshot_stats_snapshot_end_time_timestamp{} > 86400
    annotations:
      summary: "Scheduled Elasticsearch snapshot is not working."
      description: "Last snapshot for '{{ $labels.repository }}' repository is too old."
    labels:
      severity: "warning"

  # Warns when the reported snapshot count exceeds 64; the count is
  # interpolated into the description through {{$value}}.
  - alert: ElasticsearchSnapshotCleanupNotPerformed
    expr: >-
      elasticsearch_snapshot_stats_number_of_snapshots{} > 64
    annotations:
      summary: "Scheduled Elasticsearch snapshot cleanup is not working."
      description: "Too many snapshots ({{$value}}) found for '{{ $labels.repository }}' repository."
    labels:
      severity: "warning"
+5 −0
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
        $value}}%.
      summary: CPU alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_memory_usage
    expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
@@ -21,6 +22,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
        $value}}%.
      summary: Memory alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
@@ -32,6 +34,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
        $value}}%.
      summary: Disk alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_disk_fill_rate_6h
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
@@ -42,6 +45,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} disk is going to fill up in
        6h.
      summary: Disk fill alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_docker_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"} - node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs/mnt/data"}) * ON(instance) GROUP_LEFT(node_name)
@@ -53,6 +57,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} disk (Docker mountpoint) usage is at {{ humanize
        $value}}%.
      summary: Disk (Docker mountpoint) alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_docker_disk_fill_rate_6h
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs/mnt/data"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
+5 −0
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
        $value}}%.
      summary: CPU alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_memory_usage
    expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
      * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
@@ -21,6 +22,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
        $value}}%.
      summary: Memory alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
      * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
@@ -32,6 +34,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
        $value}}%.
      summary: Disk alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_disk_fill_rate_6h
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0
@@ -42,6 +45,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} disk is going to fill up in
        6h.
      summary: Disk fill alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_ebs_disk_usage
    expr: ((node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"} - node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"})
      * 100 / node_filesystem_size_bytes{mountpoint=~".*/ebs/.*"}) * ON(instance) GROUP_LEFT(node_name)
@@ -53,6 +57,7 @@ groups:
      description: Swarm node {{ $labels.node_name }} EBS disk {{ $labels.mountpoint }} usage is at {{ humanize
        $value}}%.
      summary: EBS disk alert for Swarm node '{{ $labels.node_name }}'

  - alert: node_ebs_disk_fill_rate_6h
    expr: predict_linear(node_filesystem_free_bytes{mountpoint=~".*/ebs/.*"}[1h], 6 * 3600) * ON(instance)
      GROUP_LEFT(node_name) node_meta < 0