Loading .gitlab-ci.yml +8 −10 Original line number Diff line number Diff line Loading @@ -4,8 +4,6 @@ stages: - test-package - deploy image: docker:stable services: - docker:dind Loading @@ -19,11 +17,13 @@ variables: check-rules: stage: check-rules image: docker:stable script: - ./check-rules.sh docker-build-commit-non-master-branches: stage: package image: docker:stable only: - branches except: Loading @@ -35,6 +35,7 @@ docker-build-commit-non-master-branches: docker-build-commit-master-branch: stage: package image: docker:stable only: - master script: Loading @@ -44,6 +45,7 @@ docker-build-commit-master-branch: container-scanning: stage: test-package image: docker:stable allow_failure: true only: - branches Loading @@ -65,11 +67,10 @@ container-scanning: deploy-supporting-branch-develop: stage: deploy image: registry.gitlab.com/redmic-project/docker/docker-deploy image: ${DOCKER_DEPLOY_IMAGE_NAME}:${DOCKER_DEPLOY_IMAGE_TAG} variables: SSH_REMOTE: ${DEV_SSH_REMOTE} COMPOSE_FILE: docker-compose.tmpl.yml:docker-compose.dev.yml UI_AUTH: ${DEV_UI_AUTH} PUBLIC_HOSTNAME: ${DEV_PUBLIC_HOSTNAME} script: - > Loading @@ -86,11 +87,10 @@ deploy-supporting-branch-develop: deploy-stable-branch-develop: stage: deploy image: registry.gitlab.com/redmic-project/docker/docker-deploy image: ${DOCKER_DEPLOY_IMAGE_NAME}:${DOCKER_DEPLOY_IMAGE_TAG} variables: SSH_REMOTE: ${DEV_SSH_REMOTE} COMPOSE_FILE: docker-compose.tmpl.yml:docker-compose.dev.yml UI_AUTH: ${DEV_UI_AUTH} PUBLIC_HOSTNAME: ${DEV_PUBLIC_HOSTNAME} script: - > Loading @@ -105,11 +105,10 @@ deploy-stable-branch-develop: deploy-supporting-branch-production: stage: deploy image: registry.gitlab.com/redmic-project/docker/docker-deploy image: ${DOCKER_DEPLOY_IMAGE_NAME}:${DOCKER_DEPLOY_IMAGE_TAG} variables: SSH_REMOTE: ${PRO_SSH_REMOTE} COMPOSE_FILE: docker-compose.tmpl.yml:docker-compose.prod.yml UI_AUTH: ${PRO_UI_AUTH} PUBLIC_HOSTNAME: ${PRO_PUBLIC_HOSTNAME} script: - > Loading @@ -126,11 +125,10 @@ 
deploy-supporting-branch-production: deploy-stable-branch-production: stage: deploy image: registry.gitlab.com/redmic-project/docker/docker-deploy image: ${DOCKER_DEPLOY_IMAGE_NAME}:${DOCKER_DEPLOY_IMAGE_TAG} variables: SSH_REMOTE: ${PRO_SSH_REMOTE} COMPOSE_FILE: docker-compose.tmpl.yml:docker-compose.prod.yml UI_AUTH: ${PRO_UI_AUTH} PUBLIC_HOSTNAME: ${PRO_PUBLIC_HOSTNAME} script: - > Loading deploy/docker-compose.dev.yml +5 −0 Original line number Diff line number Diff line Loading @@ -3,3 +3,8 @@ version: '3.5' volumes: prometheus: name: ${PROMETHEUS_VOL_NAME:-prometheus-vol} configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.dev.yml deploy/docker-compose.prod.yml +5 −0 Original line number Diff line number Diff line Loading @@ -15,3 +15,8 @@ volumes: backing: relocatable size: ${PROMETHEUS_VOL_SIZE:-10} ebstype: ${PROMETHEUS_VOL_EBS_TYPE:-gp2} configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.prod.yml deploy/docker-compose.tmpl.yml +2 −6 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ services: - source: service_rules target: /etc/prometheus/swarm_service.rules.yml healthcheck: test: wget --spider http://localhost:9090 test: wget --spider -q http://localhost:9090 interval: 30s timeout: 10s retries: 3 Loading @@ -41,7 +41,7 @@ services: traefik.backend: prometheus traefik.port: '9090' restart_policy: delay: 30s delay: 10s window: 2m resources: limits: Loading @@ -61,10 +61,6 @@ networks: external: true configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.yml task_rules: name: ${TASK_RULES_NAME:-task_rules} file: ./rules/swarm_task.rules.yml Loading deploy/rules/swarm_node.rules.dev.yml 0 → 100644 +65 −0 Original line number Diff line number Diff line groups: - name: swarm_node rules: - alert: node_cpu_usage expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY 
(node_name)) > 80 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%. summary: CPU alert for Swarm node '{{ $labels.node_name }}' - alert: node_memory_usage expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%. summary: Memory alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_usage expr: ((node_filesystem_size_bytes{mountpoint="/mnt/data"} - node_filesystem_free_bytes{mountpoint="/mnt/data"}) * 100 / node_filesystem_size_bytes{mountpoint="/mnt/data"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85 for: 1m labels: severity: warning annotations: description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%. 
summary: Disk alert for Swarm node '{{ $labels.node_name }}' - alert: node_docker_disk_fill_rate_6h expr: predict_linear(node_filesystem_free_bytes{mountpoint="/mnt/data"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0 for: 1h labels: severity: critical annotations: description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h. summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' No newline at end of file Loading
.gitlab-ci.yml +8 −10 Original line number Diff line number Diff line Loading @@ -4,8 +4,6 @@ stages: - test-package - deploy image: docker:stable services: - docker:dind Loading @@ -19,11 +17,13 @@ variables: check-rules: stage: check-rules image: docker:stable script: - ./check-rules.sh docker-build-commit-non-master-branches: stage: package image: docker:stable only: - branches except: Loading @@ -35,6 +35,7 @@ docker-build-commit-non-master-branches: docker-build-commit-master-branch: stage: package image: docker:stable only: - master script: Loading @@ -44,6 +45,7 @@ docker-build-commit-master-branch: container-scanning: stage: test-package image: docker:stable allow_failure: true only: - branches Loading @@ -65,11 +67,10 @@ container-scanning: deploy-supporting-branch-develop: stage: deploy image: registry.gitlab.com/redmic-project/docker/docker-deploy image: ${DOCKER_DEPLOY_IMAGE_NAME}:${DOCKER_DEPLOY_IMAGE_TAG} variables: SSH_REMOTE: ${DEV_SSH_REMOTE} COMPOSE_FILE: docker-compose.tmpl.yml:docker-compose.dev.yml UI_AUTH: ${DEV_UI_AUTH} PUBLIC_HOSTNAME: ${DEV_PUBLIC_HOSTNAME} script: - > Loading @@ -86,11 +87,10 @@ deploy-supporting-branch-develop: deploy-stable-branch-develop: stage: deploy image: registry.gitlab.com/redmic-project/docker/docker-deploy image: ${DOCKER_DEPLOY_IMAGE_NAME}:${DOCKER_DEPLOY_IMAGE_TAG} variables: SSH_REMOTE: ${DEV_SSH_REMOTE} COMPOSE_FILE: docker-compose.tmpl.yml:docker-compose.dev.yml UI_AUTH: ${DEV_UI_AUTH} PUBLIC_HOSTNAME: ${DEV_PUBLIC_HOSTNAME} script: - > Loading @@ -105,11 +105,10 @@ deploy-stable-branch-develop: deploy-supporting-branch-production: stage: deploy image: registry.gitlab.com/redmic-project/docker/docker-deploy image: ${DOCKER_DEPLOY_IMAGE_NAME}:${DOCKER_DEPLOY_IMAGE_TAG} variables: SSH_REMOTE: ${PRO_SSH_REMOTE} COMPOSE_FILE: docker-compose.tmpl.yml:docker-compose.prod.yml UI_AUTH: ${PRO_UI_AUTH} PUBLIC_HOSTNAME: ${PRO_PUBLIC_HOSTNAME} script: - > Loading @@ -126,11 +125,10 @@ 
deploy-supporting-branch-production: deploy-stable-branch-production: stage: deploy image: registry.gitlab.com/redmic-project/docker/docker-deploy image: ${DOCKER_DEPLOY_IMAGE_NAME}:${DOCKER_DEPLOY_IMAGE_TAG} variables: SSH_REMOTE: ${PRO_SSH_REMOTE} COMPOSE_FILE: docker-compose.tmpl.yml:docker-compose.prod.yml UI_AUTH: ${PRO_UI_AUTH} PUBLIC_HOSTNAME: ${PRO_PUBLIC_HOSTNAME} script: - > Loading
deploy/docker-compose.dev.yml +5 −0 Original line number Diff line number Diff line Loading @@ -3,3 +3,8 @@ version: '3.5' volumes: prometheus: name: ${PROMETHEUS_VOL_NAME:-prometheus-vol} configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.dev.yml
deploy/docker-compose.prod.yml +5 −0 Original line number Diff line number Diff line Loading @@ -15,3 +15,8 @@ volumes: backing: relocatable size: ${PROMETHEUS_VOL_SIZE:-10} ebstype: ${PROMETHEUS_VOL_EBS_TYPE:-gp2} configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.prod.yml
deploy/docker-compose.tmpl.yml +2 −6 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ services: - source: service_rules target: /etc/prometheus/swarm_service.rules.yml healthcheck: test: wget --spider http://localhost:9090 test: wget --spider -q http://localhost:9090 interval: 30s timeout: 10s retries: 3 Loading @@ -41,7 +41,7 @@ services: traefik.backend: prometheus traefik.port: '9090' restart_policy: delay: 30s delay: 10s window: 2m resources: limits: Loading @@ -61,10 +61,6 @@ networks: external: true configs: node_rules: name: ${NODE_RULES_NAME:-node_rules} file: ./rules/swarm_node.rules.yml task_rules: name: ${TASK_RULES_NAME:-task_rules} file: ./rules/swarm_task.rules.yml Loading
# deploy/rules/swarm_node.rules.dev.yml
#
# Prometheus alerting rules for Swarm nodes (development variant, loaded as
# the `node_rules` config by docker-compose.dev.yml).
#
# Every expression is joined with `node_meta` on `instance` so that the
# resulting series carries the `node_name` label used by the annotations.
---
groups:
  - name: swarm_node
    rules:
      # CPU: 100 minus the averaged idle rate (as a percentage) per node;
      # fires once the value stays above 80 for one minute.
      - alert: node_cpu_usage
        expr: >-
          100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m])
          * ON(instance) GROUP_LEFT(node_name) node_meta * 100)
          BY (node_name)) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          description: 'Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%.'
          summary: "CPU alert for Swarm node '{{ $labels.node_name }}'"

      # Memory: (total - available) / total as a percentage per node;
      # fires once the value stays above 80 for one minute.
      - alert: node_memory_usage
        expr: >-
          sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes)
          * ON(instance) GROUP_LEFT(node_name) node_meta * 100)
          BY (node_name) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          description: 'Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%.'
          summary: "Memory alert for Swarm node '{{ $labels.node_name }}'"

      # Filesystem mounted at /rootfs: used percentage above 85 for one minute.
      - alert: node_disk_usage
        expr: >-
          ((node_filesystem_size_bytes{mountpoint="/rootfs"}
          - node_filesystem_free_bytes{mountpoint="/rootfs"})
          * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"})
          * ON(instance) GROUP_LEFT(node_name) node_meta > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          description: 'Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%.'
          summary: "Disk alert for Swarm node '{{ $labels.node_name }}'"

      # Filesystem mounted at /rootfs: linear extrapolation of the last hour
      # of free bytes, 6 * 3600 seconds ahead, drops below zero for one hour.
      - alert: node_disk_fill_rate_6h
        expr: >-
          predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h],
          6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0
        for: 1h
        labels:
          severity: critical
        annotations:
          description: 'Swarm node {{ $labels.node_name }} disk is going to fill up in 6h.'
          summary: "Disk fill alert for Swarm node '{{ $labels.node_name }}'"

      # Filesystem mounted at /mnt/data: same usage check as node_disk_usage,
      # applied to a different mountpoint.
      - alert: node_docker_disk_usage
        expr: >-
          ((node_filesystem_size_bytes{mountpoint="/mnt/data"}
          - node_filesystem_free_bytes{mountpoint="/mnt/data"})
          * 100 / node_filesystem_size_bytes{mountpoint="/mnt/data"})
          * ON(instance) GROUP_LEFT(node_name) node_meta > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          description: 'Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%.'
          summary: "Disk alert for Swarm node '{{ $labels.node_name }}'"

      # Filesystem mounted at /mnt/data: same fill-rate prediction as
      # node_disk_fill_rate_6h, applied to a different mountpoint.
      - alert: node_docker_disk_fill_rate_6h
        expr: >-
          predict_linear(node_filesystem_free_bytes{mountpoint="/mnt/data"}[1h],
          6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0
        for: 1h
        labels:
          severity: critical
        annotations:
          description: 'Swarm node {{ $labels.node_name }} disk is going to fill up in 6h.'
          summary: "Disk fill alert for Swarm node '{{ $labels.node_name }}'"