Commit e331492c authored by Pedro Eduardo Trujillo
Browse files

Merge branch 'dev' into 'master'

Restaura settings de salud, dobla memoria cadvisor

See merge request redmic-project/metric/swarmprom-ci!18
parents c751e00f 275cfa45
Loading
Loading
Loading
Loading
+26 −38
Original line number Diff line number Diff line
stages:
  - check_rules
  - deploy

image: registry.gitlab.com/redmic-project/docker/docker-deploy
image: registry.gitlab.com/redmic-project/docker/docker-deploy:latest

services:
  - docker:dind
@@ -13,20 +12,9 @@ variables:
  DEPLOY_DIR_NAME: '.'
  STATUS_CHECK_DELAY: 60
  STACK: metric
  SERVICES_TO_CHECK: metric_dockerd-exporter metric_cadvisor metric_grafana metric_alertmanager metric_unsee metric_node-exporter metric_prometheus
  SERVICES_TO_CHECK: metric_dockerd-exporter metric_cadvisor metric_grafana metric_alertmanager metric_unsee metric_node-exporter metric_pushgateway

# Validates the Prometheus rule file in the check_rules stage.
# The script runs the dnanexus/promtool:1.0 image with ./prometheus/rules
# mounted at /tmp and executes `check rules` on /tmp/swarm_task.rules.yml.
check-rule-dev:
  stage: check_rules
  script:
    - >
      docker run -v $(pwd)/prometheus/rules:/tmp dnanexus/promtool:1.0
      check rules /tmp/swarm_task.rules.yml
  environment:
    name: dev
  # Only created for the dev ref.
  only:
    - dev

deploy-dev:
deploy-supporting-branch-develop:
  stage: deploy
  variables:
    SSH_REMOTE: ${DEV_SSH_REMOTE}
@@ -47,25 +35,13 @@ deploy-dev:
      SLACK_URL=${SLACK_URL} SLACK_CHANNEL=${SLACK_CHANNEL} SLACK_USER=${SLACK_USER}
  environment:
    name: dev
  only:
    - dev
  when: manual

# Manually triggered rule-file validation for branches other than master
# and dev. The script runs the dnanexus/promtool:1.0 image with
# ./prometheus/rules mounted at /tmp and executes `check rules` on
# /tmp/swarm_task.rules.yml.
check-rule-supporting-branch:
  stage: check_rules
  script:
    - >
      docker run -v $(pwd)/prometheus/rules:/tmp dnanexus/promtool:1.0
      check rules /tmp/swarm_task.rules.yml
  environment:
    name: dev
  # Applies to every branch ...
  only:
    - branches
  # ... except the two listed here.
  except:
    - master
    - dev
  # Does not start automatically; it has to be triggered by hand.
  when: manual

deploy-supporting-branch:
deploy-stable-branch-develop:
  stage: deploy
  variables:
    SSH_REMOTE: ${DEV_SSH_REMOTE}
@@ -87,25 +63,37 @@ deploy-supporting-branch:
  environment:
    name: dev
  only:
    - branches
  except:
    - master
    - dev
  when: manual


check-rule-pro:
  stage: check_rules
deploy-supporting-branch-production:
  stage: deploy
  variables:
    SSH_REMOTE: ${PRO_SSH_REMOTE}
    COMPOSE_FILE: docker-compose.tmpl.yml:docker-compose.prod.yml
    ADMIN_USER: ${PRO_ADMIN_USER}
    ADMIN_PASSWORD: ${PRO_ADMIN_PASSWORD}
    DOCKER_GWBRIDGE_IP: ${PRO_DOCKER_GWBRIDGE_IP}
    UI_AUTH: ${PRO_UI_AUTH}
    PUBLIC_HOSTNAME: ${PRO_PUBLIC_HOSTNAME}
    SLACK_URL: ${SLACK_URL}
    SLACK_CHANNEL: ${PRO_SLACK_CHANNEL}
    SLACK_USER: ${SLACK_USER}
  script:
    - create-nets.sh metric-net
    - >
      docker run -v $(pwd)/prometheus/rules:/tmp dnanexus/promtool:1.0
      check rules /tmp/swarm_task.rules.yml
      deploy.sh COMPOSE_FILE=${COMPOSE_FILE} ADMIN_USER=${ADMIN_USER} ADMIN_PASSWORD=${ADMIN_PASSWORD}
      DOCKER_GWBRIDGE_IP=${DOCKER_GWBRIDGE_IP} UI_AUTH=${UI_AUTH} PUBLIC_HOSTNAME=${PUBLIC_HOSTNAME}
      SLACK_URL=${SLACK_URL} SLACK_CHANNEL=${SLACK_CHANNEL} SLACK_USER=${SLACK_USER}
  environment:
    name: pro
  only:
    - branches
  except:
    - master
  when: manual

deploy-pro:
deploy-stable-branch-production:
  stage: deploy
  variables:
    SSH_REMOTE: ${PRO_SSH_REMOTE}
+1 −15
Original line number Diff line number Diff line
version: '3.5'

  pushgateway:
    name: pushgateway-vol

volumes:
  grafana:
    name: grafana-vol

  alertmanager:
    name: alertmanager-vol

  grafana-etc:
    name: grafana-etc-vol

  grafana-log:
    name: grafana-log-vol

  dockerd-exporter:
    name: dockerd-exporter-vol
+0 −15
Original line number Diff line number Diff line
@@ -14,9 +14,6 @@ services:
          - node.role == worker

volumes:
  pushgateway:
    name: pushgateway-vol

  grafana:
    name: grafana-vol
    driver: "cloudstor:aws"
@@ -24,15 +21,3 @@ volumes:
      backing: relocatable
      size: 1
      ebstype: gp2

  alertmanager:
    name: alertmanager-vol

  grafana-etc:
    name: grafana-etc-vol

  grafana-log:
    name: grafana-log-vol

  dockerd-exporter:
    name: dockerd-exporter-vol
+32 −13
Original line number Diff line number Diff line
@@ -35,6 +35,12 @@ services:
      - /var/run:/var/run
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    # Container health probe: requests http://localhost:8080/metrics with
    # wget in --spider mode (-S prints the server response, -t 3 sets the
    # number of tries to 3).
    # NOTE(review): the owning service is outside this hunk — confirm it
    # listens on port 8080 and that wget exists in its image.
    healthcheck:
      test: wget --spider -S -t 3 http://localhost:8080/metrics
      # Probe every 30s; each probe may take up to 10s.
      interval: 30s
      timeout: 10s
      # Number of consecutive failures before the container is marked unhealthy.
      retries: 3
      # Start-up grace period of 2 minutes.
      start_period: 2m
    deploy:
      mode: global
      restart_policy:
@@ -43,12 +49,12 @@ services:
      resources:
        limits:
          cpus: '0.5'
          memory: 128M
          memory: 256M
        reservations:
          memory: 64M

  grafana:
    image: stefanprodan/swarmprom-grafana:5.0.1
    image: stefanprodan/swarmprom-grafana:5.3.4
    environment:
      - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
@@ -72,10 +78,10 @@ services:
      mode: replicated
      replicas: 1
      labels:
        traefik.port: "3000"
        traefik.docker.network: traefik-net
        traefik.frontend.rule: Host:grafana.${PUBLIC_HOSTNAME}
        traefik.backend: grafana
        traefik.port: "3000"
      restart_policy:
        delay: 1m
        window: 3m
@@ -104,11 +110,11 @@ services:
      mode: replicated
      replicas: 1
      labels:
        traefik.port: "9093"
        traefik.docker.network: traefik-net
        traefik.frontend.auth.basic: ${UI_AUTH}
        traefik.frontend.auth.basic.users: ${UI_AUTH}
        traefik.frontend.rule: Host:alertmanager.${PUBLIC_HOSTNAME}
        traefik.backend: alertmanager
        traefik.port: "9093"
      placement:
        constraints:
          - node.role == manager
@@ -133,11 +139,11 @@ services:
      mode: replicated
      replicas: 1
      labels:
        traefik.port: "8080"
        traefik.docker.network: traefik-net
        traefik.frontend.auth.basic: ${UI_AUTH}
        traefik.frontend.auth.basic.users: ${UI_AUTH}
        traefik.frontend.rule: Host:unsee.${PUBLIC_HOSTNAME}
        traefik.backend: unsee
        traefik.port: "8080"
      restart_policy:
        delay: 1m
        window: 3m
@@ -149,15 +155,12 @@ services:
          memory: 64M

  node-exporter:
    image: stefanprodan/swarmprom-node-exporter:v0.15.2
    image: stefanprodan/swarmprom-node-exporter:v0.16.0
    command:
      - '--path.sysfs=/host/sys'
      - '--path.procfs=/host/proc'
      - '--collector.textfile.directory=/etc/node-exporter/'
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
      # No collectors are explicitly enabled here because the defaults are just fine;
      # see https://github.com/prometheus/node_exporter
      # Disable the ipvs collector because it floods the node-exporter logs with errors on my CentOS 7 VMs.
      - '--no-collector.ipvs'
    environment:
      - NODE_ID={{.Node.ID}}
@@ -193,11 +196,11 @@ services:
      mode: replicated
      replicas: 1
      labels:
        traefik.port: "9091"
        traefik.docker.network: traefik-net
        traefik.frontend.auth.basic: ${UI_AUTH}
        traefik.frontend.auth.basic.users: ${UI_AUTH}
        traefik.frontend.rule: Host:pushgateway.${PUBLIC_HOSTNAME}
        traefik.backend: pushgateway
        traefik.port: "9091"
      restart_policy:
        delay: 1m
        window: 3m
@@ -215,6 +218,22 @@ networks:
  traefik-net:
    external: true

# Volume declarations. Each entry sets an explicit `name`, so the volume is
# created as "<key>-vol". No driver is specified for any entry in this block.
volumes:
  pushgateway:
    name: pushgateway-vol

  alertmanager:
    name: alertmanager-vol

  grafana-etc:
    name: grafana-etc-vol

  grafana-log:
    name: grafana-log-vol

  dockerd-exporter:
    name: dockerd-exporter-vol

configs:
  dockerd_config:
    file: ./swarmprom/dockerd-exporter/Caddyfile
+0 −32
Original line number Diff line number Diff line
# Prometheus alerting rules: one rule group containing three alerts.
groups:
# NOTE(review): the group name is an absolute filesystem path; it looks like
# an artifact of an automated rule-file conversion — confirm before renaming.
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_task.rules.yml
  rules:
  # Fires when the 1-minute rate of container_cpu_usage_seconds_total, summed
  # per Swarm task name and node id and multiplied by 100, stays above 50 for
  # 1 minute. Only series with a non-empty Swarm task-name label are matched.
  - alert: task_high_cpu_usage_50
    expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m]))
      BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)
      * 100 > 50
    for: 1m
    annotations:
      description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
        $labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize
        $value}}%.'
      summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
        }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'
  # Fires when container_memory_rss, summed per Swarm task name and node id,
  # stays above 1e+09 for 1 minute (presumably bytes, i.e. about 1 GB as the
  # alert name suggests — confirm the metric's unit).
  - alert: task_high_memory_usage_1g
    expr: sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"})
      BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) > 1e+09
    for: 1m
    annotations:
      description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
        $labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize
        $value}}.'
      summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
        }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'

  # Fires when backup_created_date_seconds is more than 86400 seconds
  # (24 hours) behind the current time. There is no `for:` clause on this
  # rule, and it is the only rule here that sets a severity label.
  # NOTE(review): the annotations say "Error creating backup", but the
  # expression only detects a stale timestamp — confirm the wording is intended.
  - alert: backup_db_not_created
    expr: time() - backup_created_date_seconds{} > 86400
    labels:
      severity: warning
    annotations:
      description: Error creating backup for '{{ $labels.label }}' database.
      summary: Error creating backup for '{{ $labels.label }}' database.
 No newline at end of file
Loading