diff --git a/Docker.mk b/Docker.mk index 3246fb1..6337ae2 100644 --- a/Docker.mk +++ b/Docker.mk @@ -6,10 +6,12 @@ # By: maiboyer +#+ +:+ +#+ # # +#+#+#+#+#+ +#+ # # Created: 2025/06/11 18:10:26 by maiboyer #+# #+# # -# Updated: 2025/07/30 19:32:11 by maiboyer ### ########.fr # +# Updated: 2025/11/14 18:54:16 by maiboyer ### ########.fr # # # # **************************************************************************** # +.PHONY: logs + all: build docker compose up -d @@ -39,3 +41,23 @@ prune: clean -docker network prune -docker system prune -a +ES_URL ?= http://local.maix.me:9200 +KIBANA_URL ?= http://local.maix.me:5601 + +logs-setup: + @until curl -s "$(ES_URL)" > /dev/null 2>&1; do sleep 1; done; + + @curl -s -X PUT "$(ES_URL)/_ilm/policy/docker-logs-policy" \ + -H "Content-Type: application/json" \ + -d '{"policy":{"phases":{"hot":{"actions":{}},"delete":{"min_age":"7d","actions":{"delete":{}}}}}}' > /dev/null + + @curl -s -X PUT "$(ES_URL)/_template/docker-logs-template" \ + -H "Content-Type: application/json" \ + -d '{"index_patterns":["docker-*"],"settings":{"index.lifecycle.name":"docker-logs-policy"}}' > /dev/null + + @until curl -s "$(KIBANA_URL)/api/status" > /dev/null 2>&1; do sleep 1; done; + + @curl -s -X POST "$(KIBANA_URL)/api/saved_objects/index-pattern/docker-logs" \ + -H "kbn-xsrf: true" \ + -H "Content-Type: application/json" \ + -d '{"attributes":{"title":"docker-*","timeFieldName":"@timestamp"}}' > /dev/null diff --git a/Makefile b/Makefile index 1b6e282..fe1428d 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -# **************************************************************************** #make +# **************************************************************************** # # # # ::: :::::::: # # Makefile :+: :+: :+: # @@ -6,7 +6,7 @@ # By: rparodi +#+ +:+ +#+ # # +#+#+#+#+#+ +#+ # # Created: 2023/11/12 11:05:05 by rparodi #+# #+# # -# Updated: 2025/11/10 01:05:11 by maiboyer ### ########.fr # +# Updated: 2025/11/14 17:40:57 by maiboyer 
### ########.fr # # # # **************************************************************************** # @@ -157,4 +157,4 @@ fnginx: nginx-dev/nginx-selfsigned.crt nginx-dev/nginx-selfsigned.key wait # phony -.PHONY: all clean fclean re header footer npm@install npm@clean npm@fclean npm@build sql tmux +.PHONY: all clean fclean re header footer npm@install npm@clean npm@fclean npm@build sql tmux logs diff --git a/docker-compose.yml b/docker-compose.yml index b62072e..1b166ef 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,6 +15,11 @@ services: - transcendance-network volumes: - static-volume:/volumes/static + logging: + driver: gelf + options: + gelf-address: "udp://127.0.0.1:12201" + tag: "{{.Name}}" # # The "entry point" as in it does all of this: @@ -37,6 +42,11 @@ services: environment: # this can stay the same for developpement. This is an alias to `localhost` - NGINX_DOMAIN=local.maix.me + logging: + driver: gelf + options: + gelf-address: "udp://127.0.0.1:12201" + tag: "{{.Name}}" ############### # ICONS # @@ -58,6 +68,11 @@ services: - JWT_SECRET=KRUGKIDROVUWG2ZAMJZG653OEBTG66BANJ2W24DTEBXXMZLSEB2GQZJANRQXU6JA - USER_ICONS_STORE=/volumes/store - DATABASE_DIR=/volumes/database + logging: + driver: gelf + options: + gelf-address: "udp://127.0.0.1:12201" + tag: "{{.Name}}" ############### @@ -80,7 +95,12 @@ services: - JWT_SECRET=KRUGKIDROVUWG2ZAMJZG653OEBTG66BANJ2W24DTEBXXMZLSEB2GQZJANRQXU6JA - DATABASE_DIR=/volumes/database - PROVIDER_FILE=/extra/providers.toml - + logging: + driver: gelf + options: + gelf-address: "udp://127.0.0.1:12201" + tag: "{{.Name}}" + ############### # CHAT # @@ -123,7 +143,11 @@ services: environment: - JWT_SECRET=KRUGKIDROVUWG2ZAMJZG653OEBTG66BANJ2W24DTEBXXMZLSEB2GQZJANRQXU6JA - DATABASE_DIR=/volumes/database - + logging: + driver: gelf + options: + gelf-address: "udp://127.0.0.1:12201" + tag: "{{.Name}}" ############### @@ -154,6 +178,11 @@ services: - GF_SERVER_ROOT_URL=http://local.maix.me:3000 - 
GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER} - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASS} + logging: + driver: gelf + options: + gelf-address: "udp://127.0.0.1:12201" + tag: "{{.Name}}" prometheus: image: prom/prometheus:latest @@ -164,6 +193,11 @@ services: volumes: - ./monitoring/prometheus:/etc/prometheus/ restart: unless-stopped + logging: + driver: gelf + options: + gelf-address: "udp://127.0.0.1:12201" + tag: "{{.Name}}" cadvisor: image: gcr.io/cadvisor/cadvisor:latest @@ -178,6 +212,12 @@ services: - /sys:/sys:ro - /var/lib/docker/:/var/lib/docker:ro restart: unless-stopped + logging: + driver: gelf + options: + gelf-address: "udp://127.0.0.1:12201" + tag: "{{.Name}}" + blackbox: image: prom/blackbox-exporter:latest @@ -187,9 +227,70 @@ services: ports: - "9115:9115" restart: unless-stopped + logging: + driver: gelf + options: + gelf-address: "udp://127.0.0.1:12201" + tag: "{{.Name}}" + + + + ############### + # LOGS # + ############### + + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:7.17.23 + container_name: logs-elasticsearch + networks: + - monitoring + environment: + - discovery.type=single-node + - ES_JAVA_OPTS=-Xms512m -Xmx512m + - ELASTIC_PASSWORD=${ELASTIC_PASSWORD} + volumes: + - elastic-data:/usr/share/elasticsearch/data + - ./logs/elasticsearch:/setup + ports: + - "9200:9200" + command: ["/setup/bootstrap.sh"] + restart: unless-stopped + + logstash: + image: docker.elastic.co/logstash/logstash:7.17.23 + container_name: logs-logstash + depends_on: + - elasticsearch + networks: + - monitoring + volumes: + - ./logs/logstash/pipeline:/usr/share/logstash/pipeline + ports: + - "12201:12201/udp" + restart: unless-stopped + + kibana: + image: docker.elastic.co/kibana/kibana:7.17.23 + container_name: logs-kibana + depends_on: + - elasticsearch + networks: + - monitoring + environment: + - ELASTICSEARCH_HOSTS=http://elasticsearch:9200 + - SERVER_PUBLICBASEURL=http://local.maix.me:5601 + - ELASTICSEARCH_USERNAME=elastic + - 
ELASTICSEARCH_PASSWORD=${ELASTIC_PASSWORD} + ports: + - "5601:5601" + volumes: + - ./logs/kibana:/setup + command: ["/setup/bootstrap.sh"] + restart: unless-stopped volumes: images-volume: sqlite-volume: static-volume: grafana-data: + elastic-data: diff --git a/env.example b/env.example index b20bfaa..1c3d2d9 100644 --- a/env.example +++ b/env.example @@ -1,3 +1,5 @@ -GRAFANA_ADMIN_USER="" -GRAFANA_ADMIN_PASS="" -GRAFANA_WEBHOOK_URL="" +GRAFANA_ADMIN_USER= +GRAFANA_ADMIN_PASS= +GRAFANA_WEBHOOK_URL= + +ELASTIC_PASSWORD= diff --git a/logs/elasticsearch/bootstrap.sh b/logs/elasticsearch/bootstrap.sh new file mode 100755 index 0000000..6bfe807 --- /dev/null +++ b/logs/elasticsearch/bootstrap.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +setup_ilm() { + set -xe + until curl -s -f http://localhost:9200 >/dev/null; do + sleep 2; + done; + + curl -v -X PUT "localhost:9200/_ilm/policy/docker-logs-policy" \ + -H "Content-Type: application/json" \ + -d '@/setup/docker-logs-policy.json' + curl -v -X PUT "localhost:9200/_template/docker-logs-template" \ + -H "Content-Type: application/json" \ + -d '@/setup/docker-logs-template.json' + exit 0 +} + +setup_ilm & +exec /usr/local/bin/docker-entrypoint.sh eswrapper diff --git a/logs/elasticsearch/docker-logs-policy.json b/logs/elasticsearch/docker-logs-policy.json new file mode 100644 index 0000000..fe793d9 --- /dev/null +++ b/logs/elasticsearch/docker-logs-policy.json @@ -0,0 +1,15 @@ +{ + "policy": { + "phases": { + "hot": { + "actions": {} + }, + "delete": { + "min_age": "7d", + "actions": { + "delete": {} + } + } + } + } +} diff --git a/logs/elasticsearch/docker-logs-template.json b/logs/elasticsearch/docker-logs-template.json new file mode 100644 index 0000000..39e36df --- /dev/null +++ b/logs/elasticsearch/docker-logs-template.json @@ -0,0 +1 @@ +{"index_patterns":["docker-*"],"settings":{"index.lifecycle.name":"docker-logs-policy"}} diff --git a/logs/kibana/bootstrap.sh b/logs/kibana/bootstrap.sh new file mode 100755 index 0000000..c9235d1 
--- /dev/null +++ b/logs/kibana/bootstrap.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +kibana_setup() { + set -xe + until curl -s -f "localhost:5601/api/status"; do + sleep 2 + done + + curl -v -X POST "localhost:5601/api/saved_objects/_import?overwrite=true" \ + -H "kbn-xsrf: true" \ + --form file='@/setup/export.ndjson' + exit 0 +} +kibana_setup & +exec /usr/local/bin/kibana-docker diff --git a/logs/kibana/export.ndjson b/logs/kibana/export.ndjson new file mode 100644 index 0000000..36eb69b --- /dev/null +++ b/logs/kibana/export.ndjson @@ -0,0 +1,5 @@ +{"attributes":{"buildNum":47645,"defaultIndex":"docker-logs","defaultRoute":"/app/dashboards#/view/f1356840-c17c-11f0-92fb-4711317b9bee"},"coreMigrationVersion":"7.17.23","id":"7.17.23","migrationVersion":{"config":"7.13.0"},"references":[],"type":"config","updated_at":"2025-11-14T17:29:48.539Z","version":"WzE0Miw0XQ=="} +{"attributes":{"fieldAttrs":"{\"@timestamp\":{\"count\":3},\"command\":{\"count\":2},\"container_name\":{\"count\":1},\"level\":{\"count\":1},\"message\":{\"count\":1}}","fields":"[]","runtimeFieldMap":"{}","timeFieldName":"@timestamp","title":"docker-*","typeMeta":"{}"},"coreMigrationVersion":"7.17.23","id":"docker-logs","migrationVersion":{"index-pattern":"7.11.0"},"references":[],"type":"index-pattern","updated_at":"2025-11-14T17:26:47.450Z","version":"Wzc0LDRd"} +{"attributes":{"columns":["container_name","message","level"],"description":"test","grid":{},"hideChart":false,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"sort":[["@timestamp","asc"]],"title":"LogTable"},"coreMigrationVersion":"7.17.23","id":"b5a48950-c17c-11f0-92fb-4711317b9bee","migrationVersion":{"search":"7.9.3"},"references":[{"id":"docker-logs","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"search","updated_at":"2025-11-14T17:26:47.450Z","version":"Wzc1LDRd"} 
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filter\":[]}"},"optionsJSON":"{\"useMargins\":true,\"syncColors\":false,\"hidePanelTitles\":false}","panelsJSON":"[{\"version\":\"7.17.23\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":0,\"w\":24,\"h\":21,\"i\":\"9600aa15-1732-41da-a43c-723fdb1a97a0\"},\"panelIndex\":\"9600aa15-1732-41da-a43c-723fdb1a97a0\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsXY\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"docker-logs\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"docker-logs\",\"name\":\"indexpattern-datasource-layer-7b411268-3ed2-45f6-9067-b88364aba992\"}],\"state\":{\"visualization\":{\"legend\":{\"isVisible\":true,\"position\":\"right\"},\"valueLabels\":\"hide\",\"fittingFunction\":\"None\",\"yLeftExtent\":{\"mode\":\"full\"},\"yRightExtent\":{\"mode\":\"full\"},\"axisTitlesVisibilitySettings\":{\"x\":true,\"yLeft\":true,\"yRight\":true},\"tickLabelsVisibilitySettings\":{\"x\":true,\"yLeft\":true,\"yRight\":true},\"labelsOrientation\":{\"x\":0,\"yLeft\":0,\"yRight\":0},\"gridlinesVisibilitySettings\":{\"x\":true,\"yLeft\":true,\"yRight\":true},\"preferredSeriesType\":\"bar_stacked\",\"layers\":[{\"layerId\":\"7b411268-3ed2-45f6-9067-b88364aba992\",\"accessors\":[\"27ad7775-f44f-4d6c-b49d-5f8bebee33af\"],\"position\":\"top\",\"seriesType\":\"bar\",\"showGridlines\":false,\"layerType\":\"data\",\"xAccessor\":\"e4e3a367-7cd4-4ad6-95a7-824f0717503d\"}]},\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"7b411268-3ed2-45f6-9067-b88364aba992\":{\"columns\":{\"e4e3a367-7cd4-4ad6-95a7-824f0717503d\":{\"label\":\"Top values of 
container_name.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"container_name.keyword\",\"isBucketed\":true,\"params\":{\"size\":5,\"orderBy\":{\"type\":\"column\",\"columnId\":\"27ad7775-f44f-4d6c-b49d-5f8bebee33af\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"27ad7775-f44f-4d6c-b49d-5f8bebee33af\":{\"label\":\"Count of records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"e4e3a367-7cd4-4ad6-95a7-824f0717503d\",\"27ad7775-f44f-4d6c-b49d-5f8bebee33af\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"Log Count\"},{\"version\":\"7.17.23\",\"type\":\"search\",\"gridData\":{\"x\":24,\"y\":0,\"w\":24,\"h\":21,\"i\":\"08f56117-4041-4282-af91-99a44941e06d\"},\"panelIndex\":\"08f56117-4041-4282-af91-99a44941e06d\",\"embeddableConfig\":{\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"Log Management\",\"panelRefName\":\"panel_08f56117-4041-4282-af91-99a44941e06d\"}]","timeRestore":false,"title":"Default","version":1},"coreMigrationVersion":"7.17.23","id":"f1356840-c17c-11f0-92fb-4711317b9bee","migrationVersion":{"dashboard":"7.17.3"},"references":[{"id":"docker-logs","name":"9600aa15-1732-41da-a43c-723fdb1a97a0:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"docker-logs","name":"9600aa15-1732-41da-a43c-723fdb1a97a0:indexpattern-datasource-layer-7b411268-3ed2-45f6-9067-b88364aba992","type":"index-pattern"},{"id":"b5a48950-c17c-11f0-92fb-4711317b9bee","name":"08f56117-4041-4282-af91-99a44941e06d:panel_08f56117-4041-4282-af91-99a44941e06d","type":"search"}],"type":"dashboard","updated_at":"2025-11-14T17:26:47.450Z","version":"Wzc2LDRd"} +{"excludedObjects":[],"excludedObjectsCount":0,"exportedCount":4,"missingRefCount":0,"missingReferences":[]} \ No newline at end of file diff --git 
a/logs/logstash/pipeline/logstash.conf b/logs/logstash/pipeline/logstash.conf new file mode 100644 index 0000000..d3a643c --- /dev/null +++ b/logs/logstash/pipeline/logstash.conf @@ -0,0 +1,22 @@ +input { + gelf { + port => 12201 + } +} + +filter { + mutate { + rename => { "[full_message]" => "message" } + } +} + +output { + elasticsearch { + hosts => ["http://elasticsearch:9200"] + index => "docker-%{[container_name]}-%{+YYYY.MM.dd}" + } + + stdout { + codec => rubydebug + } +} diff --git a/monitoring/grafana/alerting/policies.yaml b/monitoring/grafana/alerting/policies.yaml new file mode 100644 index 0000000..5eb5e95 --- /dev/null +++ b/monitoring/grafana/alerting/policies.yaml @@ -0,0 +1,4 @@ +apiVersion: 1 +policies: + - orgId: 1 + receiver: discord-webhook diff --git a/monitoring/grafana/alerting/rules.yaml b/monitoring/grafana/alerting/rules.yaml new file mode 100644 index 0000000..6158779 --- /dev/null +++ b/monitoring/grafana/alerting/rules.yaml @@ -0,0 +1,782 @@ +apiVersion: 1 +groups: + - orgId: 1 + name: availability + folder: alert_rules.yml + interval: 1m + rules: + - uid: 14db4fe7-faf3-5629-9ee1-c5c189d75fec + title: InstanceDown + condition: threshold + data: + - refId: query + queryType: prometheus + relativeTimeRange: + from: 660 + to: 60 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + expr: up == 0 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + range: false + refId: query + - refId: prometheus_math + queryType: math + datasourceUid: __expr__ + model: + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: is_number($query) || is_nan($query) || 
is_inf($query) + intervalMs: 1000 + maxDataPoints: 43200 + refId: prometheus_math + type: math + - refId: threshold + queryType: threshold + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: prometheus_math + intervalMs: 1000 + maxDataPoints: 43200 + refId: threshold + type: threshold + noDataState: OK + execErrState: OK + for: 1m + annotations: + description: | + Instance {{ $labels.instance }} (job={{ $labels.job }}) has not responded to Prometheus scrapes for more than one minute. + summary: Instance {{ $labels.job }} down + labels: + __converted_prometheus_rule__: "true" + severity: critical + isPaused: false + missing_series_evals_to_resolve: 1 + - orgId: 1 + name: blackbox-probes + folder: alert_rules.yml + interval: 1m + rules: + - uid: c549c658-ce15-5d56-9842-07730bb11e15 + title: BlackboxProbeFailed + condition: threshold + data: + - refId: query + queryType: prometheus + relativeTimeRange: + from: 660 + to: 60 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + expr: probe_success == 0 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + range: false + refId: query + - refId: prometheus_math + queryType: math + datasourceUid: __expr__ + model: + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: 
false + expression: is_number($query) || is_nan($query) || is_inf($query) + intervalMs: 1000 + maxDataPoints: 43200 + refId: prometheus_math + type: math + - refId: threshold + queryType: threshold + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: prometheus_math + intervalMs: 1000 + maxDataPoints: 43200 + refId: threshold + type: threshold + noDataState: OK + execErrState: OK + for: 30s + annotations: + description: | + The Blackbox probe for {{ $labels.instance }} has failed (probe_success = 0). + summary: Blackbox probe failed + labels: + __converted_prometheus_rule__: "true" + severity: critical + isPaused: false + missing_series_evals_to_resolve: 1 + - uid: 78a2ece6-4f7a-5496-9a59-6de4a56db201 + title: BlackboxHighLatency + condition: threshold + data: + - refId: query + queryType: prometheus + relativeTimeRange: + from: 660 + to: 60 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + expr: probe_duration_seconds > 1 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + range: false + refId: query + - refId: prometheus_math + queryType: math + datasourceUid: __expr__ + model: + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: is_number($query) || is_nan($query) || is_inf($query) + 
intervalMs: 1000 + maxDataPoints: 43200 + refId: prometheus_math + type: math + - refId: threshold + queryType: threshold + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: prometheus_math + intervalMs: 1000 + maxDataPoints: 43200 + refId: threshold + type: threshold + noDataState: OK + execErrState: OK + for: 2m + annotations: + description: | + The Blackbox probe to {{ $labels.instance }} has been taking more than 1 second to respond for over 2 minutes. + summary: High latency on a Blackbox probe + labels: + __converted_prometheus_rule__: "true" + severity: warning + isPaused: false + missing_series_evals_to_resolve: 1 + - uid: 00b5d799-0eef-59e9-9371-2a0bfb7df19b + title: BlackboxBadHTTPStatus + condition: threshold + data: + - refId: query + queryType: prometheus + relativeTimeRange: + from: 660 + to: 60 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + expr: probe_http_status_code != 200 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + range: false + refId: query + - refId: prometheus_math + queryType: math + datasourceUid: __expr__ + model: + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: is_number($query) || is_nan($query) || is_inf($query) + intervalMs: 1000 + 
maxDataPoints: 43200 + refId: prometheus_math + type: math + - refId: threshold + queryType: threshold + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: prometheus_math + intervalMs: 1000 + maxDataPoints: 43200 + refId: threshold + type: threshold + noDataState: OK + execErrState: OK + for: 1m + annotations: + description: | + The Blackbox probe to {{ $labels.instance }} is returning HTTP status {{ $value }} different from 200. + summary: Bad HTTP status code on a Blackbox probe + labels: + __converted_prometheus_rule__: "true" + severity: warning + isPaused: false + missing_series_evals_to_resolve: 1 + - orgId: 1 + name: container-resources + folder: alert_rules.yml + interval: 1m + rules: + - uid: 985c697f-e309-524c-9cd4-650a2045c279 + title: HighGlobalCPUUsage + condition: threshold + data: + - refId: query + queryType: prometheus + relativeTimeRange: + from: 660 + to: 60 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + expr: (sum(rate(container_cpu_user_seconds_total[5m])) * 100) > 80 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + range: false + refId: query + - refId: prometheus_math + queryType: math + datasourceUid: __expr__ + model: + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false 
+ expression: is_number($query) || is_nan($query) || is_inf($query) + intervalMs: 1000 + maxDataPoints: 43200 + refId: prometheus_math + type: math + - refId: threshold + queryType: threshold + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: prometheus_math + intervalMs: 1000 + maxDataPoints: 43200 + refId: threshold + type: threshold + noDataState: OK + execErrState: OK + for: 5m + annotations: + description: | + Global CPU usage of containers has been above 80% for more than 5 minutes. Check which services are consuming the most resources. + summary: High global CPU usage for containers + labels: + __converted_prometheus_rule__: "true" + severity: warning + isPaused: false + missing_series_evals_to_resolve: 1 + - uid: 635d0ad1-10f2-51f4-9226-baf56557d870 + title: HighGlobalMemoryUsage + condition: threshold + data: + - refId: query + queryType: prometheus + relativeTimeRange: + from: 660 + to: 60 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + expr: (sum(container_memory_usage_bytes) / sum(machine_memory_bytes)) * 100 > 80 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + range: false + refId: query + - refId: prometheus_math + queryType: math + datasourceUid: __expr__ + model: + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: 
"" + user: "" + withCredentials: false + expression: is_number($query) || is_nan($query) || is_inf($query) + intervalMs: 1000 + maxDataPoints: 43200 + refId: prometheus_math + type: math + - refId: threshold + queryType: threshold + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: prometheus_math + intervalMs: 1000 + maxDataPoints: 43200 + refId: threshold + type: threshold + noDataState: OK + execErrState: OK + for: 5m + annotations: + description: | + Global memory usage of containers has been above 80% for more than 5 minutes. + summary: High global memory usage for containers + labels: + __converted_prometheus_rule__: "true" + severity: warning + isPaused: false + missing_series_evals_to_resolve: 1 + - orgId: 1 + name: per-container-resources + folder: alert_rules.yml + interval: 1m + rules: + - uid: 3daf3f51-d4ad-5169-ace2-cdc1c43d8e4e + title: HighContainerCPUUsage + condition: threshold + data: + - refId: query + queryType: prometheus + relativeTimeRange: + from: 660 + to: 60 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + expr: rate(container_cpu_user_seconds_total[5m]) * 100 > 80 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + range: false + refId: query + - refId: prometheus_math + queryType: math + datasourceUid: __expr__ + model: + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: 
__expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: is_number($query) || is_nan($query) || is_inf($query) + intervalMs: 1000 + maxDataPoints: 43200 + refId: prometheus_math + type: math + - refId: threshold + queryType: threshold + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: prometheus_math + intervalMs: 1000 + maxDataPoints: 43200 + refId: threshold + type: threshold + noDataState: OK + execErrState: OK + for: 5m + annotations: + description: | + Container {{ $labels.name }} has been using more than 80% CPU for more than 5 minutes. 
+ summary: High CPU usage on a container + labels: + __converted_prometheus_rule__: "true" + severity: warning + isPaused: false + missing_series_evals_to_resolve: 1 + - uid: 3202077e-ba84-5401-86fe-0fe6b0a4c26d + title: HighContainerMemoryUsage + condition: threshold + data: + - refId: query + queryType: prometheus + relativeTimeRange: + from: 660 + to: 60 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + expr: container_memory_usage_bytes > 500 * 1024 * 1024 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + range: false + refId: query + - refId: prometheus_math + queryType: math + datasourceUid: __expr__ + model: + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: is_number($query) || is_nan($query) || is_inf($query) + intervalMs: 1000 + maxDataPoints: 43200 + refId: prometheus_math + type: math + - refId: threshold + queryType: threshold + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + datasource: + IsPrunable: false + access: "" + apiVersion: "" + basicAuth: false + basicAuthUser: "" + created: "0001-01-01T00:00:00Z" + database: "" + id: -100 + isDefault: false + jsonData: {} + name: __expr__ + readOnly: false + secureJsonData: {} + type: __expr__ + uid: __expr__ + updated: "0001-01-01T00:00:00Z" + url: "" + user: "" + withCredentials: false + expression: prometheus_math + intervalMs: 1000 + maxDataPoints: 43200 + refId: threshold + type: threshold + noDataState: OK + execErrState: OK + for: 5m + annotations: + description: | + Container {{ $labels.name }} has been using more than 500 MB of RAM for more than 5 minutes. 
Adjust the threshold if necessary. + summary: High memory usage on a container + labels: + __converted_prometheus_rule__: "true" + severity: warning + isPaused: false + missing_series_evals_to_resolve: 1 diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml index cfa0b0d..c0027f9 100644 --- a/monitoring/prometheus/prometheus.yml +++ b/monitoring/prometheus/prometheus.yml @@ -6,10 +6,6 @@ scrape_configs: static_configs: - targets: ['monitoring-prometheus:9090'] - - job_name: 'backend' - static_configs: - - targets: ['127.0.0.1:8888'] - - job_name: 'cadvisor' static_configs: - targets: ['monitoring-cadvisor:8080'] @@ -21,7 +17,6 @@ scrape_configs: static_configs: - targets: - - http://nginx - http://nginx/monitoring/ok - http://auth/monitoring - http://user/monitoring