# Source listing: ft_transcendence/monitoring/grafana/alerting/rules.yaml (782 lines, 27 KiB, YAML)

# Grafana alert-rule provisioning file (apiVersion 1).
#
# The rule in this group uses a three-step pipeline in `data`:
#   query           - instant Prometheus query; the PromQL comparison keeps
#                     only the series for which it holds
#   prometheus_math - math expression over $query; presumably yields a
#                     non-zero value for every returned sample (TODO confirm)
#   threshold       - the rule's `condition`; true when prometheus_math > 0
# relativeTimeRange 660..60 is presumably seconds before evaluation time
# (TODO confirm).
apiVersion: 1
groups:
# Group "availability": scrape-target liveness, evaluated every minute.
- orgId: 1
  name: availability
  folder: alert_rules.yml
  interval: 1m
  rules:
  # InstanceDown (critical): fires for each series where `up == 0` once the
  # condition has held for the 1m pending period (`for`).
  - uid: 14db4fe7-faf3-5629-9ee1-c5c189d75fec
    title: InstanceDown
    condition: threshold
    data:
    - refId: query
      queryType: prometheus
      relativeTimeRange:
        from: 660
        to: 60
      datasourceUid: prometheus
      model:
        datasource:
          type: prometheus
          uid: prometheus
        expr: up == 0
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        range: false
        refId: query
    - refId: prometheus_math
      queryType: math
      datasourceUid: __expr__
      model:
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: is_number($query) || is_nan($query) || is_inf($query)
        intervalMs: 1000
        maxDataPoints: 43200
        refId: prometheus_math
        type: math
    - refId: threshold
      queryType: threshold
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 0
            type: gt
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: prometheus_math
        intervalMs: 1000
        maxDataPoints: 43200
        refId: threshold
        type: threshold
    # An empty result or a query error leaves the rule in the OK state.
    noDataState: OK
    execErrState: OK
    for: 1m
    annotations:
      description: |
        Instance {{ $labels.instance }} (job={{ $labels.job }}) has not responded to Prometheus scrapes for more than one minute.
      # NOTE(review): the summary interpolates the job label while the
      # description names the instance - confirm this is intended.
      summary: Instance {{ $labels.job }} down
    labels:
      __converted_prometheus_rule__: "true"
      severity: critical
    isPaused: false
    missing_series_evals_to_resolve: 1
# Group "blackbox-probes": Blackbox exporter probe health, evaluated every
# minute. Each rule uses the same three-step pipeline in `data`:
#   query           - instant Prometheus query; the PromQL comparison keeps
#                     only the series for which it holds
#   prometheus_math - math expression over $query; presumably yields a
#                     non-zero value for every returned sample (TODO confirm)
#   threshold       - the rule's `condition`; true when prometheus_math > 0
# noDataState/execErrState are OK on every rule, so an empty result or a
# query error leaves the rule in the OK state.
- orgId: 1
  name: blackbox-probes
  folder: alert_rules.yml
  interval: 1m
  rules:
  # BlackboxProbeFailed (critical): a probe reports probe_success == 0.
  # NOTE(review): `for: 30s` is shorter than the 1m group interval, so the
  # pending period presumably elapses at the next evaluation - confirm.
  - uid: c549c658-ce15-5d56-9842-07730bb11e15
    title: BlackboxProbeFailed
    condition: threshold
    data:
    - refId: query
      queryType: prometheus
      relativeTimeRange:
        from: 660
        to: 60
      datasourceUid: prometheus
      model:
        datasource:
          type: prometheus
          uid: prometheus
        expr: probe_success == 0
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        range: false
        refId: query
    - refId: prometheus_math
      queryType: math
      datasourceUid: __expr__
      model:
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: is_number($query) || is_nan($query) || is_inf($query)
        intervalMs: 1000
        maxDataPoints: 43200
        refId: prometheus_math
        type: math
    - refId: threshold
      queryType: threshold
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 0
            type: gt
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: prometheus_math
        intervalMs: 1000
        maxDataPoints: 43200
        refId: threshold
        type: threshold
    noDataState: OK
    execErrState: OK
    for: 30s
    annotations:
      description: |
        The Blackbox probe for {{ $labels.instance }} has failed (probe_success = 0).
      summary: Blackbox probe failed
    labels:
      __converted_prometheus_rule__: "true"
      severity: critical
    isPaused: false
    missing_series_evals_to_resolve: 1
  # BlackboxHighLatency (warning): probe_duration_seconds stays above 1 for
  # the 2m pending period.
  - uid: 78a2ece6-4f7a-5496-9a59-6de4a56db201
    title: BlackboxHighLatency
    condition: threshold
    data:
    - refId: query
      queryType: prometheus
      relativeTimeRange:
        from: 660
        to: 60
      datasourceUid: prometheus
      model:
        datasource:
          type: prometheus
          uid: prometheus
        expr: probe_duration_seconds > 1
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        range: false
        refId: query
    - refId: prometheus_math
      queryType: math
      datasourceUid: __expr__
      model:
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: is_number($query) || is_nan($query) || is_inf($query)
        intervalMs: 1000
        maxDataPoints: 43200
        refId: prometheus_math
        type: math
    - refId: threshold
      queryType: threshold
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 0
            type: gt
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: prometheus_math
        intervalMs: 1000
        maxDataPoints: 43200
        refId: threshold
        type: threshold
    noDataState: OK
    execErrState: OK
    for: 2m
    annotations:
      description: |
        The Blackbox probe to {{ $labels.instance }} has been taking more than 1 second to respond for over 2 minutes.
      summary: High latency on a Blackbox probe
    labels:
      __converted_prometheus_rule__: "true"
      severity: warning
    isPaused: false
    missing_series_evals_to_resolve: 1
  # BlackboxBadHTTPStatus (warning): probe_http_status_code differs from 200
  # for the 1m pending period.
  - uid: 00b5d799-0eef-59e9-9371-2a0bfb7df19b
    title: BlackboxBadHTTPStatus
    condition: threshold
    data:
    - refId: query
      queryType: prometheus
      relativeTimeRange:
        from: 660
        to: 60
      datasourceUid: prometheus
      model:
        datasource:
          type: prometheus
          uid: prometheus
        expr: probe_http_status_code != 200
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        range: false
        refId: query
    - refId: prometheus_math
      queryType: math
      datasourceUid: __expr__
      model:
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: is_number($query) || is_nan($query) || is_inf($query)
        intervalMs: 1000
        maxDataPoints: 43200
        refId: prometheus_math
        type: math
    - refId: threshold
      queryType: threshold
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 0
            type: gt
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: prometheus_math
        intervalMs: 1000
        maxDataPoints: 43200
        refId: threshold
        type: threshold
    noDataState: OK
    execErrState: OK
    for: 1m
    annotations:
      description: |
        The Blackbox probe to {{ $labels.instance }} is returning HTTP status {{ $value }} different from 200.
      summary: Bad HTTP status code on a Blackbox probe
    labels:
      __converted_prometheus_rule__: "true"
      severity: warning
    isPaused: false
    missing_series_evals_to_resolve: 1
# Group "container-resources": host-wide container CPU and memory pressure,
# evaluated every minute. Each rule uses the same three-step pipeline:
#   query           - instant Prometheus query; the PromQL comparison keeps
#                     only the series for which it holds
#   prometheus_math - math expression over $query; presumably yields a
#                     non-zero value for every returned sample (TODO confirm)
#   threshold       - the rule's `condition`; true when prometheus_math > 0
# noDataState/execErrState are OK on every rule, so an empty result or a
# query error leaves the rule in the OK state.
#
# Both queries select `name!=""`: cAdvisor also exports the root ("/") and
# parent cgroups, which already contain the containers' usage, so summing
# every series counts the same usage more than once. This assumes real
# containers carry a non-empty `name` label (true for Docker) - TODO confirm
# for other runtimes.
- orgId: 1
  name: container-resources
  folder: alert_rules.yml
  interval: 1m
  rules:
  # HighGlobalCPUUsage (warning): container CPU as a percentage of all machine
  # cores stays above 80 for 5m. Dividing by machine_cpu_cores makes "80%"
  # mean 80% of the host, mirroring how the memory rule divides by
  # machine_memory_bytes.
  # NOTE(review): container_cpu_user_seconds_total counts user-mode time only;
  # container_cpu_usage_seconds_total would include system time - confirm.
  - uid: 985c697f-e309-524c-9cd4-650a2045c279
    title: HighGlobalCPUUsage
    condition: threshold
    data:
    - refId: query
      queryType: prometheus
      relativeTimeRange:
        from: 660
        to: 60
      datasourceUid: prometheus
      model:
        datasource:
          type: prometheus
          uid: prometheus
        expr: '(sum(rate(container_cpu_user_seconds_total{name!=""}[5m])) / sum(machine_cpu_cores)) * 100 > 80'
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        range: false
        refId: query
    - refId: prometheus_math
      queryType: math
      datasourceUid: __expr__
      model:
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: is_number($query) || is_nan($query) || is_inf($query)
        intervalMs: 1000
        maxDataPoints: 43200
        refId: prometheus_math
        type: math
    - refId: threshold
      queryType: threshold
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 0
            type: gt
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: prometheus_math
        intervalMs: 1000
        maxDataPoints: 43200
        refId: threshold
        type: threshold
    noDataState: OK
    execErrState: OK
    for: 5m
    annotations:
      description: |
        Global CPU usage of containers has been above 80% for more than 5 minutes. Check which services are consuming the most resources.
      summary: High global CPU usage for containers
    labels:
      __converted_prometheus_rule__: "true"
      severity: warning
    isPaused: false
    missing_series_evals_to_resolve: 1
  # HighGlobalMemoryUsage (warning): memory used by named containers as a
  # percentage of machine memory stays above 80 for 5m.
  - uid: 635d0ad1-10f2-51f4-9226-baf56557d870
    title: HighGlobalMemoryUsage
    condition: threshold
    data:
    - refId: query
      queryType: prometheus
      relativeTimeRange:
        from: 660
        to: 60
      datasourceUid: prometheus
      model:
        datasource:
          type: prometheus
          uid: prometheus
        expr: '(sum(container_memory_usage_bytes{name!=""}) / sum(machine_memory_bytes)) * 100 > 80'
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        range: false
        refId: query
    - refId: prometheus_math
      queryType: math
      datasourceUid: __expr__
      model:
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: is_number($query) || is_nan($query) || is_inf($query)
        intervalMs: 1000
        maxDataPoints: 43200
        refId: prometheus_math
        type: math
    - refId: threshold
      queryType: threshold
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 0
            type: gt
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: prometheus_math
        intervalMs: 1000
        maxDataPoints: 43200
        refId: threshold
        type: threshold
    noDataState: OK
    execErrState: OK
    for: 5m
    annotations:
      description: |
        Global memory usage of containers has been above 80% for more than 5 minutes.
      summary: High global memory usage for containers
    labels:
      __converted_prometheus_rule__: "true"
      severity: warning
    isPaused: false
    missing_series_evals_to_resolve: 1
# Group "per-container-resources": per-container CPU and memory limits,
# evaluated every minute. Each rule uses the same three-step pipeline:
#   query           - instant Prometheus query; the PromQL comparison keeps
#                     only the series for which it holds
#   prometheus_math - math expression over $query; presumably yields a
#                     non-zero value for every returned sample (TODO confirm)
#   threshold       - the rule's `condition`; true when prometheus_math > 0
# noDataState/execErrState are OK on every rule, so an empty result or a
# query error leaves the rule in the OK state.
#
# Both queries select `name!=""`: cAdvisor also exports the root ("/") and
# parent cgroups, which have no container name. Without the filter those
# host-level aggregates raise alerts whose `{{ $labels.name }}` renders
# empty. This assumes real containers carry a non-empty `name` label (true
# for Docker) - TODO confirm for other runtimes.
- orgId: 1
  name: per-container-resources
  folder: alert_rules.yml
  interval: 1m
  rules:
  # HighContainerCPUUsage (warning): a named container's user-CPU rate over
  # 5m, times 100, stays above 80 (i.e. 0.8 of one core) for 5m.
  # NOTE(review): container_cpu_user_seconds_total counts user-mode time only;
  # container_cpu_usage_seconds_total would include system time - confirm.
  - uid: 3daf3f51-d4ad-5169-ace2-cdc1c43d8e4e
    title: HighContainerCPUUsage
    condition: threshold
    data:
    - refId: query
      queryType: prometheus
      relativeTimeRange:
        from: 660
        to: 60
      datasourceUid: prometheus
      model:
        datasource:
          type: prometheus
          uid: prometheus
        expr: 'rate(container_cpu_user_seconds_total{name!=""}[5m]) * 100 > 80'
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        range: false
        refId: query
    - refId: prometheus_math
      queryType: math
      datasourceUid: __expr__
      model:
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: is_number($query) || is_nan($query) || is_inf($query)
        intervalMs: 1000
        maxDataPoints: 43200
        refId: prometheus_math
        type: math
    - refId: threshold
      queryType: threshold
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 0
            type: gt
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: prometheus_math
        intervalMs: 1000
        maxDataPoints: 43200
        refId: threshold
        type: threshold
    noDataState: OK
    execErrState: OK
    for: 5m
    annotations:
      description: |
        Container {{ $labels.name }} has been using more than 80% CPU for more than 5 minutes.
      summary: High CPU usage on a container
    labels:
      __converted_prometheus_rule__: "true"
      severity: warning
    isPaused: false
    missing_series_evals_to_resolve: 1
  # HighContainerMemoryUsage (warning): a named container's memory usage stays
  # above 500 * 1024 * 1024 bytes for 5m.
  - uid: 3202077e-ba84-5401-86fe-0fe6b0a4c26d
    title: HighContainerMemoryUsage
    condition: threshold
    data:
    - refId: query
      queryType: prometheus
      relativeTimeRange:
        from: 660
        to: 60
      datasourceUid: prometheus
      model:
        datasource:
          type: prometheus
          uid: prometheus
        expr: 'container_memory_usage_bytes{name!=""} > 500 * 1024 * 1024'
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        range: false
        refId: query
    - refId: prometheus_math
      queryType: math
      datasourceUid: __expr__
      model:
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: is_number($query) || is_nan($query) || is_inf($query)
        intervalMs: 1000
        maxDataPoints: 43200
        refId: prometheus_math
        type: math
    - refId: threshold
      queryType: threshold
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 0
            type: gt
        datasource:
          IsPrunable: false
          access: ""
          apiVersion: ""
          basicAuth: false
          basicAuthUser: ""
          created: "0001-01-01T00:00:00Z"
          database: ""
          id: -100
          isDefault: false
          jsonData: {}
          name: __expr__
          readOnly: false
          secureJsonData: {}
          type: __expr__
          uid: __expr__
          updated: "0001-01-01T00:00:00Z"
          url: ""
          user: ""
          withCredentials: false
        expression: prometheus_math
        intervalMs: 1000
        maxDataPoints: 43200
        refId: threshold
        type: threshold
    noDataState: OK
    execErrState: OK
    for: 5m
    annotations:
      description: |
        Container {{ $labels.name }} has been using more than 500 MB of RAM for more than 5 minutes. Adjust the threshold if necessary.
      summary: High memory usage on a container
    labels:
      __converted_prometheus_rule__: "true"
      severity: warning
    isPaused: false
    missing_series_evals_to_resolve: 1