# myos/docker/prometheus/prometheus/alert-rules.yml

# Prometheus rule group: two time-helper recording rules, followed by
# availability (probe_success) alerts and hourly-GMV alerts that use the helpers.
groups:
  - name: example
    rules:
      # 1 while European summer time (CEST) applies, 0 otherwise (CET).
      # All date functions are evaluated in UTC. The `or` branches are tried in
      # order and the final `vector(0)` is the fallback:
      #   1. April through September;
      #   2. March, once the Sunday falling on or after the 25th has passed
      #      (`day_of_month() - day_of_week()` is the date of the most recent
      #      Sunday), excluding that Sunday itself;
      #   3. October, before the Sunday falling on or after the 25th;
      #   4. on that Sunday itself: from 01:00 UTC in March, before 01:00 UTC
      #      in October.
      - record: is_european_summer_time
        expr: |
          (vector(1) and (month() > 3 and month() < 10))
          or
          (vector(1) and (month() == 3 and (day_of_month() - day_of_week()) >= 25) and absent((day_of_month() >= 25) and (day_of_week() == 0)))
          or
          (vector(1) and (month() == 10 and (day_of_month() - day_of_week()) < 25) and absent((day_of_month() >= 25) and (day_of_week() == 0)))
          or
          (vector(1) and ((month() == 10 and hour() < 1) or (month() == 3 and hour() > 0)) and ((day_of_month() >= 25) and (day_of_week() == 0)))
          or
          vector(0)

      # French local time expressed as a shifted Unix timestamp: UTC + 1 h
      # (CET), plus one more hour while is_european_summer_time is 1 (CEST).
      # Pass it to hour() to obtain the local hour.
      - record: european_french_time
        expr: time() + 3600 + 3600 * is_european_summer_time

      # Availability alerts: the same condition (probe_success == 0) escalates
      # through four severities as the `for:` duration grows.

      # Unreachable for 30 seconds.
      - alert: InstanceDown-01-low
        expr: probe_success == 0
        for: 30s
        labels:
          severity: "low"
          type: "timeout"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for a few seconds."

      # Unreachable for 5 minutes. The description now states the same
      # duration as `for:` (it previously said 10 minutes).
      - alert: InstanceDown-02-medium
        expr: probe_success == 0
        for: 5m
        labels:
          severity: "medium"
          type: "timeout"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for 5 minutes"

      # Unreachable for 1 hour.
      - alert: InstanceDown-03-high
        expr: probe_success == 0
        for: 1h
        labels:
          severity: "high"
          type: "timeout"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for 1 hour"

      # Unreachable for 12 hours.
      - alert: InstanceDown-04-critical
        expr: probe_success == 0
        for: 12h
        labels:
          severity: "critical"
          type: "timeout"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 12 hours"

      # Hourly-GMV alerts. Prometheus evaluates in UTC; european_french_time
      # shifts the evaluation time to French local time, and
      # `and ON() hour(...) > 8 < 22` keeps a daytime alert active only while
      # the local hour is 9..21 (09:00-21:59).
      # NOTE(review): the previous comments described the daytime window as
      # "8AM to 10PM on weekdays", but `> 8` excludes the 08:00 hour and no
      # day_of_week() filter is applied - confirm which is intended.
      # NOTE(review): the descriptions are single-quoted YAML scalars, so `\n`
      # is passed on as a literal backslash-n rather than a line break -
      # confirm the notification receiver expects that.

      # Daytime: GMV below 250€ for 1 hour.
      - alert: "GMV (daytime) below lower threshold"
        expr: gmv_hourly_return_value < 250 and ON() hour(european_french_time) > 8 < 22
        for: 1h
        labels:
          severity: "high"
          type: "lower than static threshold"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` has been *Lower than 250€*, for more than 1 hour.
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)'

      # Night: GMV below 50€ for 4 hours while the local hour is 22..23 or
      # 0..7 (10PM to 8AM).
      # The hour test has to be an `or` of two filters: chaining them
      # (`< 8 > 22`) keeps only hours that are both below 8 and above 22, which
      # never matches, so the alert could not fire.
      - alert: "GMV nightly below lower threshold"
        expr: >-
          gmv_hourly_return_value < 50
          and ON() (hour(european_french_time) < 8 or hour(european_french_time) >= 22)
        for: 4h
        labels:
          severity: "low"
          type: "lower than static threshold"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` (night) has been *Lower than 50€*, for more than 4 hours.
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)
            \n
            \nGMV is usually very low between 23:30 and 05:00, but this still may require attention'

      # Daytime: GMV below 33% of the mean of the same-hour values recorded
      # one, two, three and four weeks ago, for 1 hour.
      - alert: "GMV less than 33% compared to last 4 weeks"
        expr: >-
          gmv_hourly_return_value < .33 * (
          gmv_hourly_oneweekago_return_value
          + gmv_hourly_twoweeksago_return_value
          + gmv_hourly_threeweeksago_return_value
          + gmv_hourly_fourweeksago_return_value
          ) * .25
          and ON() hour(european_french_time) > 8 < 22
        for: 1h
        labels:
          severity: "low"
          type: "lower than last 4 weeks"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` has been *significantly lower than usual*, for more than 1 hour.
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)'
          query: '>Mean value observed this month is {{ range query "((gmv_hourly_oneweekago_return_value + gmv_hourly_twoweeksago_return_value + gmv_hourly_threeweeksago_return_value + gmv_hourly_fourweeksago_return_value ) * .25)" }}*{{ .Value | printf "%.2f" }}*{{ end }} (same day of the week, same hour)'

      # Daytime: GMV below 20% of the same four-week mean, for 1 hour.
      - alert: "GMV less than 20% compared to last 4 weeks"
        expr: >-
          gmv_hourly_return_value < .20 * (
          gmv_hourly_oneweekago_return_value
          + gmv_hourly_twoweeksago_return_value
          + gmv_hourly_threeweeksago_return_value
          + gmv_hourly_fourweeksago_return_value
          ) * .25
          and ON() hour(european_french_time) > 8 < 22
        for: 1h
        labels:
          severity: "high"
          type: "low over last 4 weeks"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` has been *critically lower than usual*, for more than 1 hour.
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)'
          query: '>Mean value observed this month is {{ range query "((gmv_hourly_oneweekago_return_value + gmv_hourly_twoweeksago_return_value + gmv_hourly_threeweeksago_return_value + gmv_hourly_fourweeksago_return_value ) * .25)" }}*{{ .Value | printf "%.2f" }}*{{ end }} (same day of the week, same hour)'

      # Daytime: GMV above 5x (500%) the same four-week mean, for 1 hour.
      # The `type` label now says "higher"; it previously carried the "low ..."
      # value copied from the alert above.
      - alert: "GMV more than 500% compared to last 4 weeks"
        expr: >-
          gmv_hourly_return_value > 5 * (
          gmv_hourly_oneweekago_return_value
          + gmv_hourly_twoweeksago_return_value
          + gmv_hourly_threeweeksago_return_value
          + gmv_hourly_fourweeksago_return_value
          ) * .25
          and ON() hour(european_french_time) > 8 < 22
        for: 1h
        labels:
          severity: "low"
          type: "higher than last 4 weeks"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` has been *much higher than usual*, for more than 1 hour.
            \n(If there is an ongoing sale, it is most probably ok)
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)'
          query: '>Mean value observed this month is {{ range query "((gmv_hourly_oneweekago_return_value + gmv_hourly_twoweeksago_return_value + gmv_hourly_threeweeksago_return_value + gmv_hourly_fourweeksago_return_value ) * .25)" }}*{{ .Value | printf "%.2f" }}*{{ end }} (same day of the week, same hour)'