---
# Prometheus rule group: recording rules that derive French local time
# (including daylight saving), probe_success-based availability alerts, and
# hourly-GMV alerts.
#
# Prometheus date functions (month(), hour(), ...) work in UTC; the two
# recording rules below exist so that the GMV alerts can reason in French
# local time.
groups:
  - name: example
    rules:

      # CET / CEST
      # Records 1 while European summer time (CEST) applies and 0 otherwise.
      # The branches of the expression, in order:
      #   1. April through September: always summer time.
      #   2. March, once the last Sunday has passed.
      #   3. October, before the last Sunday.
      #   4. On the last Sunday itself (day_of_month() >= 25 and Sunday):
      #      summer time from 01:00 UTC in March, until 01:00 UTC in October.
      #   5. `vector(0)` fallback so the series always has a value.
      # `day_of_month() - day_of_week()` is the date of the most recent Sunday,
      # so `>= 25` means the last Sunday of a 31-day month has been reached.
      # The `absent(...)` term removes the last Sunday itself from branches 2
      # and 3, because branch 4 handles that day hour by hour.
      - record: is_european_summer_time
        expr: |
          (vector(1) and (month() > 3 and month() < 10))
          or
          (vector(1) and (month() == 3 and (day_of_month() - day_of_week()) >= 25) and absent((day_of_month() >= 25) and (day_of_week() == 0)))
          or
          (vector(1) and (month() == 10 and (day_of_month() - day_of_week()) < 25) and absent((day_of_month() >= 25) and (day_of_week() == 0)))
          or
          (vector(1) and ((month() == 10 and hour() < 1) or (month() == 3 and hour() > 0)) and ((day_of_month() >= 25) and (day_of_week() == 0)))
          or
          vector(0)

      # French time (UTC+1) CET / CEST
      # Current Unix time shifted by +1h, plus another hour while
      # is_european_summer_time is 1, so that hour(european_french_time)
      # yields the French wall-clock hour.
      - record: european_french_time
        expr: time() + 3600 + 3600 * is_european_summer_time

      # Alert for any instance that is unreachable for a few seconds.
      - alert: InstanceDown-01-low
        expr: probe_success == 0
        for: 30s
        labels:
          severity: "low"
          type: "timeout"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for a few seconds."

      # Alert for any instance that is unreachable for some time.
      # The description now states 5 minutes to match `for: 5m`
      # (it previously claimed 10 minutes).
      - alert: InstanceDown-02-medium
        expr: probe_success == 0
        for: 5m
        labels:
          severity: "medium"
          type: "timeout"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for 5 minutes"

      # Alert for any instance that is unreachable for a long time.
      - alert: InstanceDown-03-high
        expr: probe_success == 0
        for: 1h
        labels:
          severity: "high"
          type: "timeout"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for 1 hour"

      # Alert for any instance that is unreachable for a very long time.
      - alert: InstanceDown-04-critical
        expr: probe_success == 0
        for: 12h
        labels:
          severity: "critical"
          type: "timeout"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 12 hours"

      # ---------------------------------------------------------------------
      # GMV alerts
      #
      # NOTE(review): the descriptions below are single-quoted YAML scalars,
      # so each `\n` is a literal backslash followed by `n`, not a newline,
      # and the physical line breaks fold into single spaces. The values are
      # kept as written; confirm that the notification template expands `\n`.
      #
      # NOTE(review): several comments say "on weekdays", but no expression
      # filters on day_of_week(); the alerts are evaluated every day.
      # ---------------------------------------------------------------------

      # Alert for GMV < 250€ from 8AM to 10PM on weekdays
      - alert: "GMV (daytime) below lower threshold"
        # Prometheus time is GMT, hence the use of european_french_time.
        # PromQL comparisons filter and chain left to right, so `> 8 < 22`
        # keeps only the hours 9..21 (French time).
        expr: gmv_hourly_return_value < 250 and ON() hour(european_french_time) > 8 < 22
        for: 1h
        labels:
          severity: "high"
          type: "lower than static threshold"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` has been *Lower than 250€*, for more than 1 hour.
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)'

      # Alert for GMV too low (under static 50€) over night 10PM to 8AM
      - alert: "GMV nightly below lower threshold"
        # Prometheus time is GMT, hence the use of european_french_time.
        # The previous form `hour(...) < 8 > 22` chained two filters on the
        # same value: it kept hours below 8 and then required them to be
        # above 22, which is never true, so the alert could not fire.
        # The two night-time ranges are now combined with `or`; the original
        # strict bounds are preserved (hours 23 and 0..7, French time).
        expr: |
          gmv_hourly_return_value < 50
          and ON() (hour(european_french_time) < 8 or hour(european_french_time) > 22)
        for: 4h
        labels:
          severity: "low"
          type: "lower than static threshold"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` (night) has been *Lower than 50€*, for more than 4 hour.
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)
            \n
            \nGMV is usually very low between 23:30 and 05:00, but this still may require attention'

      # Alert for GMV significantly lower (<33%) than mean value over last 4 weeks from 8AM to 10PM on weekdays
      - alert: "GMV less than 33% compared to last 4 weeks"
        # Prometheus time is GMT, hence the use of european_french_time.
        # `(...) * .25` is the mean of the four weekly reference series;
        # `> 8 < 22` keeps only the hours 9..21 (French time).
        expr: |
          gmv_hourly_return_value
            < .33 * (
                gmv_hourly_oneweekago_return_value
              + gmv_hourly_twoweeksago_return_value
              + gmv_hourly_threeweeksago_return_value
              + gmv_hourly_fourweeksago_return_value
            ) * .25
          and ON() hour(european_french_time) > 8 < 22
        for: 1h
        labels:
          severity: "low"
          type: "lower than last 4 weeks"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` has been *significantly lower than usual*, for more than 1 hour.
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)'
          # Extra annotation that re-queries the 4-week mean at notification time.
          query: '>Mean value observed this month is {{ range query "((gmv_hourly_oneweekago_return_value + gmv_hourly_twoweeksago_return_value + gmv_hourly_threeweeksago_return_value + gmv_hourly_fourweeksago_return_value ) * .25)" }}*{{ .Value | printf "%.2f" }}*{{ end }} (same day of the week, same hour)'

      # Alert for GMV critically lower (<20%) than mean value over last 4 weeks from 8AM to 10PM on weekdays
      - alert: "GMV less than 20% compared to last 4 weeks"
        # Prometheus time is GMT, hence the use of european_french_time.
        # `(...) * .25` is the mean of the four weekly reference series;
        # `> 8 < 22` keeps only the hours 9..21 (French time).
        expr: |
          gmv_hourly_return_value
            < .20 * (
                gmv_hourly_oneweekago_return_value
              + gmv_hourly_twoweeksago_return_value
              + gmv_hourly_threeweeksago_return_value
              + gmv_hourly_fourweeksago_return_value
            ) * .25
          and ON() hour(european_french_time) > 8 < 22
        for: 1h
        labels:
          severity: "high"
          type: "low over last 4 weeks"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` has been *critically lower than usual*, for more than 1 hour.
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)'
          # Extra annotation that re-queries the 4-week mean at notification time.
          query: '>Mean value observed this month is {{ range query "((gmv_hourly_oneweekago_return_value + gmv_hourly_twoweeksago_return_value + gmv_hourly_threeweeksago_return_value + gmv_hourly_fourweeksago_return_value ) * .25)" }}*{{ .Value | printf "%.2f" }}*{{ end }} (same day of the week, same hour)'

      # Alert for GMV suspiciously higher (>500%) than mean value over last 4 weeks from 8AM to 10PM on weekdays
      - alert: "GMV more than 500% compared to last 4 weeks"
        # Prometheus time is GMT, hence the use of european_french_time.
        # `(...) * .25` is the mean of the four weekly reference series;
        # `> 8 < 22` keeps only the hours 9..21 (French time).
        expr: |
          gmv_hourly_return_value
            > 5 * (
                gmv_hourly_oneweekago_return_value
              + gmv_hourly_twoweeksago_return_value
              + gmv_hourly_threeweeksago_return_value
              + gmv_hourly_fourweeksago_return_value
            ) * .25
          and ON() hour(european_french_time) > 8 < 22
        for: 1h
        labels:
          severity: "low"
          # Was "low over last 4 weeks", copied from the 20% rule; this alert
          # fires when GMV is too HIGH.
          # NOTE(review): confirm no Alertmanager route matches the old value.
          type: "high over last 4 weeks"
          instance: "Hourly GMV"
        annotations:
          summary: "{{ $labels.instance }} GMV alert"
          description: '`Hourly GMV` has been *much higher than usual*, for more than 1 hour.
            \n(If there is an ongoing sale, it is most probably ok)
            \n
            \n> Current value is *{{ .Value | printf "%.2f" }}* (over the last hour)'
          # Extra annotation that re-queries the 4-week mean at notification time.
          query: '>Mean value observed this month is {{ range query "((gmv_hourly_oneweekago_return_value + gmv_hourly_twoweeksago_return_value + gmv_hourly_threeweeksago_return_value + gmv_hourly_fourweeksago_return_value ) * .25)" }}*{{ .Value | printf "%.2f" }}*{{ end }} (same day of the week, same hour)'