Alerts


/etc/prometheus/alert_rules.yml > Instances
InstanceDown (0 active)
# InstanceDown: fires for any series whose `up` value has been 0 for a full
# minute. The annotations identify the target via its `instance` and `job`
# labels.
alert: InstanceDown
expr: up == 0
# Hold the alert in "pending" for one minute before it fires.
for: 1m
labels:
  severity: critical
annotations:
  summary: Instance {{ $labels.instance }} down
  # Singular "minute" to agree with the 1m `for:` window above.
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 1 minute.
/etc/prometheus/alert_rules.yml > system-alerts
HighCPUUsage (0 active)
# HighCPUUsage: fires when the non-idle CPU percentage of an instance stays
# above 50 for five minutes.
alert: HighCPUUsage
# The per-second increase of the mode="idle" CPU counter, averaged across all
# series sharing an `instance` label, gives the idle fraction; 100 minus that
# fraction times 100 is the busy percentage.
# rate() is used instead of irate(): irate() only looks at the last two
# samples in the window, so one short idle blip would reset the `for:` timer
# and the alert could stay pending forever under sustained load.
expr: >-
  100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))
  * 100) > 50
for: 5m
labels:
  severity: warning
annotations:
  summary: High CPU usage on {{ $labels.instance }}
  description: CPU usage is above 50% for more than 5 minutes
HighNetworkTraffic (0 active)
# HighNetworkTraffic: fires when the combined receive + transmit byte rate
# (each taken over a 5m window) stays above 1e+07 bytes per second for five
# minutes.
alert: HighNetworkTraffic
expr: >-
  (rate(node_network_receive_bytes_total[5m])
  + rate(node_network_transmit_bytes_total[5m])) > 1e+07
for: 5m
labels:
  severity: 'warning'
annotations:
  summary: 'High network traffic on {{ $labels.instance }}'
  description: 'Network traffic exceeds 10MB/s over 5 minutes'
HighSwapUsage (0 active)
# HighSwapUsage: fires when more than half of the configured swap space has
# been in use for five minutes.
alert: HighSwapUsage
# Used swap is derived as SwapTotal - SwapFree. The previous expression read
# node_memory_SwapUsed_bytes, which the Linux meminfo collector does not
# export (/proc/meminfo has no SwapUsed field), so the rule matched no series
# and could never fire.
# NOTE(review): if this environment provides node_memory_SwapUsed_bytes from
# another source, confirm before replacing it.
# Hosts with no swap evaluate to 0/0 = NaN, which the `> 0.5` filter drops.
expr: >-
  (node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes)
  / node_memory_SwapTotal_bytes > 0.5
for: 5m
labels:
  severity: warning
annotations:
  summary: High swap usage on {{ $labels.instance }}
  description: Swap usage exceeds 50%
LowMemoryAvailable (0 active)
# LowMemoryAvailable: fires when the ratio of MemAvailable to MemTotal has
# been below 0.25 for five minutes.
alert: LowMemoryAvailable
expr: >-
  node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.25
for: 5m
labels:
  severity: 'critical'
annotations:
  summary: 'Low memory available on {{ $labels.instance }}'
  description: 'Less than 25% memory available'