Alerts


/etc/prometheus/rules/blackbox.yml > blackbox_tcp
BlackboxTCPFailure (0 active)
# Fires when probe_success has been 0 for 20 seconds; tagged severity=critical.
# NOTE(review): the selector carries no job/module matcher, so every
# probe_success series with value 0 matches — confirm only TCP probes export it.
alert: BlackboxTCPFailure
expr: 'probe_success == 0'
for: 20s
labels:
  severity: critical
annotations:
  description: >-
    Blackbox TCP probe to {{ $labels.instance }}
    (service: {{ $labels.service }}) failed.
  summary: 'TCP connectivity failed for {{ $labels.instance }}'
/etc/prometheus/rules/extra_checks.yml > extra-checks
DotnetMissing (0 active)
# Fires when dotnet_exists has been 0 for one minute; tagged severity=critical.
alert: DotnetMissing
expr: 'dotnet_exists == 0'
for: 1m
labels:
  severity: critical
annotations:
  description: >-
    .NET runtime is NOT installed on instance {{ $labels.instance }}.
  summary: '.NET runtime missing'
ServiceDoesNotExist (0 active)
# Fires when service_exists has been 0 for one minute; tagged severity=critical.
# The annotations read the `service` and `instance` labels of the firing series.
alert: ServiceDoesNotExist
expr: 'service_exists == 0'
for: 1m
labels:
  severity: critical
annotations:
  description: >-
    Systemd service '{{ $labels.service }}' is missing
    on instance {{ $labels.instance }}.
  summary: 'Service {{ $labels.service }} does NOT exist'
/etc/prometheus/rules/main.yml > nginx_alerts
NginxExporterDown (0 active)
# Fires when the `up` series for job "nginx" has been 0 for one minute;
# tagged severity=critical.
alert: NginxExporterDown
expr: 'up{job="nginx"} == 0'
for: 1m
labels:
  severity: critical
annotations:
  description: 'Prometheus cannot scrape NGINX metrics.'
  summary: "NGINX exporter down: {{ $labels.instance }}"
NginxHighConnections (0 active)
# Fires when nginx_connections_active stays above 1000 for two minutes;
# tagged severity=warning. The description text repeats both thresholds,
# so keep it in sync with `expr` and `for` when either changes.
alert: NginxHighConnections
expr: 'nginx_connections_active > 1000'
for: 2m
labels:
  severity: warning
annotations:
  description: 'Active connections > 1000 for 2 minutes.'
  summary: 'High NGINX connections on {{ $labels.instance }}'
/etc/prometheus/rules/main.yml > node_exporter_alerts
HighCPUUsage (0 active)
# Fires when per-instance CPU usage stays above 90% for five minutes;
# tagged severity=warning.
#
# Usage is computed as 100 minus the idle percentage: the per-second increase
# of node_cpu_seconds_total{mode="idle"} over a 5m window, averaged across the
# series of each instance and scaled to percent.
#
# rate() is used instead of irate(): irate() only considers the last two
# samples in the window, so a single brief dip can reset the `for` timer and
# the alert may never fire. rate() averages over the whole 5m window.
alert: HighCPUUsage
expr: >-
  (100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  > 90
for: 5m
labels:
  severity: warning
annotations:
  description: 'CPU usage is above 90% for 5 minutes.'
  summary: 'High CPU usage on {{ $labels.instance }}'
HighDiskUsage (0 active)
# Fires when a filesystem's used percentage stays above 90 for ten minutes;
# tagged severity=warning.
#
# Used percentage is 100 * (1 - avail / size). Both operands select
# job="node_exporter" and exclude fstype tmpfs/overlay as well as mountpoints
# matching /run.* or /var/lib/docker.*; the two selectors must stay identical
# so the series on either side of the division line up.
alert: HighDiskUsage
expr: >-
  100 * (1
  - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay",job="node_exporter",mountpoint!~"/run.*|/var/lib/docker.*"}
  / node_filesystem_size_bytes{fstype!~"tmpfs|overlay",job="node_exporter",mountpoint!~"/run.*|/var/lib/docker.*"})
  > 90
for: 10m
labels:
  severity: warning
annotations:
  description: >-
    Filesystem {{ $labels.device }} mounted at {{ $labels.mountpoint }}
    is over 90% full.
  summary: 'Disk usage high on {{ $labels.instance }} ({{ $labels.mountpoint }})'
HighMemoryUsage (0 active)
# Fires when memory usage stays above 90% for five minutes; tagged
# severity=warning. Usage is (1 - MemAvailable / MemTotal) scaled to percent.
alert: HighMemoryUsage
expr: >-
  (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
  > 90
for: 5m
labels:
  severity: warning
annotations:
  description: 'Memory usage is above 90%.'
  summary: 'High memory usage on {{ $labels.instance }}'
NodeDown (0 active)
# Fires when the `up` series for job "node_exporter" has been 0 for two
# minutes; tagged severity=critical.
alert: NodeDown
expr: 'up{job="node_exporter"} == 0'
for: 2m
labels:
  severity: critical
annotations:
  description: 'Node exporter is not reachable.'
  summary: "Node down: {{ $labels.instance }}"
/etc/prometheus/rules/main.yml > rabbitmq_alerts
RabbitMQExporterDown (0 active)
# Fires when the `up` series for job "rabbitmq" has been 0 for two minutes;
# tagged severity=critical.
alert: RabbitMQExporterDown
expr: 'up{job="rabbitmq"} == 0'
for: 2m
labels:
  severity: critical
annotations:
  description: 'Prometheus cannot scrape RabbitMQ metrics.'
  summary: "RabbitMQ exporter down: {{ $labels.instance }}"
RabbitMQQueueTooLarge (0 active)
# Fires when rabbitmq_queue_messages_ready stays above 500 for five minutes;
# tagged severity=warning. The description reports the `queue` label and the
# value of the expression for the firing series.
alert: RabbitMQQueueTooLarge
expr: 'rabbitmq_queue_messages_ready > 500'
for: 5m
labels:
  severity: warning
annotations:
  description: 'Queue {{ $labels.queue }} has {{ $value }} ready messages.'
  summary: 'Queue backlog on {{ $labels.instance }}'
RabbitMQUnackedMessages (0 active)
# Fires when rabbitmq_queue_messages_unacked stays above 100 for five minutes;
# tagged severity=warning. The description reports the `queue` label and the
# value of the expression for the firing series.
alert: RabbitMQUnackedMessages
expr: 'rabbitmq_queue_messages_unacked > 100'
for: 5m
labels:
  severity: warning
annotations:
  description: 'Queue {{ $labels.queue }} has {{ $value }} unacked messages.'
  summary: 'Unacked messages high on {{ $labels.instance }}'
/etc/prometheus/rules/services.yml > service_status
AnyServiceDown (0 active)
# Fires when service_running has been 0 for 30 seconds; tagged
# severity=critical. The annotations read the `service` and `instance` labels
# of the firing series.
alert: AnyServiceDown
expr: 'service_running == 0'
for: 30s
labels:
  severity: critical
annotations:
  description: >-
    Service {{ $labels.service }} is not running
    on {{ $labels.instance }}.
  summary: 'Service {{ $labels.service }} is DOWN on {{ $labels.instance }}'
/etc/prometheus/rules/vitals.yml > service_status
AnyServiceDown (0 active)
# Fires when service_running has been 0 for 30 seconds; tagged
# severity=critical.
# NOTE(review): this listing also shows an AnyServiceDown rule under
# /etc/prometheus/rules/services.yml with the same expr, `for` and labels;
# only the description wording differs. Confirm whether both are intended —
# identical label sets would make the two alerts indistinguishable downstream.
alert: AnyServiceDown
expr: 'service_running == 0'
for: 30s
labels:
  severity: critical
annotations:
  description: >-
    Service {{ $labels.service }} has been reported as not running
    on {{ $labels.instance }}.
  summary: 'Service {{ $labels.service }} is DOWN on {{ $labels.instance }}'