|
/etc/prometheus/rules/blackbox.yml > blackbox_tcp
|
# Fires at critical severity once probe_success has been 0 for 20s.
# NOTE(review): expr has no label matcher, so every probe_success series is
# evaluated, while the annotations describe TCP probes only. Confirm whether
# a job/module selector is intended.
alert: BlackboxTCPFailure
expr: probe_success == 0
for: 20s
labels:
  severity: critical
annotations:
  # Folded scalar: the wrapped lines are joined with single spaces.
  description: >-
    Blackbox TCP probe to {{ $labels.instance }}
    (service: {{ $labels.service }}) failed.
  summary: 'TCP connectivity failed for {{ $labels.instance }}'
|
|
/etc/prometheus/rules/extra_checks.yml > extra-checks
|
# Fires at critical severity once dotnet_exists has been 0 for 1m.
alert: DotnetMissing
expr: dotnet_exists == 0
for: 1m
labels:
  severity: critical
annotations:
  # Quoted because the text starts with '.' and embeds a template expression.
  description: '.NET runtime is NOT installed on instance {{ $labels.instance }}.'
  summary: '.NET runtime missing'
|
# Fires at critical severity once service_exists has been 0 for 1m.
alert: ServiceDoesNotExist
expr: service_exists == 0
for: 1m
labels:
  severity: critical
annotations:
  # Folded scalar keeps the embedded single quotes literal and joins the
  # wrapped lines with single spaces.
  description: >-
    Systemd service '{{ $labels.service }}' is missing on instance
    {{ $labels.instance }}.
  summary: 'Service {{ $labels.service }} does NOT exist'
|
|
/etc/prometheus/rules/main.yml > nginx_alerts
|
# Fires at critical severity once up{job="nginx"} has been 0 for 1m.
alert: NginxExporterDown
# Single-quoted so the braces and double quotes in the selector stay literal.
expr: 'up{job="nginx"} == 0'
for: 1m
labels:
  severity: critical
annotations:
  description: 'Prometheus cannot scrape NGINX metrics.'
  summary: 'NGINX exporter down: {{ $labels.instance }}'
|
|
|
|
/etc/prometheus/rules/main.yml > node_exporter_alerts
|
|
|
|
|
|
|
# Fires at critical severity once up{job="node_exporter"} has been 0 for 2m.
alert: NodeDown
# Single-quoted so the braces and double quotes in the selector stay literal.
expr: 'up{job="node_exporter"} == 0'
for: 2m
labels:
  severity: critical
annotations:
  description: 'Node exporter is not reachable.'
  summary: 'Node down: {{ $labels.instance }}'
|
|
/etc/prometheus/rules/main.yml > rabbitmq_alerts
|
# Fires at critical severity once up{job="rabbitmq"} has been 0 for 2m.
alert: RabbitMQExporterDown
# Single-quoted so the braces and double quotes in the selector stay literal.
expr: 'up{job="rabbitmq"} == 0'
for: 2m
labels:
  severity: critical
annotations:
  description: 'Prometheus cannot scrape RabbitMQ metrics.'
  summary: 'RabbitMQ exporter down: {{ $labels.instance }}'
|
|
|
|
|
|
/etc/prometheus/rules/services.yml > service_status
|
# Fires at critical severity once service_running has been 0 for 30s.
# NOTE(review): a rule with the same alert name, expr, duration and labels is
# also listed under /etc/prometheus/rules/vitals.yml > service_status, with a
# different description. Confirm whether both definitions are intended.
alert: AnyServiceDown
expr: service_running == 0
for: 30s
labels:
  severity: critical
annotations:
  # Folded scalar: the wrapped lines are joined with single spaces.
  description: >-
    Service {{ $labels.service }} is not running on
    {{ $labels.instance }}.
  summary: 'Service {{ $labels.service }} is DOWN on {{ $labels.instance }}'
|
|
/etc/prometheus/rules/vitals.yml > service_status
|
# Fires at critical severity once service_running has been 0 for 30s.
# NOTE(review): a rule with the same alert name, expr, duration and labels is
# also listed under /etc/prometheus/rules/services.yml > service_status, with
# a different description. Confirm whether both definitions are intended.
alert: AnyServiceDown
expr: service_running == 0
for: 30s
labels:
  severity: critical
annotations:
  # Folded scalar: the wrapped lines are joined with single spaces.
  description: >-
    Service {{ $labels.service }} has been reported as not running on
    {{ $labels.instance }}.
  summary: 'Service {{ $labels.service }} is DOWN on {{ $labels.instance }}'
|