# Prometheus Alerting Rules
# https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
groups:
  - name: infrastructure_alerts
    interval: 30s
    rules:
      # Host System Alerts
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: infrastructure
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is above 80% (current value: {{ $value }}%)"

      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Host out of memory on {{ $labels.instance }}"
          description: "Available memory is below 10% (current value: {{ $value }}%)"

      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"} * 100) < 10
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Host out of disk space on {{ $labels.instance }}"
          description: "Disk space is below 10% (current value: {{ $value }}%)"

      - alert: HostDiskSpaceWarning
        expr: (node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"} * 100) < 20
        for: 5m
        labels:
          severity: warning
          category: infrastructure
        annotations:
          summary: "Disk space warning on {{ $labels.instance }}"
          description: "Disk space is below 20% (current value: {{ $value }}%)"

      - alert: HostHighDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
        for: 5m
        labels:
          severity: warning
          category: infrastructure
        annotations:
          summary: "High disk read latency on {{ $labels.instance }}"
          description: "Disk read latency is high (current value: {{ $value }}s)"

      # Container Alerts
      - alert: ContainerKilled
        expr: time() - container_last_seen{name!~".*exporter.*"} > 60
        for: 1m
        labels:
          severity: critical
          category: container
        annotations:
          summary: "Container killed: {{ $labels.name }}"
          description: "Container {{ $labels.name }} has disappeared"

      - alert: ContainerHighCpuUsage
        expr: (sum(rate(container_cpu_usage_seconds_total{name!~".*exporter.*"}[5m])) by (name) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: container
        annotations:
          summary: "High CPU usage in container {{ $labels.name }}"
          description: "Container CPU usage is above 80% (current value: {{ $value }}%)"

      - alert: ContainerHighMemoryUsage
        expr: (sum(container_memory_usage_bytes{name!~".*exporter.*"}) by (name) / sum(container_spec_memory_limit_bytes{name!~".*exporter.*"}) by (name) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: container
        annotations:
          summary: "High memory usage in container {{ $labels.name }}"
          description: "Container memory usage is above 80% (current value: {{ $value }}%)"

      - alert: ContainerVolumeUsage
        expr: (1 - (sum(container_fs_inodes_free) by (instance) / sum(container_fs_inodes_total) by (instance))) * 100 > 80
        for: 5m
        labels:
          severity: warning
          category: container
        annotations:
          summary: "Container volume usage on {{ $labels.instance }}"
          description: "Container volume usage is above 80% (current value: {{ $value }}%)"

      - alert: ContainerRestartCount
        expr: rate(container_restart_count[5m]) > 0
        for: 1m
        labels:
          severity: warning
          category: container
        annotations:
          summary: "Container restarting: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is restarting frequently"

      # Prometheus Self-Monitoring
      - alert: PrometheusTargetDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          category: prometheus
        annotations:
          summary: "Prometheus target down: {{ $labels.job }}"
          description: "Target {{ $labels.job }} on {{ $labels.instance }} is down"

      - alert: PrometheusConfigReloadFailure
        expr: prometheus_config_last_reload_successful == 0
        for: 1m
        labels:
          severity: critical
          category: prometheus
        annotations:
          summary: "Prometheus configuration reload failure"
          description: "Prometheus configuration reload has failed"

      - alert: PrometheusTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"prometheus"}[15m]) > 2
        for: 1m
        labels:
          severity: warning
          category: prometheus
        annotations:
          summary: "Prometheus restarting frequently"
          description: "Prometheus has restarted more than twice in the last 15 minutes"

      - alert: PrometheusTargetScrapingSlow
        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
        for: 5m
        labels:
          severity: warning
          category: prometheus
        annotations:
          summary: "Prometheus target scraping slow"
          description: "Prometheus is scraping targets slowly (current value: {{ $value }}s)"

      # Traefik Alerts
      - alert: TraefikServiceDown
        expr: count(traefik_service_server_up) by (service) == 0
        for: 1m
        labels:
          severity: critical
          category: traefik
        annotations:
          summary: "Traefik service down: {{ $labels.service }}"
          description: "Traefik service {{ $labels.service }} is down"

      - alert: TraefikHighHttp4xxErrorRate
        expr: sum(rate(traefik_service_requests_total{code=~"4.."}[5m])) by (service) / sum(rate(traefik_service_requests_total[5m])) by (service) * 100 > 5
        for: 5m
        labels:
          severity: warning
          category: traefik
        annotations:
          summary: "High HTTP 4xx error rate for {{ $labels.service }}"
          description: "HTTP 4xx error rate is above 5% (current value: {{ $value }}%)"

      - alert: TraefikHighHttp5xxErrorRate
        expr: sum(rate(traefik_service_requests_total{code=~"5.."}[5m])) by (service) / sum(rate(traefik_service_requests_total[5m])) by (service) * 100 > 1
        for: 5m
        labels:
          severity: critical
          category: traefik
        annotations:
          summary: "High HTTP 5xx error rate for {{ $labels.service }}"
          description: "HTTP 5xx error rate is above 1% (current value: {{ $value }}%)"
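  # NOTE: the database_alerts group below stays commented out until the matching
  # exporters are actually scraped. A minimal sketch of the scrape jobs that would
  # need to exist in prometheus.yml (job names, target hostnames, and the default
  # exporter ports shown here are assumptions; adjust to your deployment):
  #
  #   scrape_configs:
  #     - job_name: "postgres-exporter"
  #       static_configs:
  #         - targets: ["postgres-exporter:9187"]   # postgres_exporter default port
  #     - job_name: "redis-exporter"
  #       static_configs:
  #         - targets: ["redis-exporter:9121"]      # redis_exporter default port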
  - name: database_alerts
    interval: 30s
    rules:
      # PostgreSQL Alerts (uncomment when postgres-exporter is deployed)
      # - alert: PostgresqlDown
      #   expr: pg_up == 0
      #   for: 1m
      #   labels:
      #     severity: critical
      #     category: database
      #   annotations:
      #     summary: "PostgreSQL down on {{ $labels.instance }}"
      #     description: "PostgreSQL instance is down"

      # - alert: PostgresqlTooManyConnections
      #   expr: sum by (instance) (pg_stat_activity_count) > pg_settings_max_connections * 0.8
      #   for: 5m
      #   labels:
      #     severity: warning
      #     category: database
      #   annotations:
      #     summary: "Too many PostgreSQL connections on {{ $labels.instance }}"
      #     description: "PostgreSQL connections are above 80% of max_connections"

      # - alert: PostgresqlDeadLocks
      #   expr: rate(pg_stat_database_deadlocks[1m]) > 0
      #   for: 1m
      #   labels:
      #     severity: warning
      #     category: database
      #   annotations:
      #     summary: "PostgreSQL deadlocks on {{ $labels.instance }}"
      #     description: "PostgreSQL has deadlocks"

      # Redis Alerts (uncomment when redis-exporter is deployed)
      # - alert: RedisDown
      #   expr: redis_up == 0
      #   for: 1m
      #   labels:
      #     severity: critical
      #     category: cache
      #   annotations:
      #     summary: "Redis down on {{ $labels.instance }}"
      #     description: "Redis instance is down"

      # - alert: RedisOutOfMemory
      #   expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90
      #   for: 5m
      #   labels:
      #     severity: critical
      #     category: cache
      #   annotations:
      #     summary: "Redis out of memory on {{ $labels.instance }}"
      #     description: "Redis memory usage is above 90%"
"Redis out of memory on {{ $labels.instance }}" # description: "Redis memory usage is above 90%" # - alert: RedisTooManyConnections # expr: redis_connected_clients > 100 # for: 5m # labels: # severity: warning # category: cache # annotations: # summary: "Too many Redis connections on {{ $labels.instance }}" # description: "Redis has too many client connections (current value: {{ $value }})"