feat: CI/CD pipeline setup complete - Ansible playbooks updated, secrets configured, workflow ready
245  deployment/stacks/monitoring/prometheus/alerts.yml  (Normal file)
@@ -0,0 +1,245 @@
# Prometheus Alerting Rules
# https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

groups:
  - name: infrastructure_alerts
    interval: 30s
    rules:
      # Host System Alerts
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: infrastructure
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is above 80% (current value: {{ $value }}%)"

      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Host out of memory on {{ $labels.instance }}"
          description: "Available memory is below 10% (current value: {{ $value }}%)"

      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"} * 100) < 10
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Host out of disk space on {{ $labels.instance }}"
          description: "Disk space is below 10% (current value: {{ $value }}%)"

      - alert: HostDiskSpaceWarning
        expr: (node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"} * 100) < 20
        for: 5m
        labels:
          severity: warning
          category: infrastructure
        annotations:
          summary: "Disk space warning on {{ $labels.instance }}"
          description: "Disk space is below 20% (current value: {{ $value }}%)"

      - alert: HostHighDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
        for: 5m
        labels:
          severity: warning
          category: infrastructure
        annotations:
          summary: "High disk read latency on {{ $labels.instance }}"
          description: "Disk read latency is high (current value: {{ $value }}s)"

      # Container Alerts
      - alert: ContainerKilled
        expr: time() - container_last_seen{name!~".*exporter.*"} > 60
        for: 1m
        labels:
          severity: critical
          category: container
        annotations:
          summary: "Container killed: {{ $labels.name }}"
          description: "Container {{ $labels.name }} has disappeared"

      - alert: ContainerHighCpuUsage
        expr: (sum(rate(container_cpu_usage_seconds_total{name!~".*exporter.*"}[5m])) by (name) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: container
        annotations:
          summary: "High CPU usage in container {{ $labels.name }}"
          description: "Container CPU usage is above 80% (current value: {{ $value }}%)"

      - alert: ContainerHighMemoryUsage
        expr: (sum(container_memory_usage_bytes{name!~".*exporter.*"}) by (name) / sum(container_spec_memory_limit_bytes{name!~".*exporter.*"}) by (name) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: container
        annotations:
          summary: "High memory usage in container {{ $labels.name }}"
          description: "Container memory usage is above 80% (current value: {{ $value }}%)"

      - alert: ContainerVolumeUsage
        expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
        for: 5m
        labels:
          severity: warning
          category: container
        annotations:
          summary: "Container volume usage on {{ $labels.instance }}"
          description: "Container volume usage is above 80% (current value: {{ $value }}%)"

      - alert: ContainerRestartCount
        expr: rate(container_restart_count[5m]) > 0
        for: 1m
        labels:
          severity: warning
          category: container
        annotations:
          summary: "Container restarting: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is restarting frequently"

      # Prometheus Self-Monitoring
      - alert: PrometheusTargetDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          category: prometheus
        annotations:
          summary: "Prometheus target down: {{ $labels.job }}"
          description: "Target {{ $labels.job }} on {{ $labels.instance }} is down"

      - alert: PrometheusConfigReloadFailure
        expr: prometheus_config_last_reload_successful == 0
        for: 1m
        labels:
          severity: critical
          category: prometheus
        annotations:
          summary: "Prometheus configuration reload failure"
          description: "Prometheus configuration reload has failed"

      - alert: PrometheusTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"prometheus"}[15m]) > 2
        for: 1m
        labels:
          severity: warning
          category: prometheus
        annotations:
          summary: "Prometheus restarting frequently"
          description: "Prometheus has restarted more than twice in the last 15 minutes"

      - alert: PrometheusTargetScrapingSlow
        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
        for: 5m
        labels:
          severity: warning
          category: prometheus
        annotations:
          summary: "Prometheus target scraping slow"
          description: "Prometheus is scraping targets slowly (current value: {{ $value }}s)"

      # Traefik Alerts
      - alert: TraefikServiceDown
        expr: count(traefik_service_server_up) by (service) == 0
        for: 1m
        labels:
          severity: critical
          category: traefik
        annotations:
          summary: "Traefik service down: {{ $labels.service }}"
          description: "Traefik service {{ $labels.service }} is down"

      - alert: TraefikHighHttp4xxErrorRate
        expr: sum(rate(traefik_service_requests_total{code=~"4.."}[5m])) by (service) / sum(rate(traefik_service_requests_total[5m])) by (service) * 100 > 5
        for: 5m
        labels:
          severity: warning
          category: traefik
        annotations:
          summary: "High HTTP 4xx error rate for {{ $labels.service }}"
          description: "HTTP 4xx error rate is above 5% (current value: {{ $value }}%)"

      - alert: TraefikHighHttp5xxErrorRate
        expr: sum(rate(traefik_service_requests_total{code=~"5.."}[5m])) by (service) / sum(rate(traefik_service_requests_total[5m])) by (service) * 100 > 1
        for: 5m
        labels:
          severity: critical
          category: traefik
        annotations:
          summary: "High HTTP 5xx error rate for {{ $labels.service }}"
          description: "HTTP 5xx error rate is above 1% (current value: {{ $value }}%)"

  - name: database_alerts
    interval: 30s
    rules:
      # PostgreSQL Alerts (uncomment when postgres-exporter is deployed)
      # - alert: PostgresqlDown
      #   expr: pg_up == 0
      #   for: 1m
      #   labels:
      #     severity: critical
      #     category: database
      #   annotations:
      #     summary: "PostgreSQL down on {{ $labels.instance }}"
      #     description: "PostgreSQL instance is down"

      # - alert: PostgresqlTooManyConnections
      #   expr: sum by (instance) (pg_stat_activity_count) > pg_settings_max_connections * 0.8
      #   for: 5m
      #   labels:
      #     severity: warning
      #     category: database
      #   annotations:
      #     summary: "Too many PostgreSQL connections on {{ $labels.instance }}"
      #     description: "PostgreSQL connections are above 80% of max_connections"

      # - alert: PostgresqlDeadLocks
      #   expr: rate(pg_stat_database_deadlocks[1m]) > 0
      #   for: 1m
      #   labels:
      #     severity: warning
      #     category: database
      #   annotations:
      #     summary: "PostgreSQL deadlocks on {{ $labels.instance }}"
      #     description: "PostgreSQL has deadlocks"

      # Redis Alerts (uncomment when redis-exporter is deployed)
      # - alert: RedisDown
      #   expr: redis_up == 0
      #   for: 1m
      #   labels:
      #     severity: critical
      #     category: cache
      #   annotations:
      #     summary: "Redis down on {{ $labels.instance }}"
      #     description: "Redis instance is down"

      # - alert: RedisOutOfMemory
      #   expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90
      #   for: 5m
      #   labels:
      #     severity: critical
      #     category: cache
      #   annotations:
      #     summary: "Redis out of memory on {{ $labels.instance }}"
      #     description: "Redis memory usage is above 90%"

      # - alert: RedisTooManyConnections
      #   expr: redis_connected_clients > 100
      #   for: 5m
      #   labels:
      #     severity: warning
      #     category: cache
      #   annotations:
      #     summary: "Too many Redis connections on {{ $labels.instance }}"
      #     description: "Redis has too many client connections (current value: {{ $value }})"
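The rule file above can be syntax-checked before rollout. A minimal sketch of such a check, assuming a Docker-capable CI runner and the promtool binary bundled in the prom/prometheus image; this step is illustrative and not part of this commit:

# Hypothetical CI step (not included in this commit): validate alerts.yml
# with promtool. The entrypoint override is needed because the image's
# default entrypoint is the prometheus binary itself.
- name: Validate Prometheus alert rules
  run: |
    docker run --rm \
      --entrypoint /bin/promtool \
      -v "$PWD/deployment/stacks/monitoring/prometheus:/cfg:ro" \
      prom/prometheus:latest \
      check rules /cfg/alerts.yml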
82  deployment/stacks/monitoring/prometheus/prometheus.yml  (Normal file)
@@ -0,0 +1,82 @@
# Prometheus Configuration
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/

global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    environment: 'michaelschiemer'

# Alertmanager configuration (optional)
# alerting:
#   alertmanagers:
#     - static_configs:
#         - targets:
#             - alertmanager:9093

# Load alerting rules
rule_files:
  - '/etc/prometheus/alerts.yml'

# Scrape configurations
scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
        labels:
          service: 'prometheus'

  # Node Exporter - Host system metrics
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          service: 'node-exporter'
          instance: 'production-server'

  # cAdvisor - Container metrics
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
        labels:
          service: 'cadvisor'

  # Traefik metrics
  - job_name: 'traefik'
    static_configs:
      - targets: ['traefik:8080']
        labels:
          service: 'traefik'

  # PostgreSQL Exporter (if deployed)
  # Uncomment if you add postgres-exporter to postgresql stack
  # - job_name: 'postgres'
  #   static_configs:
  #     - targets: ['postgres-exporter:9187']
  #       labels:
  #         service: 'postgresql'

  # Redis Exporter (if deployed)
  # Uncomment if you add redis-exporter to application stack
  # - job_name: 'redis'
  #   static_configs:
  #     - targets: ['redis-exporter:9121']
  #       labels:
  #         service: 'redis'

  # Application metrics endpoint (if available)
  # Uncomment and configure if your PHP app exposes Prometheus metrics
  # - job_name: 'application'
  #   static_configs:
  #     - targets: ['app:9000']
  #       labels:
  #         service: 'application'

  # Nginx metrics (if nginx-prometheus-exporter deployed)
  # - job_name: 'nginx'
  #   static_configs:
  #     - targets: ['nginx-exporter:9113']
  #       labels:
  #         service: 'nginx'
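For context, a minimal Docker Compose sketch showing how these two files could be mounted so that the rule_files path /etc/prometheus/alerts.yml resolves inside the container. This is illustrative only and not part of this commit; the exporter service names referenced in the scrape targets (node-exporter, cadvisor, traefik) are assumed to be defined elsewhere in the monitoring stack on the same network.

# Hypothetical compose fragment (not included in this commit).
services:
  prometheus:
    image: prom/prometheus:latest
    volumes:
      # Mount both files from the directory added in this commit.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - '9090:9090'

volumes:
  prometheus-data:

The main configuration can be validated the same way as the rule file, using promtool check config against prometheus.yml.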