Files
michaelschiemer/scripts/deployment/health-check.sh
Michael Schiemer 36ef2a1e2c
Some checks failed
🚀 Build & Deploy Image / Determine Build Necessity (push) Failing after 10m14s
🚀 Build & Deploy Image / Build Runtime Base Image (push) Has been skipped
🚀 Build & Deploy Image / Build Docker Image (push) Has been skipped
🚀 Build & Deploy Image / Run Tests & Quality Checks (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Staging (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Production (push) Has been skipped
Security Vulnerability Scan / Check for Dependency Changes (push) Failing after 11m25s
Security Vulnerability Scan / Composer Security Audit (push) Has been cancelled
fix: Gitea Traefik routing and connection pool optimization
- Remove middleware reference from Gitea Traefik labels (caused routing issues)
- Optimize Gitea connection pool settings (MAX_IDLE_CONNS=30, authentication_timeout=180s)
- Add explicit service reference in Traefik labels
- Fix intermittent 504 timeouts by improving PostgreSQL connection handling

Fixes Gitea unreachability via git.michaelschiemer.de
2025-11-09 14:46:15 +01:00

436 lines
11 KiB
Bash
Executable File

#!/bin/bash
# Production Health Check Script
# Comprehensive health monitoring for all production services
#
# Usage:
# ./scripts/deployment/health-check.sh [--verbose] [--json]
#
# Options:
# --verbose Show detailed output
# --json Output in JSON format
# Strict mode: stop on unhandled command failures, on unset variables and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Directory one level above the one that contains this script.
# NOTE(review): the Docker Compose checks cd here — confirm this is the
# directory that holds the compose file.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Output switches; both stay off unless the matching flag is passed.
VERBOSE=false
JSON_OUTPUT=false

# Command-line flags: --verbose and --json are recognised, anything else is
# silently ignored.
for flag in "$@"; do
  if [[ "$flag" == "--verbose" ]]; then
    VERBOSE=true
  elif [[ "$flag" == "--json" ]]; then
    JSON_OUTPUT=true
  fi
done

# ANSI colour sequences, stored unexpanded; `echo -e` interprets them.
GREEN='\e[32m'
YELLOW='\e[33m'
RED='\e[31m'
BLUE='\e[34m'
RESET='\e[0m'

# Per-check status keyed by check name, plus the aggregate verdict.
declare -A HEALTH_RESULTS
OVERALL_HEALTHY=true
# Logging functions
#
# log MESSAGE
#   Prints MESSAGE behind a blue [HH:MM:SS] timestamp. Prints nothing unless
#   JSON_OUTPUT is "false". `echo -e` expands backslash escapes in MESSAGE.
log() {
  [[ "$JSON_OUTPUT" == "false" ]] || return 0
  local now
  now=$(date +'%H:%M:%S') || true
  echo -e "${BLUE}[${now}]${RESET} $1"
}
# success MESSAGE
#   Prints MESSAGE in green. Prints nothing unless JSON_OUTPUT is "false".
success() {
  [[ "$JSON_OUTPUT" == "false" ]] || return 0
  local text="${GREEN}$1${RESET}"
  echo -e "$text"
}
# warning MESSAGE
#   Prints MESSAGE in yellow behind a warning sign. Prints nothing unless
#   JSON_OUTPUT is "false".
warning() {
  [[ "$JSON_OUTPUT" == "false" ]] || return 0
  local text="${YELLOW}⚠️ $1${RESET}"
  echo -e "$text"
}
# error MESSAGE
#   Prints MESSAGE in red on standard output. Prints nothing unless
#   JSON_OUTPUT is "false".
error() {
  [[ "$JSON_OUTPUT" == "false" ]] || return 0
  local text="${RED}$1${RESET}"
  echo -e "$text"
}
# Check Docker daemon
#
# Runs `docker info` with its output discarded and records the outcome under
# HEALTH_RESULTS[docker].
# Returns: 0 when the command succeeds; otherwise 1, after setting
#          OVERALL_HEALTHY to false.
check_docker() {
  log "Checking Docker daemon..."

  if ! docker info >/dev/null 2>&1; then
    HEALTH_RESULTS[docker]="unhealthy"
    error "Docker daemon is not running"
    OVERALL_HEALTHY=false
    return 1
  fi

  HEALTH_RESULTS[docker]="healthy"
  success "Docker daemon is running"
  return 0
}
# Check Docker Compose services
#
# Marks a service healthy when `docker compose ps SERVICE` succeeds and its
# output contains "Up"; results go to HEALTH_RESULTS[service_<name>].
#
# Arguments: optional service names. With none, the default set is
#            web, php, db, redis and queue-worker.
# Globals:   PROJECT_ROOT, VERBOSE (read); HEALTH_RESULTS, OVERALL_HEALTHY
#            (written). The working directory is changed to PROJECT_ROOT.
# The outcome is reported through those globals, not the return status.
check_docker_services() {
  log "Checking Docker Compose services..."
  cd "$PROJECT_ROOT"

  local services=("web" "php" "db" "redis" "queue-worker")
  (( $# == 0 )) || services=("$@")

  local all_healthy=true
  local service ps_output
  for service in "${services[@]}"; do
    # Capture the listing before matching. Piping straight into `grep -q`
    # lets grep exit on the first hit; the producer can then die of SIGPIPE
    # and, with `pipefail` on, a running service is reported as down.
    if ps_output=$(docker compose ps "$service") && [[ "$ps_output" == *Up* ]]; then
      HEALTH_RESULTS["service_${service}"]="healthy"
      if [[ "$VERBOSE" == "true" ]]; then
        success "$service is running"
      fi
    else
      HEALTH_RESULTS["service_${service}"]="unhealthy"
      error "$service is not running"
      all_healthy=false
    fi
  done

  if [[ "$all_healthy" == "true" ]]; then
    success "All Docker services are running"
  else
    error "Some Docker services are not running"
    OVERALL_HEALTHY=false
  fi
}
# Check web server response
#
# Requests https://localhost up to three times (certificate verification is
# skipped with -k; HTTP errors count as failures because of -f).
#
# Environment: HEALTH_CHECK_CURL_TIMEOUT - per-request time limit in seconds
#              (default 10), so a stalled server cannot hang the check.
# Returns: 0 on the first successful request; 1 after all attempts fail,
#          with OVERALL_HEALTHY set to false.
check_web_response() {
  log "Checking web server response..."

  local max_retries=3
  local timeout="${HEALTH_CHECK_CURL_TIMEOUT:-10}"
  local attempt

  for (( attempt = 1; attempt <= max_retries; attempt++ )); do
    if curl -f -s -k --max-time "$timeout" \
      -H "User-Agent: Mozilla/5.0 (Health Check)" \
      "https://localhost" >/dev/null 2>&1; then
      HEALTH_RESULTS[web_response]="healthy"
      success "Web server is responding"
      return 0
    fi

    # Pause only between attempts; waiting after the last one is wasted time.
    if (( attempt < max_retries )); then
      sleep 2
    fi
  done

  HEALTH_RESULTS[web_response]="unhealthy"
  error "Web server is not responding"
  OVERALL_HEALTHY=false
  return 1
}
# Check health endpoint
#
# Requests https://localhost/health once and records the outcome under
# HEALTH_RESULTS[health_endpoint].
#
# In verbose mode the first 20 lines of the response are printed, but only
# for human-readable output: with --json they would corrupt the JSON document
# written to stdout.
#
# Environment: HEALTH_CHECK_CURL_TIMEOUT - request time limit in seconds
#              (default 10).
# Returns: 0 when the request succeeds; otherwise 1, after setting
#          OVERALL_HEALTHY to false.
check_health_endpoint() {
  log "Checking /health endpoint..."

  local timeout="${HEALTH_CHECK_CURL_TIMEOUT:-10}"
  local response

  if response=$(curl -f -s -k --max-time "$timeout" \
    -H "User-Agent: Mozilla/5.0 (Health Check)" \
    "https://localhost/health" 2>&1); then
    HEALTH_RESULTS[health_endpoint]="healthy"
    success "Health endpoint is responding"
    if [[ "$VERBOSE" == "true" && "$JSON_OUTPUT" == "false" ]]; then
      # A here-string avoids an `echo | head` pipeline, which can fail under
      # `pipefail` when head closes the pipe on a large response.
      head -n 20 <<<"$response"
    fi
    return 0
  fi

  HEALTH_RESULTS[health_endpoint]="unhealthy"
  error "Health endpoint is not responding"
  OVERALL_HEALTHY=false
  return 1
}
# Check database connectivity
#
# Runs `pg_isready -U postgres` inside the `db` Compose service and records
# the outcome under HEALTH_RESULTS[database]. In verbose mode the row count
# of pg_stat_activity is logged as well.
# Returns: 0 when pg_isready succeeds; otherwise 1, after setting
#          OVERALL_HEALTHY to false.
check_database() {
  log "Checking database connectivity..."
  cd "$PROJECT_ROOT"

  if ! docker compose exec -T db pg_isready -U postgres >/dev/null 2>&1; then
    HEALTH_RESULTS[database]="unhealthy"
    error "Database is not accepting connections"
    OVERALL_HEALTHY=false
    return 1
  fi

  HEALTH_RESULTS[database]="healthy"
  success "Database is accepting connections"

  if [[ "$VERBOSE" == "true" ]]; then
    # Best effort: a failed query only leaves the logged count empty.
    local active_sessions
    active_sessions=$(
      docker compose exec -T db psql -U postgres -t -c "SELECT count(*) FROM pg_stat_activity;" \
        | tr -d ' '
    ) || true
    log "Active connections: $active_sessions"
  fi

  return 0
}
# Check Redis connectivity
#
# Runs `redis-cli ping` inside the `redis` Compose service and records the
# outcome under HEALTH_RESULTS[redis]. In verbose mode the used_memory_human
# value from `redis-cli info memory` is logged as well.
# Returns: 0 when the ping command succeeds; otherwise 1, after setting
#          OVERALL_HEALTHY to false.
check_redis() {
  log "Checking Redis connectivity..."
  cd "$PROJECT_ROOT"

  if ! docker compose exec -T redis redis-cli ping >/dev/null 2>&1; then
    HEALTH_RESULTS[redis]="unhealthy"
    error "Redis is not responding"
    OVERALL_HEALTHY=false
    return 1
  fi

  HEALTH_RESULTS[redis]="healthy"
  success "Redis is responding"

  if [[ "$VERBOSE" == "true" ]]; then
    # Best effort: a failed lookup only leaves the logged value empty.
    # The carriage return that ends each info line is stripped.
    local memory_human
    memory_human=$(
      docker compose exec -T redis redis-cli info memory \
        | grep "used_memory_human" \
        | cut -d: -f2 \
        | tr -d '\r'
    ) || true
    log "Redis memory usage: $memory_human"
  fi

  return 0
}
# Check SSL certificate
#
# Runs `php console.php ssl:status` inside the `php` Compose service once and
# looks for the text "Certificate is valid" in its output. The outcome goes
# to HEALTH_RESULTS[ssl]; a failure is only a warning and does not change
# OVERALL_HEALTHY.
#
# In verbose mode the captured status text is printed, but only for
# human-readable output: with --json it would corrupt the JSON document
# written to stdout.
#
# Returns: 0 when the certificate is reported valid, 1 otherwise.
check_ssl_certificate() {
  log "Checking SSL certificate..."
  cd "$PROJECT_ROOT"

  # Capture first, match afterwards: `cmd | grep -q` can fail under
  # `pipefail` when grep exits early, and capturing also lets the verbose
  # branch reuse the text instead of running the command a second time.
  local status_output
  if status_output=$(docker compose exec -T php php console.php ssl:status 2>/dev/null) \
    && [[ "$status_output" == *"Certificate is valid"* ]]; then
    HEALTH_RESULTS[ssl]="healthy"
    success "SSL certificate is valid"
    if [[ "$VERBOSE" == "true" && "$JSON_OUTPUT" == "false" ]]; then
      printf '%s\n' "$status_output"
    fi
    return 0
  fi

  HEALTH_RESULTS[ssl]="warning"
  warning "SSL certificate status unclear"
  return 1
}
# Check Vault connectivity
#
# Runs `php console.php vault:list` inside the `php` Compose service with its
# output discarded and records the outcome under HEALTH_RESULTS[vault].
# Returns: 0 when the command succeeds; otherwise 1, after setting
#          OVERALL_HEALTHY to false.
check_vault() {
  log "Checking Vault connectivity..."
  cd "$PROJECT_ROOT"

  if ! docker compose exec -T php php console.php vault:list >/dev/null 2>&1; then
    HEALTH_RESULTS[vault]="unhealthy"
    error "Vault is not accessible"
    OVERALL_HEALTHY=false
    return 1
  fi

  HEALTH_RESULTS[vault]="healthy"
  success "Vault is accessible"
  return 0
}
# Check disk space
#
# Reads the usage percentage of the filesystem that holds PROJECT_ROOT and
# records it under HEALTH_RESULTS[disk_space]:
#   below 80%  -> healthy
#   80% - 89%  -> warning
#   90% and up -> critical (also sets OVERALL_HEALTHY to false)
# When the percentage cannot be read the check is recorded as a warning;
# an empty value would otherwise evaluate as 0 and be reported healthy.
check_disk_space() {
  log "Checking disk space..."

  # `df -P` selects the POSIX output format: one line per filesystem, with
  # the capacity percentage in column 5.
  local disk_usage
  disk_usage=$(df -P "$PROJECT_ROOT" | tail -n 1 | awk '{print $5}' | tr -d '%') \
    || disk_usage=""

  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    HEALTH_RESULTS[disk_space]="warning"
    warning "Disk space usage could not be determined"
    return 0
  fi

  if [[ "$disk_usage" -lt 80 ]]; then
    HEALTH_RESULTS[disk_space]="healthy"
    success "Disk space usage: ${disk_usage}%"
  elif [[ "$disk_usage" -lt 90 ]]; then
    HEALTH_RESULTS[disk_space]="warning"
    warning "Disk space usage: ${disk_usage}% (consider cleanup)"
  else
    HEALTH_RESULTS[disk_space]="critical"
    error "Disk space usage: ${disk_usage}% (critical)"
    OVERALL_HEALTHY=false
  fi
}
# Check memory usage
#
# Computes used/total from the "Mem:" row of `free` as a whole percentage and
# records it under HEALTH_RESULTS[memory]:
#   below 80%  -> healthy
#   80% - 89%  -> warning
#   90% and up -> critical (also sets OVERALL_HEALTHY to false)
# When the percentage cannot be read the check is recorded as a warning;
# an empty value would otherwise evaluate as 0 and be reported healthy.
check_memory() {
  log "Checking memory usage..."

  # LC_ALL=C keeps the row label as "Mem:"; a translated label would not
  # match. The `$2 > 0` guard avoids dividing by a zero total.
  local mem_usage
  mem_usage=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 { printf "%.0f", $3 / $2 * 100 }') \
    || mem_usage=""

  if [[ ! "$mem_usage" =~ ^[0-9]+$ ]]; then
    HEALTH_RESULTS[memory]="warning"
    warning "Memory usage could not be determined"
    return 0
  fi

  if [[ "$mem_usage" -lt 80 ]]; then
    HEALTH_RESULTS[memory]="healthy"
    success "Memory usage: ${mem_usage}%"
  elif [[ "$mem_usage" -lt 90 ]]; then
    HEALTH_RESULTS[memory]="warning"
    warning "Memory usage: ${mem_usage}% (high)"
  else
    HEALTH_RESULTS[memory]="critical"
    error "Memory usage: ${mem_usage}% (critical)"
    OVERALL_HEALTHY=false
  fi
}
# Check queue worker status
#
# Counts the lines of `docker compose ps queue-worker` that contain "Up" and
# records the verdict under HEALTH_RESULTS[queue_workers]:
#   0 running -> unhealthy (also sets OVERALL_HEALTHY to false)
#   1 running -> warning
#   2 or more -> healthy
check_queue_workers() {
  log "Checking queue workers..."
  cd "$PROJECT_ROOT"

  # The pipeline's exit status is deliberately ignored; only the count matters.
  local running
  running=$(docker compose ps queue-worker | grep "Up" | wc -l) || true

  if [[ $running -lt 1 ]]; then
    HEALTH_RESULTS[queue_workers]="unhealthy"
    error "Queue workers: none running"
    OVERALL_HEALTHY=false
  elif [[ $running -lt 2 ]]; then
    HEALTH_RESULTS[queue_workers]="warning"
    warning "Queue workers: only $running running (expected 2)"
  else
    HEALTH_RESULTS[queue_workers]="healthy"
    success "Queue workers: $running running"
  fi
}
# Check logs for errors
#
# Counts the lines among the last 1000 log lines of the `php` Compose service
# that contain "error", "exception" or "fatal" (case-insensitive) and records
# the verdict under HEALTH_RESULTS[recent_errors]:
#   fewer than 5  -> healthy
#   5 - 19        -> warning
#   20 or more    -> critical (also sets OVERALL_HEALTHY to false)
check_recent_errors() {
  log "Checking recent errors in logs..."
  cd "$PROJECT_ROOT"

  # `grep -c` prints "0" AND exits non-zero when nothing matches. Appending a
  # fallback `echo 0` would therefore yield "0<newline>0", which is not a
  # number, so the non-zero status is simply discarded here.
  local error_count
  error_count=$(
    docker compose logs --tail=1000 php 2>/dev/null \
      | grep -Eci 'error|exception|fatal' || true
  )
  # Anything that is not a plain number is treated as "no errors counted".
  [[ "$error_count" =~ ^[0-9]+$ ]] || error_count=0

  if [[ "$error_count" -lt 5 ]]; then
    HEALTH_RESULTS[recent_errors]="healthy"
    success "Recent errors: $error_count (last 1000 lines)"
  elif [[ "$error_count" -lt 20 ]]; then
    HEALTH_RESULTS[recent_errors]="warning"
    warning "Recent errors: $error_count (last 1000 lines)"
  else
    HEALTH_RESULTS[recent_errors]="critical"
    error "Recent errors: $error_count (last 1000 lines)"
    OVERALL_HEALTHY=false
  fi
}
# Output JSON report
output_json() {
echo "{"
echo " \"timestamp\": \"$(date -Iseconds)\","
echo " \"overall_status\": \"$([ "$OVERALL_HEALTHY" == "true" ] && echo "healthy" || echo "unhealthy")\","
echo " \"checks\": {"
local first=true
for check in "${!HEALTH_RESULTS[@]}"; do
if [[ "$first" == "true" ]]; then
first=false
else
echo ","
fi
echo -n " \"$check\": \"${HEALTH_RESULTS[$check]}\""
done
echo ""
echo " }"
echo "}"
}
# Display summary
#
# Prints the human-readable report: a banner, how many checks ended up
# healthy, warning, or unhealthy/critical, the overall verdict, and a list of
# recommended actions when OVERALL_HEALTHY is not "true".
display_summary() {
  local rule="${BLUE}========================================${RESET}"
  local healthy_total=0
  local warning_total=0
  local unhealthy_total=0
  local state

  # Tally the statuses; values outside the known set are not counted.
  for state in "${HEALTH_RESULTS[@]}"; do
    if [[ "$state" == "healthy" ]]; then
      healthy_total=$((healthy_total + 1))
    elif [[ "$state" == "warning" ]]; then
      warning_total=$((warning_total + 1))
    elif [[ "$state" == "unhealthy" || "$state" == "critical" ]]; then
      unhealthy_total=$((unhealthy_total + 1))
    fi
  done

  echo ""
  echo -e "$rule"
  echo -e "${BLUE} Production Health Check Summary${RESET}"
  echo -e "$rule"
  echo ""

  printf '%s\n' \
    "📊 Health Status:" \
    " ✅ Healthy: $healthy_total" \
    " ⚠️ Warnings: $warning_total" \
    " ❌ Unhealthy: $unhealthy_total" \
    ""

  if [[ "$OVERALL_HEALTHY" != "true" ]]; then
    echo -e "${RED}Overall Status: UNHEALTHY ❌${RESET}"
    printf '%s\n' \
      "" \
      "⚠️ Critical issues detected - immediate action required" \
      "" \
      "📝 Recommended Actions:" \
      " 1. Check Docker logs: docker compose logs -f --tail=100" \
      " 2. Review service status: docker compose ps" \
      " 3. Check system resources: df -h && free -h" \
      " 4. Review recent deployments for issues"
  else
    echo -e "${GREEN}Overall Status: HEALTHY ✅${RESET}"
    printf '%s\n' \
      "" \
      "🎉 All critical systems are operational"
  fi

  echo ""
  echo -e "$rule"
}
# Main health check execution
#
# Runs every check, prints the report (JSON when JSON_OUTPUT is "true",
# otherwise the human-readable summary) and exits with 0 when OVERALL_HEALTHY
# is "true", 1 otherwise. Does not read its arguments.
main() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    log "🔍 Starting production health check..."
    echo ""
  fi

  # Several checks return 1 to signal a failed or unclear result. Under
  # `set -e` a bare call would abort the script at the first such check,
  # before any report is printed, so each return status is discarded here.
  # The verdict is carried by HEALTH_RESULTS and OVERALL_HEALTHY instead.
  local check
  for check in \
    check_docker \
    check_docker_services \
    check_web_response \
    check_health_endpoint \
    check_database \
    check_redis \
    check_ssl_certificate \
    check_vault \
    check_disk_space \
    check_memory \
    check_queue_workers \
    check_recent_errors; do
    "$check" || true
  done

  if [[ "$JSON_OUTPUT" == "true" ]]; then
    output_json
  else
    display_summary
  fi

  # Exit with appropriate code
  if [[ "$OVERALL_HEALTHY" == "true" ]]; then
    exit 0
  else
    exit 1
  fi
}
# Run main: start the health check with the script's arguments passed along.
# The --verbose/--json flags are parsed at the top of the file; main's own
# body does not read its arguments.
main "$@"