#!/bin/bash
# Production Health Check Script
# Comprehensive health monitoring for all production services
#
# Usage:
#   ./scripts/health-check.sh [--verbose] [--json]
#
# Options:
#   --verbose   Show detailed output
#   --json      Output in JSON format
#
# Environment:
#   HEALTH_CHECK_TIMEOUT   Per-request curl timeout in seconds (default: 10)
#
# Exit status:
#   0 when no check marked the system unhealthy, 1 otherwise.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VERBOSE=false
JSON_OUTPUT=false
# Upper bound for each curl request so a hung web server cannot stall the run.
CURL_TIMEOUT="${HEALTH_CHECK_TIMEOUT:-10}"

# Parse arguments
for arg in "$@"; do
  case "$arg" in
    --verbose) VERBOSE=true ;;
    --json) JSON_OUTPUT=true ;;
    # Unknown arguments are still ignored, but say so on stderr
    # (stderr keeps --json output on stdout clean).
    *) printf 'Warning: ignoring unknown argument: %s\n' "$arg" >&2 ;;
  esac
done

# Colors
GREEN="\e[32m"
YELLOW="\e[33m"
RED="\e[31m"
BLUE="\e[34m"
RESET="\e[0m"

# Health check results: check name -> healthy | warning | unhealthy | critical
declare -A HEALTH_RESULTS=()
OVERALL_HEALTHY=true

# Logging functions.
# Each prints $1 to stdout and is silent in --json mode so that stdout
# carries only the JSON report.
log() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    echo -e "${BLUE}[$(date +'%H:%M:%S')]${RESET} $1"
  fi
}

success() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    echo -e "${GREEN}✅ $1${RESET}"
  fi
}

warning() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    echo -e "${YELLOW}⚠️ $1${RESET}"
  fi
}

error() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    echo -e "${RED}❌ $1${RESET}"
  fi
}

# Returns 0 only when --verbose was given AND output is human-readable text.
# Used to gate extra detail that would otherwise corrupt the JSON report.
verbose_text_enabled() {
  [[ "$VERBOSE" == "true" && "$JSON_OUTPUT" == "false" ]]
}

# NOTE: every check below records its result in HEALTH_RESULTS and, for
# critical failures, sets OVERALL_HEALTHY=false. The docker compose based
# checks rely on the working directory being PROJECT_ROOT; main() ensures that.

# Check Docker daemon
# Returns: 0 if `docker info` succeeds, 1 otherwise.
check_docker() {
  log "Checking Docker daemon..."

  if docker info &>/dev/null; then
    HEALTH_RESULTS[docker]="healthy"
    success "Docker daemon is running"
    return 0
  fi

  HEALTH_RESULTS[docker]="unhealthy"
  error "Docker daemon is not running"
  OVERALL_HEALTHY=false
  return 1
}

# Check Docker Compose services
# A service counts as running when `docker compose ps <service>` succeeds
# and its output contains "Up".
check_docker_services() {
  log "Checking Docker Compose services..."

  local services=("web" "php" "db" "redis" "queue-worker")
  local all_healthy=true
  local service ps_output

  for service in "${services[@]}"; do
    # Capture first, match second: piping into `grep -q` under pipefail can
    # report failure when grep exits early and the producer gets SIGPIPE.
    if ps_output=$(docker compose ps "$service") && [[ "$ps_output" == *Up* ]]; then
      HEALTH_RESULTS["service_${service}"]="healthy"
      if [[ "$VERBOSE" == "true" ]]; then
        success "$service is running"
      fi
    else
      HEALTH_RESULTS["service_${service}"]="unhealthy"
      error "$service is not running"
      all_healthy=false
    fi
  done

  if [[ "$all_healthy" == "true" ]]; then
    success "All Docker services are running"
  else
    error "Some Docker services are not running"
    OVERALL_HEALTHY=false
  fi
}

# Check web server response
# Tries https://localhost up to 3 times, 2 seconds apart.
# Returns: 0 on the first successful response, 1 if all attempts fail.
check_web_response() {
  log "Checking web server response..."

  local max_retries=3
  local attempt

  for ((attempt = 1; attempt <= max_retries; attempt++)); do
    if curl -f -s -k --max-time "$CURL_TIMEOUT" \
      -H "User-Agent: Mozilla/5.0 (Health Check)" \
      "https://localhost" >/dev/null 2>&1; then
      HEALTH_RESULTS[web_response]="healthy"
      success "Web server is responding"
      return 0
    fi
    # Only wait when another attempt will follow.
    if ((attempt < max_retries)); then
      sleep 2
    fi
  done

  HEALTH_RESULTS[web_response]="unhealthy"
  error "Web server is not responding"
  OVERALL_HEALTHY=false
  return 1
}

# Check health endpoint
# Returns: 0 if https://localhost/health answers successfully, 1 otherwise.
check_health_endpoint() {
  log "Checking /health endpoint..."

  local response
  if response=$(curl -f -s -k --max-time "$CURL_TIMEOUT" \
    -H "User-Agent: Mozilla/5.0 (Health Check)" \
    "https://localhost/health" 2>&1); then
    HEALTH_RESULTS[health_endpoint]="healthy"
    success "Health endpoint is responding"
    # Response body is shown in text mode only; in --json mode it would be
    # mixed into the JSON document on stdout.
    if verbose_text_enabled; then
      head -n 20 <<<"$response"
    fi
    return 0
  fi

  HEALTH_RESULTS[health_endpoint]="unhealthy"
  error "Health endpoint is not responding"
  OVERALL_HEALTHY=false
  return 1
}

# Check database connectivity
# Returns: 0 if pg_isready inside the `db` service succeeds, 1 otherwise.
check_database() {
  log "Checking database connectivity..."

  if docker compose exec -T db pg_isready -U postgres &>/dev/null; then
    HEALTH_RESULTS[database]="healthy"
    success "Database is accepting connections"

    # Get connection count (informational only; failure here is not fatal)
    if verbose_text_enabled; then
      local conn_count
      conn_count=$(docker compose exec -T db psql -U postgres -t \
        -c "SELECT count(*) FROM pg_stat_activity;" | tr -d '[:space:]' || true)
      log "Active connections: ${conn_count:-unknown}"
    fi
    return 0
  fi

  HEALTH_RESULTS[database]="unhealthy"
  error "Database is not accepting connections"
  OVERALL_HEALTHY=false
  return 1
}

# Check Redis connectivity
# Returns: 0 if `redis-cli ping` inside the `redis` service succeeds, 1 otherwise.
check_redis() {
  log "Checking Redis connectivity..."

  if docker compose exec -T redis redis-cli ping &>/dev/null; then
    HEALTH_RESULTS[redis]="healthy"
    success "Redis is responding"

    # Get Redis info (informational only; failure here is not fatal)
    if verbose_text_enabled; then
      local used_memory
      used_memory=$(docker compose exec -T redis redis-cli info memory \
        | awk -F: '$1 == "used_memory_human" { print $2 }' | tr -d '\r' || true)
      log "Redis memory usage: ${used_memory:-unknown}"
    fi
    return 0
  fi

  HEALTH_RESULTS[redis]="unhealthy"
  error "Redis is not responding"
  OVERALL_HEALTHY=false
  return 1
}

# Check SSL certificate
# Healthy when `php console.php ssl:status` succeeds and prints
# "Certificate is valid"; anything else is a warning (not a critical failure).
# Returns: 0 when healthy, 1 on warning.
check_ssl_certificate() {
  log "Checking SSL certificate..."

  local ssl_status
  # Run the command once and reuse its output for both the match and the
  # verbose display.
  if ssl_status=$(docker compose exec -T php php console.php ssl:status 2>/dev/null) \
    && [[ "$ssl_status" == *"Certificate is valid"* ]]; then
    HEALTH_RESULTS[ssl]="healthy"
    success "SSL certificate is valid"
    if verbose_text_enabled; then
      printf '%s\n' "$ssl_status"
    fi
    return 0
  fi

  HEALTH_RESULTS[ssl]="warning"
  warning "SSL certificate status unclear"
  return 1
}

# Check Vault connectivity
# Returns: 0 if `php console.php vault:list` succeeds, 1 otherwise.
check_vault() {
  log "Checking Vault connectivity..."

  if docker compose exec -T php php console.php vault:list &>/dev/null; then
    HEALTH_RESULTS[vault]="healthy"
    success "Vault is accessible"
    return 0
  fi

  HEALTH_RESULTS[vault]="unhealthy"
  error "Vault is not accessible"
  OVERALL_HEALTHY=false
  return 1
}

# Check disk space
# Thresholds on the filesystem holding PROJECT_ROOT:
#   < 80% healthy, < 90% warning, otherwise critical.
check_disk_space() {
  log "Checking disk space..."

  local disk_usage
  # -P forces one line per filesystem, so column 5 is always the capacity.
  disk_usage=$(df -P "$PROJECT_ROOT" \
    | awk 'NR == 2 { gsub(/%/, "", $5); print $5 }' || true)

  # An empty/non-numeric value would otherwise compare as 0 and look healthy.
  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    HEALTH_RESULTS[disk_space]="warning"
    warning "Disk space usage could not be determined"
    return 0
  fi

  if ((disk_usage < 80)); then
    HEALTH_RESULTS[disk_space]="healthy"
    success "Disk space usage: ${disk_usage}%"
  elif ((disk_usage < 90)); then
    HEALTH_RESULTS[disk_space]="warning"
    warning "Disk space usage: ${disk_usage}% (consider cleanup)"
  else
    HEALTH_RESULTS[disk_space]="critical"
    error "Disk space usage: ${disk_usage}% (critical)"
    OVERALL_HEALTHY=false
  fi
}

# Check memory usage
# Uses used/total from the "Mem:" row of `free`:
#   < 80% healthy, < 90% warning, otherwise critical.
check_memory() {
  log "Checking memory usage..."

  local mem_usage
  # LC_ALL=C keeps the "Mem:" row label stable regardless of system locale.
  mem_usage=$(LC_ALL=C free \
    | awk '/^Mem:/ && $2 > 0 { printf "%.0f", $3 / $2 * 100 }' || true)

  # An empty/non-numeric value would otherwise compare as 0 and look healthy.
  if [[ ! "$mem_usage" =~ ^[0-9]+$ ]]; then
    HEALTH_RESULTS[memory]="warning"
    warning "Memory usage could not be determined"
    return 0
  fi

  if ((mem_usage < 80)); then
    HEALTH_RESULTS[memory]="healthy"
    success "Memory usage: ${mem_usage}%"
  elif ((mem_usage < 90)); then
    HEALTH_RESULTS[memory]="warning"
    warning "Memory usage: ${mem_usage}% (high)"
  else
    HEALTH_RESULTS[memory]="critical"
    error "Memory usage: ${mem_usage}% (critical)"
    OVERALL_HEALTHY=false
  fi
}

# Check queue worker status
# Counts "Up" lines in `docker compose ps queue-worker`:
#   >= 2 healthy, 1 warning, 0 unhealthy.
check_queue_workers() {
  log "Checking queue workers..."

  local worker_count
  # grep -c prints the count itself (including 0) but exits 1 on no match;
  # `|| true` only neutralises that exit status.
  worker_count=$(docker compose ps queue-worker | grep -c "Up" || true)
  worker_count=${worker_count:-0}

  if ((worker_count >= 2)); then
    HEALTH_RESULTS[queue_workers]="healthy"
    success "Queue workers: $worker_count running"
  elif ((worker_count >= 1)); then
    HEALTH_RESULTS[queue_workers]="warning"
    warning "Queue workers: only $worker_count running (expected 2)"
  else
    HEALTH_RESULTS[queue_workers]="unhealthy"
    error "Queue workers: none running"
    OVERALL_HEALTHY=false
  fi
}

# Check logs for errors
# Counts lines matching error/exception/fatal (case-insensitive) in the last
# 1000 log lines of the `php` service:
#   < 5 healthy, < 20 warning, otherwise critical.
check_recent_errors() {
  log "Checking recent errors in logs..."

  local error_count
  # grep -c already prints "0" when nothing matches (and exits 1). Appending
  # another "0" via `|| echo 0` would yield "0<newline>0", which is not a
  # number and would be misreported as critical - hence `|| true`.
  error_count=$(docker compose logs --tail=1000 php 2>/dev/null \
    | grep -ciE 'error|exception|fatal' || true)
  error_count=${error_count:-0}

  if ((error_count < 5)); then
    HEALTH_RESULTS[recent_errors]="healthy"
    success "Recent errors: $error_count (last 1000 lines)"
  elif ((error_count < 20)); then
    HEALTH_RESULTS[recent_errors]="warning"
    warning "Recent errors: $error_count (last 1000 lines)"
  else
    HEALTH_RESULTS[recent_errors]="critical"
    error "Recent errors: $error_count (last 1000 lines)"
    OVERALL_HEALTHY=false
  fi
}

# Output JSON report
# Writes an object with "timestamp", "overall_status" and "checks" to stdout.
# Check names are emitted in sorted order so the report is stable between
# runs (associative-array iteration order is unspecified).
output_json() {
  local overall="unhealthy"
  if [[ "$OVERALL_HEALTHY" == "true" ]]; then
    overall="healthy"
  fi

  local -a check_names=()
  if ((${#HEALTH_RESULTS[@]} > 0)); then
    mapfile -t check_names < <(printf '%s\n' "${!HEALTH_RESULTS[@]}" | LC_ALL=C sort)
  fi

  printf '{\n'
  printf '  "timestamp": "%s",\n' "$(date -Iseconds)"
  printf '  "overall_status": "%s",\n' "$overall"
  printf '  "checks": {\n'

  local separator=""
  local check
  for check in "${check_names[@]}"; do
    printf '%s    "%s": "%s"' "$separator" "$check" "${HEALTH_RESULTS[$check]}"
    separator=$',\n'
  done

  printf '\n  }\n'
  printf '}\n'
}

# Display summary
# Prints per-status counts and the overall verdict in human-readable form.
display_summary() {
  echo ""
  echo -e "${BLUE}========================================${RESET}"
  echo -e "${BLUE} Production Health Check Summary${RESET}"
  echo -e "${BLUE}========================================${RESET}"
  echo ""

  local healthy_count=0
  local warning_count=0
  local unhealthy_count=0
  local status

  for status in "${HEALTH_RESULTS[@]}"; do
    case "$status" in
      healthy) healthy_count=$((healthy_count + 1)) ;;
      warning) warning_count=$((warning_count + 1)) ;;
      unhealthy | critical) unhealthy_count=$((unhealthy_count + 1)) ;;
    esac
  done

  echo "📊 Health Status:"
  echo " ✅ Healthy: $healthy_count"
  echo " ⚠️ Warnings: $warning_count"
  echo " ❌ Unhealthy: $unhealthy_count"
  echo ""

  if [[ "$OVERALL_HEALTHY" == "true" ]]; then
    echo -e "${GREEN}Overall Status: HEALTHY ✅${RESET}"
    echo ""
    echo "🎉 All critical systems are operational"
  else
    echo -e "${RED}Overall Status: UNHEALTHY ❌${RESET}"
    echo ""
    echo "⚠️ Critical issues detected - immediate action required"
    echo ""
    echo "📝 Recommended Actions:"
    echo " 1. Check Docker logs: docker compose logs -f --tail=100"
    echo " 2. Review service status: docker compose ps"
    echo " 3. Check system resources: df -h && free -h"
    echo " 4. Review recent deployments for issues"
  fi

  echo ""
  echo -e "${BLUE}========================================${RESET}"
}

# Main health check execution
# Runs every check, prints the report (JSON or text) and exits 0/1 according
# to OVERALL_HEALTHY.
main() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    log "🔍 Starting production health check..."
    echo ""
  fi

  # docker compose resolves the project from the working directory.
  cd "$PROJECT_ROOT" || {
    printf 'Cannot change directory to %s\n' "$PROJECT_ROOT" >&2
    exit 1
  }

  local -a checks=(
    check_docker
    check_docker_services
    check_web_response
    check_health_endpoint
    check_database
    check_redis
    check_ssl_certificate
    check_vault
    check_disk_space
    check_memory
    check_queue_workers
    check_recent_errors
  )

  local check
  for check in "${checks[@]}"; do
    # Checks signal failure with a non-zero return, which under `set -e`
    # would abort the script before later checks and the report run.
    # The outcome is already stored in HEALTH_RESULTS / OVERALL_HEALTHY.
    "$check" || true
  done

  if [[ "$JSON_OUTPUT" == "true" ]]; then
    output_json
  else
    display_summary
  fi

  # Exit with appropriate code
  if [[ "$OVERALL_HEALTHY" == "true" ]]; then
    exit 0
  else
    exit 1
  fi
}

# Run main
main "$@"