Some checks failed
🚀 Build & Deploy Image / Determine Build Necessity (push) Failing after 10m14s
🚀 Build & Deploy Image / Build Runtime Base Image (push) Has been skipped
🚀 Build & Deploy Image / Build Docker Image (push) Has been skipped
🚀 Build & Deploy Image / Run Tests & Quality Checks (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Staging (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Production (push) Has been skipped
Security Vulnerability Scan / Check for Dependency Changes (push) Failing after 11m25s
Security Vulnerability Scan / Composer Security Audit (push) Has been cancelled
- Remove middleware reference from Gitea Traefik labels (caused routing issues)
- Optimize Gitea connection pool settings (MAX_IDLE_CONNS=30, authentication_timeout=180s)
- Add explicit service reference in Traefik labels
- Fix intermittent 504 timeouts by improving PostgreSQL connection handling

Fixes Gitea unreachability via git.michaelschiemer.de
436 lines
11 KiB
Bash
Executable File
436 lines
11 KiB
Bash
Executable File
#!/bin/bash
# Production Health Check Script
# Comprehensive health monitoring for all production services
#
# Usage:
#   ./scripts/health-check.sh [--verbose] [--json]
#
# Options:
#   --verbose  Show detailed output
#   --json     Output in JSON format
#
# Exit status:
#   0 when every critical check passed, 1 otherwise.

# Strict mode: abort on unhandled command failures (-e), on use of unset
# variables (-u) and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Absolute path of the parent of the directory that contains this script.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Defaults for the --verbose / --json command-line flags.
VERBOSE=false
JSON_OUTPUT=false

# Parse arguments
#
# parse_args sets the VERBOSE / JSON_OUTPUT flags from the given options.
# Arguments: "$@" - the script's command-line arguments
# Globals:   VERBOSE, JSON_OUTPUT (written)
# Unrecognized arguments are still ignored, but are reported on stderr so
# that a typo such as "--josn" does not go unnoticed (stderr keeps the
# --json output on stdout clean).
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --verbose)
        VERBOSE=true
        ;;
      --json)
        JSON_OUTPUT=true
        ;;
      *)
        printf 'Warning: ignoring unknown option: %s\n' "$arg" >&2
        ;;
    esac
  done
}

parse_args "$@"

# Colors
# ANSI color sequences kept in backslash form; they only turn into real
# escape characters when printed via `echo -e` or printf's %b.
GREEN="\e[32m"
YELLOW="\e[33m"
RED="\e[31m"
BLUE="\e[34m"
RESET="\e[0m"

# Health check results
# Maps a check name (e.g. "docker", "service_web") to its status string
# ("healthy", "warning", "unhealthy" or "critical"). Initialized to an empty
# array so that expanding it before any check ran is safe under `set -u`.
declare -A HEALTH_RESULTS=()
# Set to "false" by any check that finds a critical problem; decides the
# script's overall status and exit code.
OVERALL_HEALTHY=true

# Logging functions

# Print a timestamped progress message; silent when JSON output is requested.
# Globals:   JSON_OUTPUT, BLUE, RESET (read)
# Arguments: $1 - message text
# Only the color variables are escape-expanded (%b); the message itself is
# printed verbatim (%s) so backslashes in it are not mangled.
log() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    printf '%b[%s]%b %s\n' "$BLUE" "$(date +'%H:%M:%S')" "$RESET" "${1-}"
  fi
}

# Print a green success line; silent when JSON output is requested.
# Globals:   JSON_OUTPUT, GREEN, RESET (read)
# Arguments: $1 - message text (printed verbatim, not escape-expanded)
success() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    printf '%b✅ %s%b\n' "$GREEN" "${1-}" "$RESET"
  fi
}

# Print a yellow warning line; silent when JSON output is requested.
# Globals:   JSON_OUTPUT, YELLOW, RESET (read)
# Arguments: $1 - message text (printed verbatim, not escape-expanded)
warning() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    printf '%b⚠️ %s%b\n' "$YELLOW" "${1-}" "$RESET"
  fi
}

# Print a red error line on stdout; silent when JSON output is requested.
# Globals:   JSON_OUTPUT, RED, RESET (read)
# Arguments: $1 - message text (printed verbatim, not escape-expanded)
error() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    printf '%b❌ %s%b\n' "$RED" "${1-}" "$RESET"
  fi
}

# Check Docker daemon
#
# Records whether `docker info` succeeds.
# Globals: HEALTH_RESULTS[docker] (written),
#          OVERALL_HEALTHY (set to false on failure)
# Returns: 0 when the daemon answers, 1 otherwise. Because the script runs
#          under `set -e`, callers must guard the call (e.g. `|| true`) if
#          they want to continue after a failure.
check_docker() {
  log "Checking Docker daemon..."

  if ! docker info &>/dev/null; then
    HEALTH_RESULTS[docker]="unhealthy"
    error "Docker daemon is not running"
    OVERALL_HEALTHY=false
    return 1
  fi

  HEALTH_RESULTS[docker]="healthy"
  success "Docker daemon is running"
  return 0
}

# Check Docker Compose services
#
# For each expected Compose service, records "healthy" when
# `docker compose ps <service>` succeeds and its output contains "Up",
# otherwise "unhealthy".
# Globals: PROJECT_ROOT, VERBOSE (read); HEALTH_RESULTS[service_<name>]
#          (written); OVERALL_HEALTHY (set to false if any service is down)
# Side effect: changes the working directory to PROJECT_ROOT.
# Returns: 0 in all cases; the outcome is reported through the globals.
check_docker_services() {
  log "Checking Docker Compose services..."

  cd "$PROJECT_ROOT"

  local services=("web" "php" "db" "redis" "queue-worker")
  local all_healthy=true
  local service ps_output

  for service in "${services[@]}"; do
    # Capture the output instead of piping into `grep -q`: with pipefail,
    # grep exiting on the first match can kill docker with SIGPIPE and make a
    # running service look unhealthy.
    if ps_output=$(docker compose ps "$service") && [[ "$ps_output" == *Up* ]]; then
      HEALTH_RESULTS["service_${service}"]="healthy"
      if [[ "$VERBOSE" == "true" ]]; then
        success "$service is running"
      fi
    else
      HEALTH_RESULTS["service_${service}"]="unhealthy"
      error "$service is not running"
      all_healthy=false
    fi
  done

  if [[ "$all_healthy" == "true" ]]; then
    success "All Docker services are running"
  else
    error "Some Docker services are not running"
    OVERALL_HEALTHY=false
  fi
}

# Check web server response
#
# Requests https://localhost up to three times and records whether any
# attempt succeeded. curl runs with -f (HTTP errors count as failure),
# -s (silent) and -k (no TLS certificate verification).
# Globals: HEALTH_RESULTS[web_response] (written),
#          OVERALL_HEALTHY (set to false on failure)
# Returns: 0 on the first successful request, 1 after all attempts failed.
check_web_response() {
  log "Checking web server response..."

  local -r max_retries=3
  # Upper bound per request so a hung server cannot stall the health check.
  local -r timeout_seconds=10
  local attempt

  for (( attempt = 1; attempt <= max_retries; attempt++ )); do
    if curl -f -s -k --max-time "$timeout_seconds" \
        -H "User-Agent: Mozilla/5.0 (Health Check)" \
        "https://localhost" > /dev/null 2>&1; then
      HEALTH_RESULTS[web_response]="healthy"
      success "Web server is responding"
      return 0
    fi

    # Pause only between attempts; waiting after the last one is pointless.
    if (( attempt < max_retries )); then
      sleep 2
    fi
  done

  HEALTH_RESULTS[web_response]="unhealthy"
  error "Web server is not responding"
  OVERALL_HEALTHY=false
  return 1
}

# Check health endpoint
#
# Requests https://localhost/health once and records whether it succeeded.
# In verbose mode the first 20 lines of the response are printed, but only
# for human-readable output: like the log helpers, the dump is suppressed
# when --json is active so the JSON document on stdout stays parseable.
# Globals: VERBOSE, JSON_OUTPUT (read); HEALTH_RESULTS[health_endpoint]
#          (written); OVERALL_HEALTHY (set to false on failure)
# Returns: 0 when the request succeeded, 1 otherwise.
check_health_endpoint() {
  log "Checking /health endpoint..."

  # Kept local so the response body does not leak into the global scope.
  local response
  if ! response=$(curl -f -s -k --max-time 10 \
      -H "User-Agent: Mozilla/5.0 (Health Check)" \
      "https://localhost/health" 2>&1); then
    HEALTH_RESULTS[health_endpoint]="unhealthy"
    error "Health endpoint is not responding"
    OVERALL_HEALTHY=false
    return 1
  fi

  HEALTH_RESULTS[health_endpoint]="healthy"
  success "Health endpoint is responding"

  if [[ "$VERBOSE" == "true" && "$JSON_OUTPUT" == "false" ]]; then
    # A here-string instead of `echo | head`: with pipefail, head closing the
    # pipe early on a large body would fail the pipeline and abort the script.
    head -n 20 <<<"$response"
  fi

  return 0
}

# Check database connectivity
#
# Runs `pg_isready -U postgres` inside the "db" Compose service and records
# the result. In verbose mode it also logs the row count of pg_stat_activity.
# Globals: PROJECT_ROOT, VERBOSE (read); HEALTH_RESULTS[database] (written);
#          OVERALL_HEALTHY (set to false on failure)
# Side effect: changes the working directory to PROJECT_ROOT.
# Returns: 0 when pg_isready succeeds, 1 otherwise.
check_database() {
  log "Checking database connectivity..."

  cd "$PROJECT_ROOT"

  if ! docker compose exec -T db pg_isready -U postgres &>/dev/null; then
    HEALTH_RESULTS[database]="unhealthy"
    error "Database is not accepting connections"
    OVERALL_HEALTHY=false
    return 1
  fi

  HEALTH_RESULTS[database]="healthy"
  success "Database is accepting connections"

  # Get connection count
  if [[ "$VERBOSE" == "true" ]]; then
    local conn_count
    # Informational only: a failing query must not fail the check, so fall
    # back to "unknown" instead of logging an empty value. All whitespace
    # (padding and newlines from `psql -t`) is stripped.
    conn_count=$(docker compose exec -T db psql -U postgres -t \
      -c "SELECT count(*) FROM pg_stat_activity;" | tr -d '[:space:]') \
      || conn_count=""
    log "Active connections: ${conn_count:-unknown}"
  fi

  return 0
}

# Check Redis connectivity
#
# Runs `redis-cli ping` inside the "redis" Compose service and records the
# result based on its exit status. In verbose mode it also logs the
# used_memory_human field of `redis-cli info memory`.
# Globals: PROJECT_ROOT, VERBOSE (read); HEALTH_RESULTS[redis] (written);
#          OVERALL_HEALTHY (set to false on failure)
# Side effect: changes the working directory to PROJECT_ROOT.
# Returns: 0 when the ping command succeeds, 1 otherwise.
check_redis() {
  log "Checking Redis connectivity..."

  cd "$PROJECT_ROOT"

  if ! docker compose exec -T redis redis-cli ping &>/dev/null; then
    HEALTH_RESULTS[redis]="unhealthy"
    error "Redis is not responding"
    OVERALL_HEALTHY=false
    return 1
  fi

  HEALTH_RESULTS[redis]="healthy"
  success "Redis is responding"

  # Get Redis info
  if [[ "$VERBOSE" == "true" ]]; then
    local used_memory
    # Informational only: fall back to "unknown" rather than failing the
    # check. The carriage return that ends each info line is stripped.
    used_memory=$(docker compose exec -T redis redis-cli info memory \
      | awk -F: '$1 == "used_memory_human" { gsub(/\r/, "", $2); print $2 }') \
      || used_memory=""
    log "Redis memory usage: ${used_memory:-unknown}"
  fi

  return 0
}

# Check SSL certificate
#
# Runs `php console.php ssl:status` inside the "php" Compose service and
# records "healthy" when the command succeeds and its output contains
# "Certificate is valid"; anything else is recorded as "warning".
# The command runs once and its captured output is reused for the verbose
# dump. The dump is suppressed when --json is active so stdout stays valid
# JSON.
# Globals: PROJECT_ROOT, VERBOSE, JSON_OUTPUT (read);
#          HEALTH_RESULTS[ssl] (written). OVERALL_HEALTHY is not touched.
# Side effect: changes the working directory to PROJECT_ROOT.
# Returns: 0 when the certificate is reported valid, 1 otherwise.
check_ssl_certificate() {
  log "Checking SSL certificate..."

  cd "$PROJECT_ROOT"

  local ssl_status
  # Captured rather than piped into `grep -q`, which under pipefail can fail
  # the pipeline through SIGPIPE even though the text matched.
  if ssl_status=$(docker compose exec -T php php console.php ssl:status 2>/dev/null) \
      && [[ "$ssl_status" == *"Certificate is valid"* ]]; then
    HEALTH_RESULTS[ssl]="healthy"
    success "SSL certificate is valid"

    if [[ "$VERBOSE" == "true" && "$JSON_OUTPUT" == "false" ]]; then
      printf '%s\n' "$ssl_status"
    fi

    return 0
  fi

  HEALTH_RESULTS[ssl]="warning"
  warning "SSL certificate status unclear"
  return 1
}

# Check Vault connectivity
#
# Records whether `php console.php vault:list`, run inside the "php" Compose
# service, exits successfully. Its output is discarded.
# Globals: PROJECT_ROOT (read); HEALTH_RESULTS[vault] (written);
#          OVERALL_HEALTHY (set to false on failure)
# Side effect: changes the working directory to PROJECT_ROOT.
# Returns: 0 when the command succeeds, 1 otherwise.
check_vault() {
  log "Checking Vault connectivity..."

  cd "$PROJECT_ROOT"

  if ! docker compose exec -T php php console.php vault:list &>/dev/null; then
    HEALTH_RESULTS[vault]="unhealthy"
    error "Vault is not accessible"
    OVERALL_HEALTHY=false
    return 1
  fi

  HEALTH_RESULTS[vault]="healthy"
  success "Vault is accessible"
  return 0
}

# Check disk space
#
# Reads the usage percentage of the filesystem that holds PROJECT_ROOT and
# classifies it: below 80 healthy, below 90 warning, otherwise critical.
# Globals: PROJECT_ROOT (read); HEALTH_RESULTS[disk_space] (written);
#          OVERALL_HEALTHY (set to false when critical)
# Returns: 0 in all cases; the outcome is reported through the globals.
check_disk_space() {
  log "Checking disk space..."

  local disk_usage
  # `df -P` keeps each filesystem on a single line (plain `df -h` may wrap
  # long device names, shifting the columns). Field 5 is the capacity column.
  disk_usage=$(df -P "$PROJECT_ROOT" \
    | awk 'NR > 1 { pct = $5 } END { sub(/%$/, "", pct); print pct }') \
    || disk_usage=""

  # An empty or non-numeric value would otherwise compare as 0 and be
  # reported as healthy; surface it as a warning instead.
  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    HEALTH_RESULTS[disk_space]="warning"
    warning "Disk space usage could not be determined"
    return 0
  fi

  if [[ $disk_usage -lt 80 ]]; then
    HEALTH_RESULTS[disk_space]="healthy"
    success "Disk space usage: ${disk_usage}%"
  elif [[ $disk_usage -lt 90 ]]; then
    HEALTH_RESULTS[disk_space]="warning"
    warning "Disk space usage: ${disk_usage}% (consider cleanup)"
  else
    HEALTH_RESULTS[disk_space]="critical"
    error "Disk space usage: ${disk_usage}% (critical)"
    OVERALL_HEALTHY=false
  fi
}

# Check memory usage
#
# Computes used/total from the "Mem:" row of `free` as a rounded percentage
# and classifies it: below 80 healthy, below 90 warning, otherwise critical.
# Globals: HEALTH_RESULTS[memory] (written);
#          OVERALL_HEALTHY (set to false when critical)
# Returns: 0 in all cases; the outcome is reported through the globals.
check_memory() {
  log "Checking memory usage..."

  local mem_usage
  # LC_ALL=C pins the row label to "Mem:" (free translates it in some
  # locales); the $2 > 0 guard avoids a division by zero in awk.
  mem_usage=$(LC_ALL=C free \
    | awk '/^Mem:/ && $2 > 0 { printf "%.0f", $3 / $2 * 100 }') \
    || mem_usage=""

  # An empty or non-numeric value would otherwise compare as 0 and be
  # reported as healthy; surface it as a warning instead.
  if [[ ! "$mem_usage" =~ ^[0-9]+$ ]]; then
    HEALTH_RESULTS[memory]="warning"
    warning "Memory usage could not be determined"
    return 0
  fi

  if [[ $mem_usage -lt 80 ]]; then
    HEALTH_RESULTS[memory]="healthy"
    success "Memory usage: ${mem_usage}%"
  elif [[ $mem_usage -lt 90 ]]; then
    HEALTH_RESULTS[memory]="warning"
    warning "Memory usage: ${mem_usage}% (high)"
  else
    HEALTH_RESULTS[memory]="critical"
    error "Memory usage: ${mem_usage}% (critical)"
    OVERALL_HEALTHY=false
  fi
}

# Check queue worker status
#
# Counts the lines of `docker compose ps queue-worker` that contain "Up" and
# classifies the count: 2 or more healthy, exactly 1 warning, 0 unhealthy.
# Globals: PROJECT_ROOT (read); HEALTH_RESULTS[queue_workers] (written);
#          OVERALL_HEALTHY (set to false when no worker is running)
# Side effect: changes the working directory to PROJECT_ROOT.
# Returns: 0 in all cases; the outcome is reported through the globals.
check_queue_workers() {
  log "Checking queue workers..."

  cd "$PROJECT_ROOT"

  local worker_count
  # `grep -c` prints 0 but exits non-zero when nothing matches; `|| true`
  # keeps that from counting as a failure under set -e / pipefail.
  worker_count=$(docker compose ps queue-worker | grep -c "Up" || true)
  [[ "$worker_count" =~ ^[0-9]+$ ]] || worker_count=0

  if [[ $worker_count -ge 2 ]]; then
    HEALTH_RESULTS[queue_workers]="healthy"
    success "Queue workers: $worker_count running"
  elif [[ $worker_count -ge 1 ]]; then
    HEALTH_RESULTS[queue_workers]="warning"
    warning "Queue workers: only $worker_count running (expected 2)"
  else
    HEALTH_RESULTS[queue_workers]="unhealthy"
    error "Queue workers: none running"
    OVERALL_HEALTHY=false
  fi
}

# Check logs for errors
#
# Counts the lines among the last 1000 log lines of the "php" Compose
# service that contain "error", "exception" or "fatal" (case-insensitive)
# and classifies the count: below 5 healthy, below 20 warning, else critical.
# Globals: PROJECT_ROOT (read); HEALTH_RESULTS[recent_errors] (written);
#          OVERALL_HEALTHY (set to false when critical)
# Side effect: changes the working directory to PROJECT_ROOT.
# Returns: 0 in all cases; the outcome is reported through the globals.
check_recent_errors() {
  log "Checking recent errors in logs..."

  cd "$PROJECT_ROOT"

  local error_count
  # `grep -c` already prints "0" when nothing matches, but exits non-zero.
  # A `|| echo "0"` fallback would therefore append a second "0" and produce
  # the two-line value "0\n0", which breaks the numeric comparisons below.
  # `|| true` only neutralizes the exit status.
  error_count=$(docker compose logs --tail=1000 php 2>/dev/null \
    | grep -ciE 'error|exception|fatal' || true)
  [[ "$error_count" =~ ^[0-9]+$ ]] || error_count=0

  if [[ $error_count -lt 5 ]]; then
    HEALTH_RESULTS[recent_errors]="healthy"
    success "Recent errors: $error_count (last 1000 lines)"
  elif [[ $error_count -lt 20 ]]; then
    HEALTH_RESULTS[recent_errors]="warning"
    warning "Recent errors: $error_count (last 1000 lines)"
  else
    HEALTH_RESULTS[recent_errors]="critical"
    error "Recent errors: $error_count (last 1000 lines)"
    OVERALL_HEALTHY=false
  fi
}

# Output JSON report
#
# Writes a JSON object to stdout with a timestamp, the overall status and
# one "checks" entry per HEALTH_RESULTS key.
# Globals: OVERALL_HEALTHY, HEALTH_RESULTS (read)
# Keys are emitted in sorted order so repeated runs produce stable, diffable
# output (associative-array iteration order is otherwise unspecified).
# NOTE(review): keys and values are not JSON-escaped; every value written in
# this file is a plain word, so this is safe unless arbitrary strings are
# stored later.
output_json() {
  local overall_status="unhealthy"
  if [[ "$OVERALL_HEALTHY" == "true" ]]; then
    overall_status="healthy"
  fi

  echo "{"
  echo " \"timestamp\": \"$(date -Iseconds)\","
  echo " \"overall_status\": \"${overall_status}\","
  echo " \"checks\": {"

  if (( ${#HEALTH_RESULTS[@]} > 0 )); then
    local -a sorted_checks=()
    mapfile -t sorted_checks < <(printf '%s\n' "${!HEALTH_RESULTS[@]}" | LC_ALL=C sort)

    local check
    local separator=""
    for check in "${sorted_checks[@]}"; do
      # The separator goes in front of every entry but the first, so the
      # last entry is never followed by a comma.
      printf '%s "%s": "%s"' "$separator" "$check" "${HEALTH_RESULTS[$check]}"
      separator=$',\n'
    done
  fi

  echo ""
  echo " }"
  echo "}"
}

# Display summary
#
# Prints the human-readable report: how many checks ended healthy, warning
# or unhealthy ("critical" is counted as unhealthy), followed by the overall
# verdict and, when unhealthy, a list of recommended actions.
# Globals: HEALTH_RESULTS, OVERALL_HEALTHY, BLUE, GREEN, RED, RESET (read)
display_summary() {
  echo ""
  echo -e "${BLUE}========================================${RESET}"
  echo -e "${BLUE} Production Health Check Summary${RESET}"
  echo -e "${BLUE}========================================${RESET}"
  echo ""

  local healthy_count=0
  local warning_count=0
  local unhealthy_count=0
  local status

  # Tally the recorded statuses; values outside the four known ones are
  # not counted.
  for status in "${HEALTH_RESULTS[@]}"; do
    case "$status" in
      healthy)
        healthy_count=$((healthy_count + 1))
        ;;
      warning)
        warning_count=$((warning_count + 1))
        ;;
      unhealthy|critical)
        unhealthy_count=$((unhealthy_count + 1))
        ;;
    esac
  done

  echo "📊 Health Status:"
  echo " ✅ Healthy: $healthy_count"
  echo " ⚠️ Warnings: $warning_count"
  echo " ❌ Unhealthy: $unhealthy_count"
  echo ""

  if [[ "$OVERALL_HEALTHY" == "true" ]]; then
    echo -e "${GREEN}Overall Status: HEALTHY ✅${RESET}"
    echo ""
    echo "🎉 All critical systems are operational"
  else
    echo -e "${RED}Overall Status: UNHEALTHY ❌${RESET}"
    echo ""
    echo "⚠️ Critical issues detected - immediate action required"
    echo ""
    echo "📝 Recommended Actions:"
    echo " 1. Check Docker logs: docker compose logs -f --tail=100"
    echo " 2. Review service status: docker compose ps"
    echo " 3. Check system resources: df -h && free -h"
    echo " 4. Review recent deployments for issues"
  fi

  echo ""
  echo -e "${BLUE}========================================${RESET}"
}

# Main health check execution
#
# Runs every check in order, prints the JSON report or the human-readable
# summary, and exits 0 when OVERALL_HEALTHY is still "true", 1 otherwise.
# Globals: JSON_OUTPUT, OVERALL_HEALTHY (read)
main() {
  if [[ "$JSON_OUTPUT" == "false" ]]; then
    log "🔍 Starting production health check..."
    echo ""
  fi

  local -a checks=(
    check_docker
    check_docker_services
    check_web_response
    check_health_endpoint
    check_database
    check_redis
    check_ssl_certificate
    check_vault
    check_disk_space
    check_memory
    check_queue_workers
    check_recent_errors
  )

  local check
  for check in "${checks[@]}"; do
    # Several checks return 1 on failure. The script runs under `set -e`, so
    # an unguarded call would abort at the first failing check, before the
    # report is printed. Each check records its own result, so the return
    # status is deliberately ignored here.
    "$check" || true
  done

  if [[ "$JSON_OUTPUT" == "true" ]]; then
    output_json
  else
    display_summary
  fi

  # Exit with appropriate code
  if [[ "$OVERALL_HEALTHY" == "true" ]]; then
    exit 0
  else
    exit 1
  fi
}

# Run main
# Entry point: forwards the script's command-line arguments to main.
main "$@"