Files
michaelschiemer/.deployment-archive-20251030-111806/scripts/deployment-diagnostics.sh

362 lines
10 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
#
# Deployment Diagnostics Script
# Purpose: Comprehensive diagnostics for troubleshooting deployment issues
#
# Usage:
# ./scripts/deployment-diagnostics.sh # Run all diagnostics
# ./scripts/deployment-diagnostics.sh --quick # Quick checks only
# ./scripts/deployment-diagnostics.sh --verbose # Verbose output
#
# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute directory containing this script, and the directory two levels up.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
PROJECT_ROOT="$(
  cd "${SCRIPT_DIR}/../.." && pwd
)"

# Deployment target.
PRODUCTION_SERVER="94.16.110.151"
REGISTRY="git.michaelschiemer.de:5000"
STACK_NAME="framework"
IMAGE="framework"

# Run-mode flags; both start out disabled.
QUICK_MODE=false
VERBOSE=false

# Colors
# Stored as unexpanded escape sequences; they are expanded when printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'
#######################################
# Print an error-level message.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed literally
# Outputs:   a red "✗" marker, a space, the message and a newline on stdout
#######################################
log_error() {
  # %b expands the escape sequences held in the colour variables; %s keeps
  # any backslashes in the message itself untouched.
  printf '%b %s\n' "${RED}✗${NC}" "$1"
}
#######################################
# Print a success-level message.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed literally
# Outputs:   a green "✓" marker, a space, the message and a newline on stdout
#######################################
log_success() {
  # %b expands the escape sequences held in the colour variables; %s keeps
  # any backslashes in the message itself untouched.
  printf '%b %s\n' "${GREEN}✓${NC}" "$1"
}
#######################################
# Print a warning-level message.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed literally
# Outputs:   a yellow "⚠" marker, a space, the message and a newline on stdout
#######################################
log_warn() {
  # %b expands the escape sequences held in the colour variables; %s keeps
  # any backslashes in the message itself untouched.
  printf '%b %s\n' "${YELLOW}⚠${NC}" "$1"
}
#######################################
# Print an informational message.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text, printed literally
# Outputs:   a blue "ℹ" marker, a space, the message and a newline on stdout
#######################################
log_info() {
  # %b expands the escape sequences held in the colour variables; %s keeps
  # any backslashes in the message itself untouched.
  printf '%b %s\n' "${BLUE}ℹ${NC}" "$1"
}
#######################################
# Print a section heading preceded by a blank line.
# Globals:   CYAN, NC (read)
# Arguments: $1 - section title
# Outputs:   an empty line, then the title framed by "═══" in cyan, on stdout
#######################################
log_section() {
  local heading="═══ $1 ═══"
  # The leading \n produces the blank separator line.
  echo -e "\n${CYAN}${heading}${NC}"
}
# SSH helper
#######################################
# Run a command on the production server as user "deploy".
# Globals:   PRODUCTION_SERVER (read)
# Arguments: $@ - command (and arguments) passed to ssh for remote execution
# Outputs:   whatever ssh writes to stdout; stderr is discarded. When ssh
#            exits non-zero, the line "SSH_FAILED" is appended to stdout.
# Returns:   always 0; callers detect failure through the SSH_FAILED sentinel
#######################################
ssh_exec() {
  # ConnectTimeout bounds the wait for an unreachable host so a dead server
  # cannot stall every single diagnostic call.
  ssh -i ~/.ssh/production \
    -o ConnectTimeout=10 \
    deploy@"${PRODUCTION_SERVER}" "$@" 2> /dev/null \
    || echo "SSH_FAILED"
}
# Check local prerequisites
#######################################
# Report on the local toolchain: git repository state, Docker, Ansible and
# the production SSH key.
# Globals:   none written
# Arguments: none
# Outputs:   one log line per finding on stdout
# Returns:   0
#######################################
check_local() {
  local branch docker_version ansible_banner ansible_version
  # Dotted version number, optionally followed by a suffix such as "rc1".
  local -r version_re='[0-9]+(\.[0-9]+)+[[:alnum:].]*'

  log_section "Local Environment"

  # Git status
  if git status &> /dev/null; then
    log_success "Git repository detected"
    # rev-parse fails when HEAD cannot be resolved (for example a repository
    # without commits); report "unknown" instead of aborting under set -e.
    branch=$(git rev-parse --abbrev-ref HEAD 2> /dev/null) || branch="unknown"
    log_info "Current branch: ${branch}"
    if [[ -n "$(git status --porcelain)" ]]; then
      log_warn "Working directory has uncommitted changes"
    else
      log_success "Working directory is clean"
    fi
  else
    log_error "Not in a git repository"
  fi

  # Docker
  if command -v docker &> /dev/null; then
    log_success "Docker installed"
    docker_version=$(docker --version | cut -d' ' -f3 | tr -d ',') \
      || docker_version="unknown"
    log_info "Version: ${docker_version}"
  else
    log_error "Docker not found"
  fi

  # Ansible
  if command -v ansible-playbook &> /dev/null; then
    log_success "Ansible installed"
    ansible_banner=$(ansible-playbook --version) || ansible_banner=""
    # Only the first line carries the version. It is either
    # "ansible-playbook 2.9.6" or "ansible-playbook [core 2.15.3]", so pick
    # the number by pattern rather than by field position.
    ansible_banner=${ansible_banner%%$'\n'*}
    if [[ "$ansible_banner" =~ $version_re ]]; then
      ansible_version=${BASH_REMATCH[0]}
    else
      ansible_version="unknown"
    fi
    log_info "Version: ${ansible_version}"
  else
    log_error "Ansible not found"
  fi

  # SSH key
  if [[ -f ~/.ssh/production ]]; then
    log_success "Production SSH key found"
  else
    log_error "Production SSH key not found at ~/.ssh/production"
  fi
}
# Check SSH connectivity
#######################################
# Verify that a command can be executed on the production server.
# Globals:   PRODUCTION_SERVER (read)
# Arguments: none
# Outputs:   log lines on stdout
# Returns:   0 when the remote echo came back as "OK", 1 otherwise
#######################################
check_ssh() {
  local probe

  log_section "SSH Connectivity"

  # ssh_exec never fails itself; anything other than the echoed "OK" means
  # the connection or the remote shell did not work.
  probe=$(ssh_exec "echo 'OK'")
  if [[ "$probe" != "OK" ]]; then
    log_error "Cannot connect to production server via SSH"
    log_info "Check: ssh -i ~/.ssh/production deploy@${PRODUCTION_SERVER}"
    return 1
  fi
  log_success "SSH connection to production server"
}
# Check Docker Swarm
#######################################
# Report whether Docker Swarm is active on the server and list its nodes.
# Globals:   none written
# Arguments: none
# Outputs:   log lines on stdout
# Returns:   0 when the swarm state is "active", 1 otherwise
#######################################
check_docker_swarm() {
  local swarm_state managers workers

  log_section "Docker Swarm Status"

  swarm_state=$(ssh_exec "docker info | grep 'Swarm:' | awk '{print \$2}'")
  if [[ "$swarm_state" != "active" ]]; then
    log_error "Docker Swarm is not active"
    return 1
  fi
  log_success "Docker Swarm is active"

  # Manager nodes
  managers=$(ssh_exec "docker node ls --filter role=manager --format '{{.Hostname}}'")
  if [[ "$managers" == *SSH_FAILED ]]; then
    # Do not present the failure sentinel as if it were a hostname.
    managers="unknown"
  fi
  # One hostname per line comes back; join them so the log stays one line.
  managers=${managers//$'\n'/, }
  log_info "Manager nodes: ${managers}"

  # Worker nodes
  workers=$(ssh_exec "docker node ls --filter role=worker --format '{{.Hostname}}' | wc -l")
  workers=${workers//[[:space:]]/}
  if [[ ! "$workers" =~ ^[0-9]+$ ]]; then
    workers="unknown"
  fi
  log_info "Worker nodes: ${workers}"
}
# Check services
#######################################
# List the stack's swarm services and report whether the web and
# queue-worker services have a running task.
# Globals:   STACK_NAME (read)
# Arguments: none
# Outputs:   log lines on stdout
# Returns:   1 when the service list is empty or could not be fetched,
#            0 otherwise
#######################################
check_services() {
  local services entry web_state worker_state

  log_section "Framework Services"

  # List services
  services=$(ssh_exec "docker service ls --filter 'name=${STACK_NAME}' --format '{{.Name}}: {{.Replicas}}'")
  if [[ "$services" == *SSH_FAILED ]]; then
    # The sentinel is non-empty text; without this guard it would be
    # reported as a found service.
    log_error "Cannot list framework services (remote command failed)"
    return 1
  fi
  if [[ -z "$services" ]]; then
    log_error "No framework services found"
    return 1
  fi
  log_success "Framework services found"
  while IFS= read -r entry; do
    log_info "$entry"
  done <<< "$services"

  # Check web service
  web_state=$(ssh_exec "docker service ps ${STACK_NAME}_web --filter 'desired-state=running' --format '{{.CurrentState}}' | head -1")
  if [[ "$web_state" == *Running* ]]; then
    log_success "Web service is running"
  else
    log_error "Web service is not running: ${web_state}"
  fi

  # Check worker service
  worker_state=$(ssh_exec "docker service ps ${STACK_NAME}_queue-worker --filter 'desired-state=running' --format '{{.CurrentState}}' | head -1")
  if [[ "$worker_state" == *Running* ]]; then
    log_success "Queue worker is running"
  else
    log_error "Queue worker is not running: ${worker_state}"
  fi
}
# Check Docker images
#######################################
# Report the image configured for the web service and list up to five
# image tags present on the server.
# Globals:   STACK_NAME, REGISTRY, IMAGE (read)
# Arguments: none
# Outputs:   log lines and the raw image list on stdout
# Returns:   0
#######################################
check_images() {
  local current_image available

  log_section "Docker Images"

  # Current running image
  current_image=$(ssh_exec "docker service inspect ${STACK_NAME}_web --format '{{.Spec.TaskTemplate.ContainerSpec.Image}}'")
  # The failure sentinel is non-empty, so it has to be excluded explicitly
  # or it would be announced as the current image.
  if [[ -n "$current_image" && "$current_image" != *SSH_FAILED ]]; then
    log_success "Current image: ${current_image}"
  else
    log_error "Cannot determine current image"
  fi

  # Available images (last 5)
  log_info "Available images (last 5):"
  available=$(ssh_exec "docker images ${REGISTRY}/${IMAGE} --format ' {{.Tag}} ({{.CreatedAt}})' | grep -v buildcache | head -5")
  if [[ "$available" == *SSH_FAILED ]]; then
    log_warn "Cannot list images (remote command failed)"
  elif [[ -n "$available" ]]; then
    printf '%s\n' "$available"
  fi
}
# Check networks
#######################################
# List Docker networks whose name matches the stack name.
# Globals:   STACK_NAME (read)
# Arguments: none
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_networks() {
  local networks entry

  log_section "Docker Networks"

  networks=$(ssh_exec "docker network ls --filter 'name=${STACK_NAME}' --format '{{.Name}}: {{.Driver}}'")
  if [[ "$networks" == *SSH_FAILED ]]; then
    # Keep the failure sentinel from being listed as a network.
    log_warn "Cannot list framework networks (remote command failed)"
  elif [[ -n "$networks" ]]; then
    log_success "Framework networks found"
    while IFS= read -r entry; do
      log_info "$entry"
    done <<< "$networks"
  else
    log_warn "No framework-specific networks found"
  fi
}
# Check volumes
#######################################
# List Docker volumes whose name matches the stack name.
# Globals:   STACK_NAME (read)
# Arguments: none
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_volumes() {
  local volumes entry

  log_section "Docker Volumes"

  volumes=$(ssh_exec "docker volume ls --filter 'name=${STACK_NAME}' --format '{{.Name}}'")
  if [[ "$volumes" == *SSH_FAILED ]]; then
    # Keep the failure sentinel from being listed as a volume.
    log_warn "Cannot list framework volumes (remote command failed)"
  elif [[ -n "$volumes" ]]; then
    log_success "Framework volumes found"
    while IFS= read -r entry; do
      log_info "$entry"
    done <<< "$volumes"
  else
    log_warn "No framework-specific volumes found"
  fi
}
# Check application health
#######################################
# Helper: print the three-digit HTTP status for a URL, "000" when none
# could be obtained.
# Arguments: $1 - URL to request
# Outputs:   the status code and a newline on stdout
# Returns:   0
#######################################
_http_status() {
  local url=$1
  local code

  # curl's -w already writes "000" when the request fails, so the exit
  # status is ignored here instead of appending a second "000" to it.
  # --max-time keeps an unresponsive endpoint from blocking the report.
  code=$(curl -k -s -o /dev/null --max-time 10 -w "%{http_code}" "$url") || true
  if [[ ! "$code" =~ ^[0-9]{3}$ ]]; then
    code="000"
  fi
  printf '%s\n' "$code"
}

#######################################
# Probe the application's health endpoints over HTTPS.
# Arguments: none
# Outputs:   one log line per endpoint on stdout
# Returns:   0
#######################################
check_app_health() {
  local -r base_url="https://michaelschiemer.de"
  local app_code db_code redis_code

  log_section "Application Health"

  # Main health endpoint
  app_code=$(_http_status "${base_url}/health")
  if [[ "$app_code" == "200" || "$app_code" == "302" ]]; then
    log_success "Application health endpoint: ${app_code}"
  else
    log_error "Application health endpoint failed: ${app_code}"
  fi

  # Database health
  db_code=$(_http_status "${base_url}/health/database")
  if [[ "$db_code" == "200" ]]; then
    log_success "Database connectivity: OK"
  else
    log_warn "Database connectivity: ${db_code}"
  fi

  # Redis health
  redis_code=$(_http_status "${base_url}/health/redis")
  if [[ "$redis_code" == "200" ]]; then
    log_success "Redis connectivity: OK"
  else
    log_warn "Redis connectivity: ${redis_code}"
  fi
}
# Check Docker secrets
#######################################
# Report how many Docker secrets exist on the server.
# Arguments: none
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_secrets() {
  local secret_count

  log_section "Docker Secrets"

  secret_count=$(ssh_exec "docker secret ls --format '{{.Name}}' | wc -l")
  secret_count=${secret_count//[[:space:]]/}
  # Validate before comparing: a non-numeric reply such as the SSH_FAILED
  # sentinel would be evaluated as a variable name in an arithmetic test
  # and abort the script under "set -u".
  if [[ ! "$secret_count" =~ ^[0-9]+$ ]]; then
    log_warn "Cannot determine Docker secrets (remote command failed)"
  elif ((10#${secret_count} > 0)); then
    log_success "Docker secrets configured: ${secret_count} secrets"
  else
    log_warn "No Docker secrets found"
  fi
}
# Check recent logs
#######################################
# Print the last 20 log lines of the web service.
# Globals:   STACK_NAME (read)
# Arguments: none
# Outputs:   log lines followed by the raw service log on stdout
# Returns:   0
#######################################
check_logs() {
  local logs

  log_section "Recent Logs"
  log_info "Last 20 lines from web service:"

  # ssh_exec discards stderr, so fold the remote command's stderr into its
  # stdout; otherwise anything docker writes to stderr is lost.
  logs=$(ssh_exec "docker service logs ${STACK_NAME}_web --tail 20 2>&1")
  if [[ "$logs" == *SSH_FAILED ]]; then
    log_warn "Cannot fetch web service logs (remote command failed)"
    # Drop the sentinel but still show whatever the remote side printed.
    logs=${logs%SSH_FAILED}
    logs=${logs%$'\n'}
  fi
  if [[ -n "$logs" ]]; then
    printf '%s\n' "$logs"
  fi
}
# Check Gitea runner
#######################################
# Report the state of the gitea-runner systemd unit on the server.
# Arguments: none
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_gitea_runner() {
  local runner_status

  log_section "Gitea Actions Runner"

  runner_status=$(ssh_exec "systemctl is-active gitea-runner 2>/dev/null || echo 'not-found'")
  if [[ "$runner_status" == *SSH_FAILED ]]; then
    log_warn "Cannot query Gitea runner status (remote command failed)"
    return 0
  fi

  # When systemctl prints a state and exits non-zero, the remote fallback
  # appends a second line ("not-found"). Only the first line is the state.
  runner_status=${runner_status%%$'\n'*}

  case "$runner_status" in
    active)
      log_success "Gitea runner service is active"
      ;;
    not-found)
      log_warn "Gitea runner service not found (may not be installed yet)"
      ;;
    *)
      log_error "Gitea runner service is ${runner_status}"
      ;;
  esac
}
# Resource usage
#######################################
# Report disk, memory and Docker disk usage of the server.
# Arguments: none
# Outputs:   log lines and the raw "docker system df" table on stdout
# Returns:   0
#######################################
check_resources() {
  local disk_usage memory_usage docker_df

  log_section "Resource Usage"

  # Disk usage
  disk_usage=$(ssh_exec "df -h / | tail -1 | awk '{print \$5}'")
  # An empty reply or the failure sentinel is shown as "unknown" rather
  # than being printed as if it were a measurement.
  if [[ -z "$disk_usage" || "$disk_usage" == *SSH_FAILED ]]; then
    disk_usage="unknown"
  fi
  log_info "Disk usage: ${disk_usage}"

  # Memory usage
  memory_usage=$(ssh_exec "free -h | grep Mem | awk '{print \$3\"/\"\$2}'")
  if [[ -z "$memory_usage" || "$memory_usage" == *SSH_FAILED ]]; then
    memory_usage="unknown"
  fi
  log_info "Memory usage: ${memory_usage}"

  # Docker disk usage
  log_info "Docker disk usage:"
  docker_df=$(ssh_exec "docker system df")
  if [[ "$docker_df" == *SSH_FAILED ]]; then
    log_warn "Cannot read Docker disk usage (remote command failed)"
  elif [[ -n "$docker_df" ]]; then
    printf '%s\n' "$docker_df"
  fi
}
# Parse arguments
#######################################
# Translate command-line options into the run-mode flags.
# Globals:   QUICK_MODE, VERBOSE (written)
# Arguments: $@ - the script's command-line arguments
# Outputs:   a warning on stderr for every unrecognised argument
# Returns:   0
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --quick)
        QUICK_MODE=true
        ;;
      --verbose)
        VERBOSE=true
        ;;
      *)
        # A mistyped option is still ignored, but no longer silently.
        printf 'Warning: ignoring unknown option: %s\n' "$arg" >&2
        ;;
    esac
  done
}
parse_args "$@"
# Main diagnostics
main() {
echo ""
echo -e "${CYAN}╔════════════════════════════════════════════════════════╗${NC}"
echo -e "${CYAN}║ DEPLOYMENT DIAGNOSTICS REPORT ║${NC}"
echo -e "${CYAN}╚════════════════════════════════════════════════════════╝${NC}"
echo ""
check_local
check_ssh || { log_error "SSH connectivity failed - cannot continue"; exit 1; }
check_docker_swarm
check_services
check_images
check_app_health
if [[ "$QUICK_MODE" == false ]]; then
check_networks
check_volumes
check_secrets
check_gitea_runner
check_resources
if [[ "$VERBOSE" == true ]]; then
check_logs
fi
fi
echo ""
echo -e "${CYAN}╔════════════════════════════════════════════════════════╗${NC}"
echo -e "${CYAN}║ DIAGNOSTICS COMPLETED ║${NC}"
echo -e "${CYAN}╚════════════════════════════════════════════════════════╝${NC}"
echo ""
log_info "For detailed logs: ./scripts/deployment-diagnostics.sh --verbose"
log_info "For service recovery: ./scripts/service-recovery.sh recover"
echo ""
}
# Entry point: run the report, forwarding the script's command-line arguments.
main "$@"