---
# Ansible Playbook: Production Health Check
# Purpose: Comprehensive health verification for production deployment
# Usage: ansible-playbook -i inventory/production.yml playbooks/health-check.yml
#
# Flow:
#   1. Verify Docker Swarm is active and the stack's web / queue-worker
#      services report a Running task.
#   2. Poll the application /health endpoint until it answers 200 or 302.
#   3. Probe the database / Redis / queue endpoints (non-fatal).
#   4. Print a summary and fail the play if the application endpoint is bad.
#
# Docker `--format` arguments are Go templates that use the same `{{ }}`
# delimiters as Jinja2, so they are wrapped in `{% raw %}...{% endraw %}` to
# reach the docker CLI unmodified.

- name: Production Health Check
  hosts: production_server
  become: false

  vars:
    app_url: "https://michaelschiemer.de"
    stack_name: "framework"
    # Per-request timeout (seconds) passed to every uri probe.
    health_timeout: 30
    # Number of attempts for the main /health readiness poll.
    max_retries: 10

  tasks:
    - name: Check Docker Swarm status
      # grep exits non-zero when the "Swarm: active" line is absent, which
      # fails the task.
      shell: docker info | grep "Swarm: active"
      register: swarm_status
      failed_when: swarm_status.rc != 0
      changed_when: false

    - name: Check running services
      # No pipes or redirects here, so `command` is sufficient.
      command: >-
        docker service ls
        --filter "name={{ stack_name }}"
        --format "{% raw %}{{.Name}} {{.Replicas}}{% endraw %}"
      register: service_list
      changed_when: false

    - name: Display service status
      debug:
        msg: "{{ service_list.stdout_lines }}"

    - name: Verify web service is running
      # Only the first listed task's state is inspected (head -1).
      shell: |
        docker service ps {{ stack_name }}_web \
          --filter "desired-state=running" \
          --format "{% raw %}{{.CurrentState}}{% endraw %}" | head -1
      register: web_state
      changed_when: false

    - name: Fail if web service not running
      fail:
        msg: "Web service is not in Running state: {{ web_state.stdout }}"
      when: "'Running' not in web_state.stdout"

    - name: Verify worker service is running
      shell: |
        docker service ps {{ stack_name }}_queue-worker \
          --filter "desired-state=running" \
          --format "{% raw %}{{.CurrentState}}{% endraw %}" | head -1
      register: worker_state
      changed_when: false

    - name: Fail if worker service not running
      fail:
        msg: "Worker service is not in Running state: {{ worker_state.stdout }}"
      when: "'Running' not in worker_state.stdout"

    - name: Wait for application to be ready
      # NOTE(review): certificate validation is disabled for all uri probes in
      # this playbook - confirm this is intentional for an https endpoint.
      uri:
        url: "{{ app_url }}/health"
        validate_certs: false
        status_code: [200, 302]
        timeout: "{{ health_timeout }}"
      register: health_response
      retries: "{{ max_retries }}"
      delay: 3
      until: health_response.status in [200, 302]

    # The three subsystem probes below are best-effort: a failure is recorded
    # in the registered variable and shown in the summary, but does not abort
    # the play.
    - name: Check database connectivity
      uri:
        url: "{{ app_url }}/health/database"
        validate_certs: false
        status_code: 200
        timeout: "{{ health_timeout }}"
      register: db_health
      ignore_errors: true

    - name: Check Redis connectivity
      uri:
        url: "{{ app_url }}/health/redis"
        validate_certs: false
        status_code: 200
        timeout: "{{ health_timeout }}"
      register: redis_health
      ignore_errors: true

    - name: Check queue system
      uri:
        url: "{{ app_url }}/health/queue"
        validate_certs: false
        status_code: 200
        timeout: "{{ health_timeout }}"
      register: queue_health
      ignore_errors: true

    - name: Get service replicas count
      command: >-
        docker service ls
        --filter "name={{ stack_name }}_web"
        --format "{% raw %}{{.Replicas}}{% endraw %}"
      register: replicas
      changed_when: false

    - name: Check for service errors
      # `grep -c` prints the match count but exits 1 when the count is 0;
      # `|| true` keeps the task green so the count can be evaluated below.
      shell: |
        docker service ps {{ stack_name }}_web --filter "desired-state=running" | grep -c Error || true
      register: error_count
      changed_when: false

    - name: Warn if errors detected
      debug:
        msg: "⚠️ Warning: {{ error_count.stdout }} errors detected in service logs"
      when: error_count.stdout | int > 0

    - name: Display health check summary
      debug:
        msg: |
          ✅ Health Check Summary:

          Services:
          - Web Service: {{ web_state.stdout }}
          - Worker Service: {{ worker_state.stdout }}
          - Replicas: {{ replicas.stdout }}

          Endpoints:
          - Application: {{ health_response.status }}
          - Database: {{ db_health.status | default('SKIPPED') }}
          - Redis: {{ redis_health.status | default('SKIPPED') }}
          - Queue: {{ queue_health.status | default('SKIPPED') }}

          Errors: {{ error_count.stdout }}

    - name: Overall health assessment
      # Only claim "all passed" when the best-effort subsystem probes also
      # returned 200; they are ignored for play failure but not for this
      # message.
      debug:
        msg: "✅ All health checks PASSED"
      when:
        - health_response.status in [200, 302]
        - db_health.status | default(0) == 200
        - redis_health.status | default(0) == 200
        - queue_health.status | default(0) == 200
        - error_count.stdout | int == 0

    - name: Fail if critical health checks failed
      # Defensive backstop: the readiness poll above already fails the play
      # once its retries are exhausted.
      fail:
        msg: "❌ Health check FAILED - manual intervention required"
      when: health_response.status not in [200, 302]