Files
michaelschiemer/.deployment-archive-20251030-111806/ansible/playbooks/health-check.yml

141 lines
4.3 KiB
YAML

---
# Ansible Playbook: Production Health Check
# Purpose: Comprehensive health verification for production deployment
# Usage: ansible-playbook -i inventory/production.yml playbooks/health-check.yml
#
# Flow:
#   1. Verify the Docker Swarm node is active and that the stack's web and
#      queue-worker services each have a task in the Running state
#      (hard failures).
#   2. Poll the application /health endpoint until it answers with 200/302
#      (hard failure once the retry budget is exhausted).
#   3. Probe the database, Redis and queue health endpoints (best effort:
#      failures are recorded and reported, but do not abort the play).
#   4. Print a summary and an overall assessment.
#
# NOTE: Docker's --format option uses Go templates ("{{.Name}}"), which clash
# with Jinja2 delimiters. They are wrapped in {% raw %}...{% endraw %} so that
# Ansible passes them through to the docker CLI untouched.
- name: Production Health Check
  hosts: production_server
  become: false

  vars:
    app_url: "https://michaelschiemer.de"
    stack_name: "framework"
    # Per-request timeout (seconds) used by every HTTP probe below.
    health_timeout: 30
    # Retry budget for the main /health probe (attempts are 3 seconds apart).
    max_retries: 10
    # TLS certificate verification for the HTTP probes. Disabled to preserve
    # the previous behaviour of this playbook; set to true once the target
    # host trusts the certificate chain served by app_url.
    health_validate_certs: false

  tasks:
    # The value is quoted because it contains ": ", which YAML does not allow
    # inside a plain (unquoted) scalar.
    - name: Check Docker Swarm status
      shell: 'docker info | grep "Swarm: active"'
      register: swarm_status
      failed_when: swarm_status.rc != 0
      changed_when: false

    # No pipes or redirects here, so the command module is sufficient.
    - name: Check running services
      command: >-
        docker service ls
        --filter "name={{ stack_name }}"
        --format "{% raw %}{{.Name}} {{.Replicas}}{% endraw %}"
      register: service_list
      changed_when: false

    - name: Display service status
      debug:
        msg: "{{ service_list.stdout_lines }}"

    # Only the first task line is inspected; an empty result (service missing
    # or docker error) is caught by the following fail task.
    - name: Verify web service is running
      shell: |
        docker service ps {{ stack_name }}_web \
          --filter "desired-state=running" \
          --format "{% raw %}{{.CurrentState}}{% endraw %}" | head -1
      register: web_state
      changed_when: false

    - name: Fail if web service not running
      fail:
        msg: "Web service is not in Running state: {{ web_state.stdout }}"
      when: "'Running' not in web_state.stdout"

    - name: Verify worker service is running
      shell: |
        docker service ps {{ stack_name }}_queue-worker \
          --filter "desired-state=running" \
          --format "{% raw %}{{.CurrentState}}{% endraw %}" | head -1
      register: worker_state
      changed_when: false

    - name: Fail if worker service not running
      fail:
        msg: "Worker service is not in Running state: {{ worker_state.stdout }}"
      when: "'Running' not in worker_state.stdout"

    # Hard requirement: the play aborts here if the application never answers
    # with an accepted status within the retry budget.
    - name: Wait for application to be ready
      uri:
        url: "{{ app_url }}/health"
        validate_certs: "{{ health_validate_certs }}"
        status_code: [200, 302]
        timeout: "{{ health_timeout }}"
      register: health_response
      retries: "{{ max_retries }}"
      delay: 3
      until: health_response.status in [200, 302]

    # The three dependency probes below are best effort: a failure is kept in
    # the registered result and reported later instead of aborting the play.
    - name: Check database connectivity
      uri:
        url: "{{ app_url }}/health/database"
        validate_certs: "{{ health_validate_certs }}"
        status_code: 200
        timeout: "{{ health_timeout }}"
      register: db_health
      ignore_errors: true

    - name: Check Redis connectivity
      uri:
        url: "{{ app_url }}/health/redis"
        validate_certs: "{{ health_validate_certs }}"
        status_code: 200
        timeout: "{{ health_timeout }}"
      register: redis_health
      ignore_errors: true

    - name: Check queue system
      uri:
        url: "{{ app_url }}/health/queue"
        validate_certs: "{{ health_validate_certs }}"
        status_code: 200
        timeout: "{{ health_timeout }}"
      register: queue_health
      ignore_errors: true

    - name: Get service replicas count
      command: >-
        docker service ls
        --filter "name={{ stack_name }}_web"
        --format "{% raw %}{{.Replicas}}{% endraw %}"
      register: replicas
      changed_when: false

    # Counts lines of the `docker service ps` task listing that contain
    # "Error". `grep -c` exits non-zero when the count is 0, hence `|| true`.
    - name: Check for service errors
      shell: |
        docker service ps {{ stack_name }}_web --filter "desired-state=running" | grep -c Error || true
      register: error_count
      changed_when: false

    - name: Warn if errors detected
      debug:
        msg: "⚠️ Warning: {{ error_count.stdout }} errors detected in service logs"
      when: error_count.stdout | int > 0

    - name: Display health check summary
      debug:
        msg: |
          ✅ Health Check Summary:
          Services:
          - Web Service: {{ web_state.stdout }}
          - Worker Service: {{ worker_state.stdout }}
          - Replicas: {{ replicas.stdout }}
          Endpoints:
          - Application: {{ health_response.status }}
          - Database: {{ db_health.status | default('SKIPPED') }}
          - Redis: {{ redis_health.status | default('SKIPPED') }}
          - Queue: {{ queue_health.status | default('SKIPPED') }}
          Errors: {{ error_count.stdout }}

    # Surfaces best-effort probe failures that would otherwise only be visible
    # as ignored errors further up in the output.
    - name: Warn if dependency health checks failed
      debug:
        msg: "⚠️ Warning: one or more dependency health checks (database, Redis, queue) failed"
      when: >-
        db_health is failed
        or redis_health is failed
        or queue_health is failed

    # "PASSED" is only reported when every probe succeeded, including the
    # best-effort dependency probes.
    - name: Overall health assessment
      debug:
        msg: "✅ All health checks PASSED"
      when:
        - health_response.status in [200, 302]
        - db_health is not failed
        - redis_health is not failed
        - queue_health is not failed
        - error_count.stdout | int == 0

    # Safety net: the wait task above already aborts the play when the
    # application never becomes healthy, so this normally does not trigger.
    - name: Fail if critical health checks failed
      fail:
        msg: "❌ Health check FAILED - manual intervention required"
      when: health_response.status not in [200, 302]