141 lines
4.3 KiB
YAML
141 lines
4.3 KiB
YAML
---
|
|
# Ansible Playbook: Production Health Check
|
|
# Purpose: Comprehensive health verification for production deployment
|
|
# Usage: ansible-playbook -i inventory/production.yml playbooks/health-check.yml
|
|
|
|
- name: Production Health Check
  hosts: production_server
  # No privilege escalation is requested for any task in this play.
  become: false
  vars:
    # Base URL; the /health* endpoint paths are appended to it.
    app_url: "https://michaelschiemer.de"
    # Prefix of the Swarm service names queried below (<stack_name>_web, ...).
    stack_name: "framework"
    # Passed as the timeout of every uri request.
    health_timeout: 30
    # Retry budget of the application readiness probe.
    max_retries: 10

  tasks:
|
|
- name: Check Docker Swarm status
  # The command lives in a block scalar because an unquoted plain scalar may
  # not contain ": " (as in "Swarm: active"); YAML would try to read it as a
  # nested mapping and refuse to load the playbook.
  shell: |
    docker info | grep "Swarm: active"
  register: swarm_status
  # grep exits non-zero when no "Swarm: active" line is present.
  failed_when: swarm_status.rc != 0
  # Read-only query: never report a change.
  changed_when: false
|
|
|
|
- name: Check running services
  # docker's --format argument is a Go template. Its braces are wrapped in a
  # Jinja2 raw section so Ansible passes them through verbatim (backtick
  # quoting is Go-template syntax and is a syntax error in Jinja2).
  # stack_name stays outside the raw section and is still expanded.
  shell: >-
    docker service ls
    --filter "name={{ stack_name }}"
    --format "{% raw %}{{.Name}} {{.Replicas}}{% endraw %}"
  register: service_list
  # Read-only query: never report a change.
  changed_when: false
|
|
|
|
- name: Display service status
  # Echo the registered service listing, one list element per output line.
  debug:
    msg: '{{ service_list.stdout_lines }}'
|
|
|
|
- name: Verify web service is running
  # Captures the CurrentState of the first task of <stack_name>_web whose
  # desired state is "running". The Go template braces are wrapped in a
  # Jinja2 raw section so Ansible leaves them alone (backtick quoting is not
  # valid Jinja2).
  # The pipeline's exit status is that of head, so a failing docker command
  # shows up as empty stdout rather than as a failed task.
  shell: |
    docker service ps {{ stack_name }}_web \
      --filter "desired-state=running" \
      --format "{% raw %}{{.CurrentState}}{% endraw %}" | head -1
  register: web_state
  # Read-only query: never report a change.
  changed_when: false
|
|
|
|
- name: Fail if web service not running
  # Abort unless the captured state text contains "Running"; an empty capture
  # also lands here.
  when: not ('Running' in web_state.stdout)
  fail:
    msg: "Web service is not in Running state: {{ web_state.stdout }}"
|
|
|
|
- name: Verify worker service is running
  # Captures the CurrentState of the first task of <stack_name>_queue-worker
  # whose desired state is "running". The Go template braces are wrapped in a
  # Jinja2 raw section so Ansible leaves them alone (backtick quoting is not
  # valid Jinja2).
  # The pipeline's exit status is that of head, so a failing docker command
  # shows up as empty stdout rather than as a failed task.
  shell: |
    docker service ps {{ stack_name }}_queue-worker \
      --filter "desired-state=running" \
      --format "{% raw %}{{.CurrentState}}{% endraw %}" | head -1
  register: worker_state
  # Read-only query: never report a change.
  changed_when: false
|
|
|
|
- name: Fail if worker service not running
  # Abort unless the captured state text contains "Running"; an empty capture
  # also lands here.
  when: not ('Running' in worker_state.stdout)
  fail:
    msg: "Worker service is not in Running state: {{ worker_state.stdout }}"
|
|
|
|
- name: Wait for application to be ready
  # Polls <app_url>/health until it answers 200 or 302, retrying up to
  # max_retries times with 3 seconds between attempts.
  # NOTE(review): certificate verification is disabled for this HTTPS
  # request - confirm that is intended for the production endpoint.
  uri:
    url: "{{ app_url }}/health"
    validate_certs: false
    status_code:
      - 200
      - 302
    timeout: "{{ health_timeout }}"
  register: health_response
  until: health_response.status in [200, 302]
  retries: "{{ max_retries }}"
  delay: 3
|
|
|
|
- name: Check database connectivity
  # Single request to <app_url>/health/database; only HTTP 200 counts as
  # success. Failures are ignored so the play continues and the result can
  # still be read from db_health afterwards.
  # NOTE(review): certificate verification is disabled - confirm intended.
  uri:
    url: "{{ app_url }}/health/database"
    timeout: "{{ health_timeout }}"
    status_code: 200
    validate_certs: false
  ignore_errors: true
  register: db_health
|
|
|
|
- name: Check Redis connectivity
  # Single request to <app_url>/health/redis; only HTTP 200 counts as
  # success. Failures are ignored so the play continues and the result can
  # still be read from redis_health afterwards.
  # NOTE(review): certificate verification is disabled - confirm intended.
  uri:
    url: "{{ app_url }}/health/redis"
    timeout: "{{ health_timeout }}"
    status_code: 200
    validate_certs: false
  ignore_errors: true
  register: redis_health
|
|
|
|
- name: Check queue system
  # Single request to <app_url>/health/queue; only HTTP 200 counts as
  # success. Failures are ignored so the play continues and the result can
  # still be read from queue_health afterwards.
  # NOTE(review): certificate verification is disabled - confirm intended.
  uri:
    url: "{{ app_url }}/health/queue"
    timeout: "{{ health_timeout }}"
    status_code: 200
    validate_certs: false
  ignore_errors: true
  register: queue_health
|
|
|
|
- name: Get service replicas count
  # Captures the Replicas column for services whose name matches
  # <stack_name>_web. The Go template braces are wrapped in a Jinja2 raw
  # section so Ansible leaves them alone (backtick quoting is not valid
  # Jinja2).
  shell: |
    docker service ls --filter "name={{ stack_name }}_web" --format "{% raw %}{{.Replicas}}{% endraw %}"
  register: replicas
  # Read-only query: never report a change.
  changed_when: false
|
|
|
|
- name: Check for service errors
  # Counts the lines of the web service task listing that contain "Error"
  # (case-sensitive). grep -c exits 1 when the count is 0, so "|| true"
  # keeps the task from failing in that case.
  shell: |
    docker service ps {{ stack_name }}_web --filter "desired-state=running" | grep -c Error || true
  changed_when: false
  register: error_count
|
|
|
|
- name: Warn if errors detected
  # error_count holds the number of "Error" lines found in the
  # `docker service ps` task listing (not in container logs), so the message
  # names that source.
  debug:
    msg: "⚠️ Warning: {{ error_count.stdout }} errors detected in service task list"
  when: (error_count.stdout | int) > 0
|
|
|
|
- name: Display health check summary
  # One consolidated report built from the values registered by the earlier
  # tasks. Endpoint results fall back to 'SKIPPED' when no status was
  # recorded for them.
  debug:
    msg: |
      ✅ Health Check Summary:

      Services:
      - Web Service: {{ web_state.stdout }}
      - Worker Service: {{ worker_state.stdout }}
      - Replicas: {{ replicas.stdout }}

      Endpoints:
      - Application: {{ health_response.status }}
      - Database: {{ db_health.status | default('SKIPPED') }}
      - Redis: {{ redis_health.status | default('SKIPPED') }}
      - Queue: {{ queue_health.status | default('SKIPPED') }}

      Errors: {{ error_count.stdout }}
|
|
|
|
- name: Overall health assessment
  # The message claims that *all* checks passed, so every registered result
  # is consulted: the application probe, the database/Redis/queue endpoints
  # (whose failures are ignored when they run) and the error-line count.
  # A missing status falls back to 0 and therefore does not count as passed.
  debug:
    msg: "✅ All health checks PASSED"
  when:
    - health_response.status in [200, 302]
    - (db_health.status | default(0)) == 200
    - (redis_health.status | default(0)) == 200
    - (queue_health.status | default(0)) == 200
    - (error_count.stdout | int) == 0
|
|
|
|
- name: Fail if critical health checks failed
  # Final gate on the application probe result.
  # NOTE(review): the readiness task that registers health_response does not
  # ignore errors, so the play presumably stops there before this condition
  # can become true - confirm whether this gate should also cover the
  # database/Redis/queue results.
  when: not (health_response.status in [200, 302])
  fail:
    msg: "❌ Health check FAILED - manual intervention required"
|