---
# Consolidated Traefik Diagnosis Playbook
# Consolidates: diagnose-traefik-restarts.yml, find-traefik-restart-source.yml,
#               monitor-traefik-restarts.yml, monitor-traefik-continuously.yml,
#               verify-traefik-fix.yml
#
# Usage:
#   # Basic diagnosis (default)
#   ansible-playbook -i inventory/production.yml playbooks/diagnose/traefik.yml
#
#   # Find restart source
#   ansible-playbook -i inventory/production.yml playbooks/diagnose/traefik.yml --tags restart-source
#
#   # Monitor restarts
#   ansible-playbook -i inventory/production.yml playbooks/diagnose/traefik.yml --tags monitor
#
# Tag model:
#   - The basic checks and the summary are tagged `always`, so they run on
#     every invocation (unless `--skip-tags always` is passed).
#   - The optional checks are tagged `never` plus their own tag, so they run
#     only when that tag is requested explicitly.
#
# Optional extra vars:
#   monitor_lookback_hours    Log lookback window for --tags monitor (default: 24)
#   monitor_duration_seconds  Accepted with a default of 120; no task in this
#                             playbook reads it.

- name: Diagnose Traefik Issues
  hosts: production
  gather_facts: true
  become: true

  vars:
    # stacks_base_path is not defined in this playbook; it has to be supplied
    # from outside (inventory / group vars / extra vars).
    traefik_stack_path: "{{ stacks_base_path }}/traefik"
    traefik_container_name: "traefik"
    # The effective values live under traefik_-prefixed names on purpose:
    # a play var defined in terms of itself
    # (`monitor_x: "{{ monitor_x | default(...) }}"`) makes Ansible abort with
    # a recursive-template error whenever no extra var overrides it.
    traefik_monitor_duration_seconds: "{{ monitor_duration_seconds | default(120) }}"
    traefik_monitor_lookback_hours: "{{ monitor_lookback_hours | default(24) }}"

  tasks:
    - name: Display diagnostic plan
      ansible.builtin.debug:
        msg: |
          ================================================================================
          TRAEFIK DIAGNOSIS
          ================================================================================

          Running diagnosis with tags: {{ ansible_run_tags | default(['all']) }}

          Basic checks (always):
          - Container status
          - Restart count
          - Recent logs

          Restart source (--tags restart-source):
          - Find source of restart loops
          - Check cronjobs, systemd, scripts

          Monitor (--tags monitor):
          - Monitor for restarts over time
          ================================================================================
      tags:
        - always

    # ========================================
    # BASIC DIAGNOSIS (always runs)
    # ========================================
    - name: Check Traefik container status
      ansible.builtin.shell: |
        cd {{ traefik_stack_path }}
        docker compose ps {{ traefik_container_name }}
      register: traefik_status
      changed_when: false
      tags:
        - always

    - name: Check Traefik container restart count
      # Falls back to "0" when `docker inspect` fails (e.g. no such container).
      ansible.builtin.shell: |
        docker inspect {{ traefik_container_name }} --format '{{ '{{' }}.RestartCount{{ '}}' }}' 2>/dev/null || echo "0"
      register: traefik_restart_count
      changed_when: false
      tags:
        - always

    - name: Check Traefik container start time
      # Falls back to "UNKNOWN" when `docker inspect` fails.
      ansible.builtin.shell: |
        docker inspect {{ traefik_container_name }} --format '{{ '{{' }}.State.StartedAt{{ '}}' }}' 2>/dev/null || echo "UNKNOWN"
      register: traefik_started_at
      changed_when: false
      tags:
        - always

    - name: Check Traefik logs for recent restarts
      # The trailing `grep .` makes the pipeline exit non-zero when nothing
      # was found; without it the exit status is that of `tail` (always 0)
      # and the `|| echo` fallback could never fire.
      ansible.builtin.shell: |
        cd {{ traefik_stack_path }}
        docker compose logs {{ traefik_container_name }} --since 2h 2>&1 |
          grep -iE "stopping server gracefully|I have to go|restart|shutdown" |
          tail -20 |
          grep . || echo "No restart messages in last 2 hours"
      register: traefik_restart_logs
      changed_when: false
      failed_when: false
      tags:
        - always

    - name: Check Traefik logs for errors
      # Same `grep .` guard as above so the fallback message is reachable.
      ansible.builtin.shell: |
        cd {{ traefik_stack_path }}
        docker compose logs {{ traefik_container_name }} --tail=100 2>&1 |
          grep -iE "error|warn|fail" |
          tail -20 |
          grep . || echo "No errors in recent logs"
      register: traefik_error_logs
      changed_when: false
      failed_when: false
      tags:
        - always

    # ========================================
    # RESTART SOURCE DIAGNOSIS (--tags restart-source)
    # ========================================
    - name: Check all user crontabs for Traefik/Docker commands
      # Prints a "=== User: ... ===" header plus the matching lines for every
      # account in /etc/passwd whose crontab mentions Traefik. The `found`
      # flag drives the fallback message.
      ansible.builtin.shell: |
        found=0
        for user in $(cut -f1 -d: /etc/passwd); do
          matches=$(crontab -u "$user" -l 2>/dev/null | grep -E "traefik|docker.*compose.*traefik|docker.*stop.*traefik|docker.*restart.*traefik|docker.*down.*traefik") || continue
          echo "=== User: $user ==="
          echo "$matches"
          found=1
        done
        [ "$found" -eq 1 ] || echo "No user crontabs with Traefik commands found"
      register: all_user_crontabs
      changed_when: false
      tags:
        - never
        - restart-source

    - name: Check system-wide cron directories
      ansible.builtin.shell: |
        for dir in /etc/cron.d /etc/cron.daily /etc/cron.hourly /etc/cron.weekly /etc/cron.monthly; do
          if [ -d "$dir" ]; then
            echo "=== $dir ==="
            grep -rE "traefik|docker.*compose.*traefik|docker.*stop.*traefik|docker.*restart.*traefik|docker.*down.*traefik" "$dir" 2>/dev/null || echo "No matches"
          fi
        done
      register: system_cron_dirs
      changed_when: false
      tags:
        - never
        - restart-source

    - name: Check systemd timers and services
      ansible.builtin.shell: |
        echo "=== Active Timers ==="
        systemctl list-timers --all --no-pager | grep -E "traefik|docker.*compose" || echo "No Traefik-related timers"
        echo ""
        echo "=== Custom Services ==="
        systemctl list-units --type=service --all --no-pager | grep -E "traefik|docker.*compose" || echo "No Traefik-related services"
      register: systemd_services
      changed_when: false
      tags:
        - never
        - restart-source

    - name: Check for scripts in deployment directory that restart Traefik
      # Lists (at most 30) shell/YAML files whose content matches a Traefik
      # restart/down/stop pattern. Empty output means nothing matched.
      ansible.builtin.shell: |
        find /home/deploy/deployment -type f \( -name "*.sh" -o -name "*.yml" -o -name "*.yaml" \) -exec grep -lE "traefik.*restart|docker.*compose.*traefik.*restart|docker.*compose.*traefik.*down|docker.*compose.*traefik.*stop" {} \; 2>/dev/null | head -30
      register: deployment_scripts
      changed_when: false
      tags:
        - never
        - restart-source

    - name: Check Ansible roles for traefik_auto_restart or restart tasks
      # `grep .` at the end keeps the fallback message reachable (see the
      # log checks above for the reasoning).
      ansible.builtin.shell: |
        grep -rE "traefik_auto_restart|traefik.*restart|docker.*compose.*traefik.*restart" /home/deploy/deployment/ansible/roles/ 2>/dev/null |
          grep -v ".git" |
          head -20 |
          grep . || echo "No auto-restart settings found"
      register: ansible_auto_restart
      changed_when: false
      tags:
        - never
        - restart-source

    - name: Check Docker events for Traefik (last 24 hours)
      # `docker events` is wrapped in `timeout 5` so the task cannot hang on
      # the event stream.
      ansible.builtin.shell: |
        timeout 5 docker events --since 24h --filter container={{ traefik_container_name }} --filter event=die --format "{{ '{{' }}.Time{{ '}}' }} {{ '{{' }}.Action{{ '}}' }}" 2>/dev/null |
          tail -20 |
          grep . || echo "No Traefik die events found"
      register: docker_events_traefik
      changed_when: false
      failed_when: false
      tags:
        - never
        - restart-source

    # ========================================
    # MONITOR (--tags monitor)
    # ========================================
    - name: Check Traefik logs for stop messages (lookback period)
      ansible.builtin.shell: |
        cd {{ traefik_stack_path }}
        docker compose logs {{ traefik_container_name }} --since {{ traefik_monitor_lookback_hours }}h 2>&1 |
          grep -E "I have to go|Stopping server gracefully" |
          tail -20 |
          grep . || echo "No stop messages found"
      register: traefik_stop_messages
      changed_when: false
      tags:
        - never
        - monitor

    - name: Count stop messages
      # The fallback text "No stop messages found" matches neither pattern,
      # so it counts as zero.
      ansible.builtin.set_fact:
        stop_count: "{{ traefik_stop_messages.stdout | regex_findall('I have to go|Stopping server gracefully') | length }}"
      tags:
        - never
        - monitor

    - name: Check system reboot history
      ansible.builtin.shell: |
        last reboot | head -5 || echo "No reboots found"
      register: reboots
      changed_when: false
      tags:
        - never
        - monitor

    # ========================================
    # SUMMARY
    # ========================================
    - name: Summary
      # Optional sections are shown whenever their registered result exists,
      # i.e. whenever the corresponding tagged tasks actually ran.
      # `default(..., true)` is used so that empty command output is replaced
      # by the placeholder as well (plain `default` only covers undefined).
      ansible.builtin.debug:
        msg: |
          ================================================================================
          TRAEFIK DIAGNOSIS SUMMARY
          ================================================================================

          Container Status:
          - Status: {{ traefik_status.stdout | regex_search('Up|Down|Restarting|Exited|Created|Paused|Dead') | default('UNKNOWN', true) }}
          - Restart Count: {{ traefik_restart_count.stdout }}
          - Started At: {{ traefik_started_at.stdout }}

          Recent Logs:
          - Restart Messages (last 2h): {{ traefik_restart_logs.stdout | default('None', true) }}
          - Errors (last 100 lines): {{ traefik_error_logs.stdout | default('None', true) }}

          {% if all_user_crontabs is defined %}
          Restart Source Analysis:
          - User Crontabs: {{ all_user_crontabs.stdout | default('None found', true) }}
          - System Cron: {{ system_cron_dirs.stdout | default('None found', true) }}
          - Systemd Services/Timers: {{ systemd_services.stdout | default('None found', true) }}
          - Deployment Scripts: {{ deployment_scripts.stdout | default('None found', true) }}
          - Ansible Auto-Restart: {{ ansible_auto_restart.stdout | default('None found', true) }}
          - Docker Events: {{ docker_events_traefik.stdout | default('None found', true) }}
          {% endif %}

          {% if traefik_stop_messages is defined %}
          Monitoring (last {{ traefik_monitor_lookback_hours }} hours):
          - Stop Messages: {{ stop_count | default(0) }}
          - System Reboots: {{ reboots.stdout | default('None', true) }}
          {% endif %}

          ================================================================================
          RECOMMENDATIONS
          ================================================================================

          {% if 'stopping server gracefully' in traefik_restart_logs.stdout | lower or 'I have to go' in traefik_restart_logs.stdout %}
          ❌ PROBLEM: Traefik is being stopped regularly!
          → Run with --tags restart-source to find the source
          {% endif %}
          {% if (traefik_restart_count.stdout | int) > 5 %}
          ⚠️ WARNING: High restart count ({{ traefik_restart_count.stdout }})
          → Check restart source: ansible-playbook -i inventory/production.yml playbooks/diagnose/traefik.yml --tags restart-source
          {% endif %}

          ================================================================================
      tags:
        - always