---
# Diagnosis: find the cause of the Traefik restart loop.
#
# Read-only playbook: every task only collects information (changed_when is
# always false) and prints it with a debug task. It walks through the usual
# suspects for recurring Traefik restarts: systemd timers, cron/at jobs,
# scripts and CI workflows under /home/deploy, Docker events and daemon logs,
# reboots / unattended-upgrades, Compose settings and port conflicts.
#
# Conventions used in the shell snippets below:
#   * Go-template braces for `docker --format` are written as
#     {{ '{{' }} ... {{ '}}' }} so Jinja emits literal "{{" / "}}".
#   * Pipelines that should print a fallback message on empty output end in
#     `| grep . || echo "..."`. `grep .` passes non-empty lines through and
#     exits non-zero when there are none; without it the exit status of
#     head/tail (always 0) would make the fallback unreachable.
- name: Diagnose Traefik Restart Loop
  hosts: production
  gather_facts: true
  become: true

  tasks:
    # ------------------------------------------------------------------
    # Schedulers: systemd timers and cron
    # ------------------------------------------------------------------
    # No shell features needed here, so use the command module.
    - name: Check systemd timers
      ansible.builtin.command:
        cmd: systemctl list-timers --all --no-pager
      register: systemd_timers
      changed_when: false

    - name: Display systemd timers
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Systemd Timers (können Container stoppen):
          ================================================================================
          {{ systemd_timers.stdout }}
          ================================================================================

    - name: Check root crontab
      ansible.builtin.shell: |
        crontab -l 2>/dev/null || echo "No root crontab"
      register: root_crontab
      changed_when: false

    - name: Display root crontab
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Root Crontab:
          ================================================================================
          {{ root_crontab.stdout }}
          ================================================================================

    - name: Check deploy user crontab
      ansible.builtin.shell: |
        crontab -l -u deploy 2>/dev/null || echo "No deploy user crontab"
      register: deploy_crontab
      changed_when: false

    - name: Display deploy user crontab
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Deploy User Crontab:
          ================================================================================
          {{ deploy_crontab.stdout }}
          ================================================================================

    # Same listing + grep for each cron directory; a blank line separates
    # the sections (the trailing one is stripped from stdout by Ansible).
    - name: Check system-wide cron jobs
      ansible.builtin.shell: |
        for dir in /etc/cron.d /etc/cron.daily /etc/cron.hourly /etc/cron.weekly /etc/cron.monthly; do
          echo "=== $dir ==="
          ls -la "$dir" 2>/dev/null || echo "Directory not found"
          grep -r "traefik\|docker.*compose.*traefik\|docker.*stop\|docker.*restart" "$dir" 2>/dev/null || echo "No matches"
          echo ""
        done
      register: system_cron
      changed_when: false

    - name: Display system cron jobs
      ansible.builtin.debug:
        msg: |
          ================================================================================
          System-Wide Cron Jobs:
          ================================================================================
          {{ system_cron.stdout }}
          ================================================================================

    # ------------------------------------------------------------------
    # Scripts under /home/deploy
    # ------------------------------------------------------------------
    - name: Check for scripts that might restart Traefik
      ansible.builtin.shell: |
        find /home/deploy -type f -name "*.sh" \
          -exec grep -l "traefik\|docker.*compose.*restart\|docker.*stop.*traefik\|docker.*down.*traefik" {} \; \
          2>/dev/null | head -20
      register: traefik_scripts
      changed_when: false

    - name: Display scripts that might restart Traefik
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Scripts die Traefik stoppen/restarten könnten:
          ================================================================================
          {% if traefik_scripts.stdout %}
          {{ traefik_scripts.stdout }}
          {% else %}
          Keine Skripte gefunden
          {% endif %}
          ================================================================================

    # ------------------------------------------------------------------
    # Docker: events, container state, daemon logs
    # ------------------------------------------------------------------
    # --until bounds the query so `docker events` exits after replaying the
    # history instead of streaming until `timeout` kills it; the timeout is
    # kept as a safety net in case the daemon hangs.
    - name: Check Docker events for Traefik container (last 24h)
      ansible.builtin.shell: |
        timeout 5 docker events --since 24h --until "$(date +%s)" \
          --filter container=traefik \
          --format "{{ '{{' }}.Time{{ '}}' }} {{ '{{' }}.Action{{ '}}' }} {{ '{{' }}.Actor.Attributes.name{{ '}}' }}" \
          2>/dev/null | tail -50 | grep . \
          || echo "No recent events or docker events not available"
      register: docker_events
      changed_when: false

    - name: Display Docker events
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Docker Events für Traefik (letzte 24h):
          ================================================================================
          {{ docker_events.stdout }}
          ================================================================================

    - name: Check Traefik container exit history
      ansible.builtin.shell: |
        docker ps -a --filter "name=traefik" \
          --format "{{ '{{' }}.ID{{ '}}' }} | {{ '{{' }}.Status{{ '}}' }} | {{ '{{' }}.CreatedAt{{ '}}' }}" \
          | head -10
      register: traefik_exits
      changed_when: false

    - name: Display Traefik container exit history
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Traefik Container Exit-Historie:
          ================================================================================
          {{ traefik_exits.stdout }}
          ================================================================================

    - name: Check Docker daemon logs for Traefik stops
      ansible.builtin.shell: |
        journalctl -u docker.service --since "24h ago" --no-pager \
          | grep -i "traefik\|stop\|kill" | tail -50 | grep . \
          || echo "No relevant logs in journalctl"
      register: docker_daemon_logs
      changed_when: false

    - name: Display Docker daemon logs
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Docker Daemon Logs (Traefik/Stop/Kill):
          ================================================================================
          {{ docker_daemon_logs.stdout }}
          ================================================================================

    # ------------------------------------------------------------------
    # Health-check / monitoring processes and scripts
    # ------------------------------------------------------------------
    - name: Check if there's a health check script running
      ansible.builtin.shell: |
        ps aux | grep -E "traefik|health.*check|monitor.*docker|auto.*heal|watchdog" | grep -v grep \
          || echo "No health check processes found"
      register: health_check_processes
      changed_when: false

    - name: Display health check processes
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Laufende Health-Check/Monitoring-Prozesse:
          ================================================================================
          {{ health_check_processes.stdout }}
          ================================================================================

    - name: Check for monitoring/auto-heal scripts
      ansible.builtin.shell: |
        find /home/deploy -type f \
          \( -name "*monitor*" -o -name "*health*" -o -name "*auto*heal*" -o -name "*watchdog*" \) \
          2>/dev/null | head -20
      register: monitoring_scripts
      changed_when: false

    - name: Display monitoring scripts
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Monitoring/Auto-Heal-Skripte:
          ================================================================================
          {% if monitoring_scripts.stdout %}
          {{ monitoring_scripts.stdout }}
          {% else %}
          Keine Monitoring-Skripte gefunden
          {% endif %}
          ================================================================================

    # ------------------------------------------------------------------
    # Compose restart policy, systemd unit, container logs and state
    # ------------------------------------------------------------------
    - name: Check Docker Compose file for restart policies
      ansible.builtin.shell: |
        cd /home/deploy/deployment/stacks/traefik && grep -A 5 "restart:" docker-compose.yml \
          || echo "No restart policy found"
      register: restart_policy
      changed_when: false

    - name: Display restart policy
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Docker Compose Restart Policy:
          ================================================================================
          {{ restart_policy.stdout }}
          ================================================================================

    - name: Check if Traefik is managed by systemd
      ansible.builtin.shell: |
        systemctl list-units --type=service --all | grep -i traefik \
          || echo "No Traefik systemd service found"
      register: traefik_systemd
      changed_when: false

    - name: Display Traefik systemd service
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Traefik Systemd Service:
          ================================================================================
          {{ traefik_systemd.stdout }}
          ================================================================================

    - name: Check recent Traefik container logs for stop messages
      ansible.builtin.shell: |
        cd /home/deploy/deployment/stacks/traefik && docker compose logs traefik --since 24h 2>&1 \
          | grep -E "I have to go|Stopping server gracefully|SIGTERM|SIGINT|received signal" \
          | tail -20 | grep . \
          || echo "No stop messages in logs"
      register: traefik_stop_logs
      changed_when: false

    - name: Display Traefik stop messages
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Traefik Stop-Meldungen (letzte 24h):
          ================================================================================
          {{ traefik_stop_logs.stdout }}
          ================================================================================

    - name: Check Traefik container uptime and restart count
      ansible.builtin.shell: |
        docker inspect traefik \
          --format '{{ '{{' }}.State.StartedAt{{ '}}' }} | {{ '{{' }}.State.FinishedAt{{ '}}' }} | Restarts: {{ '{{' }}.RestartCount{{ '}}' }}' \
          2>/dev/null || echo "Container not found"
      register: traefik_uptime
      changed_when: false

    - name: Display Traefik uptime and restart count
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Traefik Container Uptime & Restart Count:
          ================================================================================
          {{ traefik_uptime.stdout }}
          ================================================================================

    # ------------------------------------------------------------------
    # Host-level causes: package upgrades and reboots
    # ------------------------------------------------------------------
    - name: Check for unattended-upgrades activity
      ansible.builtin.shell: |
        journalctl -u unattended-upgrades --since "24h ago" --no-pager | tail -20 | grep . \
          || echo "No unattended-upgrades logs"
      register: unattended_upgrades
      changed_when: false

    - name: Display unattended-upgrades activity
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Unattended-Upgrades Aktivität (kann zu Reboots führen):
          ================================================================================
          {{ unattended_upgrades.stdout }}
          ================================================================================

    - name: Check system reboot history
      ansible.builtin.shell: |
        last reboot | head -10 | grep . || echo "No reboot history available"
      register: reboot_history
      changed_when: false

    - name: Display reboot history
      ansible.builtin.debug:
        msg: |
          ================================================================================
          System Reboot-Historie:
          ================================================================================
          {{ reboot_history.stdout }}
          ================================================================================

    - name: Check Docker Compose processes that might affect Traefik
      ansible.builtin.shell: |
        ps aux | grep -E "docker.*compose.*traefik|docker-compose.*traefik" | grep -v grep \
          || echo "No docker compose processes for Traefik found"
      register: docker_compose_processes
      changed_when: false

    - name: Display Docker Compose processes
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Docker Compose Prozesse für Traefik:
          ================================================================================
          {{ docker_compose_processes.stdout }}
          ================================================================================

    # ------------------------------------------------------------------
    # Remaining schedulers: every user's crontab, CI workflows, custom
    # systemd units, at jobs
    # ------------------------------------------------------------------
    # Each crontab is read once; a header is printed only for users whose
    # crontab contains a matching line.
    - name: Check all user crontabs (not just root/deploy)
      ansible.builtin.shell: |
        pattern='traefik|docker.*compose.*traefik|docker.*restart.*traefik'
        for user in $(cut -f1 -d: /etc/passwd); do
          matches=$(crontab -u "$user" -l 2>/dev/null | grep -E "$pattern")
          if [ -n "$matches" ]; then
            echo "=== User: $user ==="
            echo "$matches"
          fi
        done | grep . || echo "No user crontabs with Traefik commands found"
      register: all_user_crontabs
      changed_when: false

    - name: Display all user crontabs with Traefik commands
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Alle User-Crontabs mit Traefik-Befehlen:
          ================================================================================
          {{ all_user_crontabs.stdout }}
          ================================================================================

    # The -path alternatives are grouped so that `-type f` applies to both;
    # `-exec ... {} +` avoids xargs problems with empty results and with
    # file names containing whitespace.
    - name: Check for Gitea Workflows that might restart Traefik
      ansible.builtin.shell: |
        find /home/deploy -type f \
          \( -path "*/.gitea/workflows/*.yml" -o -path "*/.github/workflows/*.yml" \) \
          -exec grep -l "traefik\|restart.*traefik\|docker.*compose.*traefik" {} + \
          2>/dev/null | head -10 | grep . \
          || echo "No Gitea/GitHub workflows found that restart Traefik"
      register: gitea_workflows
      changed_when: false

    - name: Display Gitea Workflows that might restart Traefik
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Gitea/GitHub Workflows die Traefik restarten könnten:
          ================================================================================
          {{ gitea_workflows.stdout }}
          ================================================================================

    # Same grouping fix as above: without the parentheses `-type f` would
    # only constrain the "*.service" alternative.
    - name: Check for custom systemd services in /etc/systemd/system/
      ansible.builtin.shell: |
        find /etc/systemd/system -type f \( -name "*.service" -o -name "*.timer" \) \
          -exec grep -l "traefik\|docker.*compose.*traefik\|docker.*restart.*traefik" {} + \
          2>/dev/null | head -10 | grep . \
          || echo "No custom systemd services/timers found for Traefik"
      register: custom_systemd_services
      changed_when: false

    - name: Display custom systemd services
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Custom Systemd Services/Timers für Traefik:
          ================================================================================
          {{ custom_systemd_services.stdout }}
          ================================================================================

    # The first column of `atq` output is the job id; `at -c <id>` prints
    # the job's script, which is searched for Traefik-related commands.
    - name: Check for at jobs (scheduled tasks)
      ansible.builtin.shell: |
        pattern='traefik|docker.*compose.*traefik|docker.*restart.*traefik'
        atq 2>/dev/null | awk '{print $1}' | while read -r job_id; do
          matches=$(at -c "$job_id" 2>/dev/null | grep -E "$pattern")
          if [ -n "$matches" ]; then
            echo "=== Job ID: $job_id ==="
            echo "$matches"
          fi
        done | grep . || echo "No at jobs found or atq not available"
      register: at_jobs
      changed_when: false

    - name: Display at jobs
      ansible.builtin.debug:
        msg: |
          ================================================================================
          At Jobs (geplante Tasks) die Traefik betreffen:
          ================================================================================
          {{ at_jobs.stdout }}
          ================================================================================

    # ------------------------------------------------------------------
    # Docker Compose watch mode
    # ------------------------------------------------------------------
    # `docker compose ps --format json` emits either one JSON array or one
    # object per line depending on the Compose version; the jq filter
    # accepts both shapes.
    - name: Check for Docker Compose watch mode
      ansible.builtin.shell: |
        cd /home/deploy/deployment/stacks/traefik && docker compose ps --format json 2>/dev/null \
          | jq -r 'if type == "array" then .[] else . end | select(.Service=="traefik") | .State' \
          | grep . \
          || echo "Could not check Docker Compose watch mode"
      register: docker_compose_watch
      changed_when: false

    - name: Check if Docker Compose watch is enabled
      ansible.builtin.shell: |
        cd /home/deploy/deployment/stacks/traefik && docker compose config 2>/dev/null \
          | grep -i "watch\|x-develop" \
          || echo "No watch mode configured"
      register: docker_compose_watch_config
      changed_when: false

    - name: Display Docker Compose watch mode
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Docker Compose Watch Mode:
          ================================================================================
          Service State:
          {{ docker_compose_watch.stdout }}
          Watch Config:
          {{ docker_compose_watch_config.stdout }}
          ================================================================================

    # ------------------------------------------------------------------
    # Ansible configuration
    # ------------------------------------------------------------------
    - name: Check Ansible traefik_auto_restart setting
      ansible.builtin.shell: |
        grep -r "traefik_auto_restart" \
          /home/deploy/deployment/ansible/roles/traefik/defaults/ \
          /home/deploy/deployment/ansible/inventory/ \
          2>/dev/null | head -10 | grep . \
          || echo "traefik_auto_restart not found in Ansible config"
      register: ansible_auto_restart
      changed_when: false

    - name: Display Ansible traefik_auto_restart setting
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Ansible traefik_auto_restart Einstellung:
          ================================================================================
          {{ ansible_auto_restart.stdout }}
          ================================================================================

    # ------------------------------------------------------------------
    # Ports and networking
    # ------------------------------------------------------------------
    # netstat is tried first, ss is the fallback when netstat is missing
    # or reports nothing for the port.
    - name: Check Port 80/443 configuration
      ansible.builtin.shell: |
        echo "=== Port 80 ==="
        netstat -tlnp 2>/dev/null | grep ":80 " || ss -tlnp 2>/dev/null | grep ":80 " || echo "Could not check port 80"
        echo ""
        echo "=== Port 443 ==="
        netstat -tlnp 2>/dev/null | grep ":443 " || ss -tlnp 2>/dev/null | grep ":443 " || echo "Could not check port 443"
        echo ""
        echo "=== Docker Port Mappings for Traefik ==="
        docker inspect traefik --format '{{ '{{' }}json .HostConfig.PortBindings{{ '}}' }}' 2>/dev/null \
          | jq '.' | grep . \
          || echo "Could not get Docker port mappings"
      register: port_config
      changed_when: false

    - name: Display Port configuration
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Port-Konfiguration (80/443):
          ================================================================================
          {{ port_config.stdout }}
          ================================================================================

    - name: Check if other services are blocking ports 80/443
      ansible.builtin.shell: |
        echo "=== Services listening on port 80 ==="
        lsof -i :80 2>/dev/null || fuser 80/tcp 2>/dev/null || echo "Could not check port 80"
        echo ""
        echo "=== Services listening on port 443 ==="
        lsof -i :443 2>/dev/null || fuser 443/tcp 2>/dev/null || echo "Could not check port 443"
      register: port_blockers
      changed_when: false

    - name: Display port blockers
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Services die Ports 80/443 blockieren könnten:
          ================================================================================
          {{ port_blockers.stdout }}
          ================================================================================

    - name: Check Traefik network configuration
      ansible.builtin.shell: |
        docker inspect traefik --format '{{ '{{' }}json .NetworkSettings{{ '}}' }}' 2>/dev/null \
          | jq '.Networks' | grep . \
          || echo "Could not get Traefik network configuration"
      register: traefik_network
      changed_when: false

    - name: Display Traefik network configuration
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Traefik Netzwerk-Konfiguration:
          ================================================================================
          {{ traefik_network.stdout }}
          ================================================================================

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------
    - name: Summary - Most likely causes
      ansible.builtin.debug:
        msg: |
          ================================================================================
          ZUSAMMENFASSUNG - Mögliche Ursachen für Traefik-Restarts:
          ================================================================================

          Prüfe die obigen Ausgaben auf:

          1. Systemd-Timer: Können Container stoppen (z.B. unattended-upgrades)
          2. Cronjobs: Regelmäßige Skripte die Traefik stoppen (alle User-Crontabs geprüft)
          3. Docker-Events: Zeigen wer/was den Container stoppt
          4. Monitoring-Skripte: Auto-Heal-Skripte die bei Fehlern restarten
          5. Unattended-Upgrades: Können zu Reboots führen
          6. Reboot-Historie: System-Reboots stoppen alle Container
          7. Gitea Workflows: Können Traefik via Ansible restarten
          8. Custom Systemd Services: Eigene Services die Traefik verwalten
          9. At Jobs: Geplante Tasks die Traefik stoppen
          10. Docker Compose Watch Mode: Automatische Restarts bei Dateiänderungen
          11. Ansible traefik_auto_restart: Automatische Restarts nach Config-Deployment
          12. Port-Konfiguration: Ports 80/443 müssen auf Traefik zeigen

          Nächste Schritte:
          - Prüfe die Docker-Events für wiederkehrende Muster
          - Prüfe alle User-Crontabs auf regelmäßige Traefik-Befehle
          - Prüfe ob Monitoring-Skripte zu aggressiv sind
          - Prüfe ob unattended-upgrades zu Reboots führt
          - Prüfe ob traefik_auto_restart zu häufigen Restarts führt
          - Verifiziere Port-Konfiguration (80/443)
          ================================================================================