---
# Find Source of Traefik Restarts
#
# Read-only diagnosis playbook that tries to locate whatever keeps restarting
# the Traefik container: it inspects the container, its logs, Docker events,
# cron/at/systemd schedules, deployment scripts and CI workflows, watches
# Docker events live for a while, and prints a summary at the end.
#
# Inputs:
#   stacks_base_path          - must be defined outside this file
#                               (inventory/group_vars); the Traefik stack is
#                               expected at "<stacks_base_path>/traefik".
#   monitor_duration_seconds  - length of the live `docker events` watch.
#
# Every shell task is a pure query (changed_when: false). Most pipelines end
# in head/tail, whose exit status is 0, so "nothing found" shows up as empty
# stdout; the summary therefore keys off empty/non-empty stdout.
- name: Find Source of Traefik Restarts
  hosts: production
  gather_facts: true
  become: true
  vars:
    traefik_stack_path: "{{ stacks_base_path }}/traefik"
    # 2 minutes of live monitoring (can be increased)
    monitor_duration_seconds: 120

  tasks:
    # `{{ '{{' }}` / `{{ '}}' }}` emit literal Go-template braces for docker
    # so that Jinja does not try to evaluate them.
    - name: Check Traefik container restart count
      ansible.builtin.shell: |
        docker inspect traefik --format '{{ '{{' }}.RestartCount{{ '}}' }}' 2>/dev/null || echo "0"
      register: traefik_restart_count
      changed_when: false

    - name: Check Traefik container start time
      ansible.builtin.shell: |
        docker inspect traefik --format '{{ '{{' }}.State.StartedAt{{ '}}' }}' 2>/dev/null || echo "UNKNOWN"
      register: traefik_started_at
      changed_when: false

    - name: Analyze Traefik logs for "Stopping server gracefully" messages
      ansible.builtin.shell: |
        cd {{ traefik_stack_path | quote }}
        docker compose logs traefik 2>&1 | grep -i "stopping server gracefully\|I have to go" | tail -20
      register: traefik_stop_messages
      changed_when: false
      failed_when: false

    - name: Extract timestamps from stop messages
      ansible.builtin.shell: |
        cd {{ traefik_stack_path | quote }}
        docker compose logs traefik 2>&1 | grep -i "stopping server gracefully\|I have to go" | tail -20 | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}' | sort | uniq
      register: stop_timestamps
      changed_when: false
      failed_when: false

    - name: Check Docker daemon logs for Traefik stop events
      ansible.builtin.shell: |
        journalctl -u docker.service --since "24 hours ago" --no-pager | grep -iE "traefik.*stop|traefik.*kill|traefik.*die|container.*traefik.*stopped" | tail -30 || echo "No Traefik stop events in Docker daemon logs"
      register: docker_daemon_logs
      changed_when: false
      failed_when: false

    # `--until` only accepts timestamps or Go durations; "now" is rejected and
    # the error would be hidden by 2>/dev/null. "0s" means "up to this moment".
    - name: Check Docker events for Traefik (last 24 hours)
      ansible.builtin.shell: |
        docker events --since 24h --until 0s --filter container=traefik --filter event=die --format "{{ '{{' }}.Time{{ '}}' }} {{ '{{' }}.Action{{ '}}' }} {{ '{{' }}.Actor.Attributes.name{{ '}}' }}" 2>/dev/null | tail -20 || echo "No Traefik die events found"
      register: docker_events_traefik
      changed_when: false
      failed_when: false

    # Prints a "=== User: <name> ===" header followed by the matching crontab
    # lines, only for users that actually have matches.
    - name: Check all user crontabs for Traefik/Docker commands
      ansible.builtin.shell: |
        for user in $(cut -f1 -d: /etc/passwd); do
          matches=$(crontab -u "$user" -l 2>/dev/null | grep -E "traefik|docker.*compose.*traefik|docker.*stop.*traefik|docker.*restart.*traefik|docker.*down.*traefik" || true)
          if [ -n "$matches" ]; then
            echo "=== User: $user ==="
            echo "$matches"
          fi
        done
      register: all_user_crontabs
      changed_when: false

    # Only directories with real matches produce output, so a single empty
    # directory can no longer hide hits found in another one.
    - name: Check system-wide cron directories
      ansible.builtin.shell: |
        for dir in /etc/cron.d /etc/cron.daily /etc/cron.hourly /etc/cron.weekly /etc/cron.monthly; do
          if [ -d "$dir" ]; then
            matches=$(grep -rE "traefik|docker.*compose.*traefik|docker.*stop.*traefik|docker.*restart.*traefik|docker.*down.*traefik" "$dir" 2>/dev/null || true)
            if [ -n "$matches" ]; then
              echo "=== $dir ==="
              echo "$matches"
            fi
          fi
        done
      register: system_cron_dirs
      changed_when: false

    # Timers and services are reported independently; a section header is
    # printed only when that section has matches.
    - name: Check systemd timers and services
      ansible.builtin.shell: |
        timers=$(systemctl list-timers --all --no-pager | grep -E "traefik|docker.*compose" || true)
        services=$(systemctl list-units --type=service --all --no-pager | grep -E "traefik|docker.*compose" || true)
        if [ -n "$timers" ]; then
          echo "=== Active Timers ==="
          echo "$timers"
        fi
        if [ -n "$services" ]; then
          echo "=== Custom Services ==="
          echo "$services"
        fi
      register: systemd_services
      changed_when: false

    - name: Check for scripts in deployment directory that restart Traefik
      ansible.builtin.shell: |
        find /home/deploy/deployment -type f \( -name "*.sh" -o -name "*.yml" -o -name "*.yaml" \) -exec grep -lE "traefik.*restart|docker.*compose.*traefik.*restart|docker.*compose.*traefik.*down|docker.*compose.*traefik.*stop" {} \; 2>/dev/null | head -30
      register: deployment_scripts
      changed_when: false

    - name: Check Ansible roles for traefik_auto_restart or restart tasks
      ansible.builtin.shell: |
        grep -rE "traefik_auto_restart|traefik.*restart|docker.*compose.*traefik.*restart" /home/deploy/deployment/ansible/roles/ 2>/dev/null | grep -v ".git" | head -20 || echo "No auto-restart settings found"
      register: ansible_auto_restart
      changed_when: false

    # This only reports whether the compose service is up ("running" /
    # "not_running"); it does not detect watch mode. The register name is
    # kept unchanged for compatibility.
    - name: Check whether the Traefik compose service is running
      ansible.builtin.shell: |
        cd {{ traefik_stack_path | quote }}
        docker compose ps traefik 2>/dev/null | grep -q "traefik" && echo "running" || echo "not_running"
      register: docker_compose_watch
      changed_when: false
      failed_when: false

    - name: Check if Docker Compose is running in watch mode
      ansible.builtin.shell: |
        ps aux | grep -E "docker.*compose.*watch|docker.*compose.*--watch" | grep -v grep || echo "No Docker Compose watch mode detected"
      register: watch_mode_process
      changed_when: false

    # The name tests are grouped so that `-type f` applies to all of them;
    # `-exec ... {} +` copes with empty results and unusual file names.
    - name: Check for monitoring/watchdog scripts
      ansible.builtin.shell: |
        find /home/deploy -type f \( -name "*monitor*" -o -name "*watchdog*" -o -name "*health*" \) -exec grep -lE "traefik|docker.*compose.*traefik" {} + 2>/dev/null | head -10
      register: monitoring_scripts
      changed_when: false

    - name: Check Gitea Workflows for Traefik restarts
      ansible.builtin.shell: |
        find /home/deploy -type f \( -path "*/.gitea/workflows/*.yml" -o -path "*/.github/workflows/*.yml" \) -exec grep -lE "traefik.*restart|docker.*compose.*traefik.*restart" {} + 2>/dev/null | head -10
      register: gitea_workflows
      changed_when: false

    # Started asynchronously (poll: 0) and collected by the next task.
    # `timeout` exits non-zero when the window elapses, hence `|| true`.
    - name: "Monitor Docker events in real-time ({{ monitor_duration_seconds }} seconds)"
      ansible.builtin.shell: |
        timeout {{ monitor_duration_seconds | int }} docker events --filter container=traefik --format "{{ '{{' }}.Time{{ '}}' }} {{ '{{' }}.Action{{ '}}' }} {{ '{{' }}.Actor.Attributes.name{{ '}}' }}" 2>&1 || true
      register: docker_events_realtime
      changed_when: false
      failed_when: false
      async: "{{ monitor_duration_seconds | int + 10 }}"
      poll: 0

    # Polls every 10 seconds; the retry budget covers the monitoring window
    # plus a small safety margin.
    - name: Wait for monitoring to complete
      ansible.builtin.async_status:
        jid: "{{ docker_events_realtime.ansible_job_id }}"
      register: monitoring_result
      until: monitoring_result.finished
      retries: "{{ (monitor_duration_seconds | int / 10) | int + 5 }}"
      delay: 10
      failed_when: false

    # `last` always appends a "wtmp begins ..." trailer; keep only the actual
    # reboot records so that an empty result really means "no reboots".
    - name: Check system reboot history
      ansible.builtin.shell: |
        last reboot --since "24 hours ago" 2>/dev/null | grep -E "^reboot" | head -10
      register: reboot_history
      changed_when: false
      failed_when: false

    - name: Check for at jobs
      ansible.builtin.shell: |
        atq 2>/dev/null | while read -r line; do
          job_id=$(echo "$line" | awk '{print $1}')
          matches=$(at -c "$job_id" 2>/dev/null | grep -E "traefik|docker.*compose.*traefik" || true)
          if [ -n "$matches" ]; then
            echo "=== Job ID: $job_id ==="
            echo "$matches"
          fi
        done
      register: at_jobs
      changed_when: false

    - name: Check Docker daemon configuration for auto-restart
      ansible.builtin.shell: |
        cat /etc/docker/daemon.json 2>/dev/null | grep -iE "restart|live-restore" || echo "No restart settings in daemon.json"
      register: docker_daemon_config
      changed_when: false
      failed_when: false

    - name: Check if Traefik has restart policy
      ansible.builtin.shell: |
        cd {{ traefik_stack_path | quote }}
        docker compose config | grep -A 5 "traefik:" | grep -E "restart|restart_policy" || echo "No explicit restart policy found"
      register: traefik_restart_policy
      changed_when: false
      failed_when: false

    # The `found_*` flags are computed once and reused by the source list,
    # the next-steps list and the final "nothing found" fallback.
    - name: Summary
      ansible.builtin.debug:
        msg: |
          {% set found_user_cron = all_user_crontabs.stdout | trim | length > 0 %}
          {% set found_system_cron = system_cron_dirs.stdout | trim | length > 0 %}
          {% set found_systemd = systemd_services.stdout | trim | length > 0 %}
          {% set found_deploy_scripts = deployment_scripts.stdout | trim | length > 0 %}
          {% set found_ansible = ansible_auto_restart.stdout | trim | length > 0 and 'No auto-restart' not in ansible_auto_restart.stdout %}
          {% set found_workflows = gitea_workflows.stdout | trim | length > 0 %}
          {% set found_monitoring = monitoring_scripts.stdout | trim | length > 0 %}
          {% set found_at = at_jobs.stdout | trim | length > 0 %}
          {% set found_watch = watch_mode_process.stdout | trim | length > 0 and 'No Docker Compose watch' not in watch_mode_process.stdout %}
          {% set found_reboots = reboot_history.stdout | trim | length > 0 %}
          {% set found_any = found_user_cron or found_system_cron or found_systemd or found_deploy_scripts or found_ansible or found_workflows or found_monitoring or found_at or found_watch or found_reboots %}
          ================================================================================
          TRAEFIK RESTART SOURCE DIAGNOSE - ZUSAMMENFASSUNG:
          ================================================================================

          Traefik Status:
          - Restart Count: {{ traefik_restart_count.stdout }}
          - Started At: {{ traefik_started_at.stdout }}
          - Compose Service: {{ docker_compose_watch.stdout }}
          - Restart Policy: {{ traefik_restart_policy.stdout | trim }}
          - Docker daemon.json: {{ docker_daemon_config.stdout | trim }}
          - Stop Messages gefunden: {{ traefik_stop_messages.stdout_lines | length }} (letzte 20)

          Stop-Zeitstempel (letzte 20):
          {% if stop_timestamps.stdout %}
          {{ stop_timestamps.stdout }}
          {% else %}
          Keine Stop-Zeitstempel gefunden
          {% endif %}

          Docker Events (letzte 24h):
          {% if docker_events_traefik.stdout and 'No Traefik die events' not in docker_events_traefik.stdout %}
          {{ docker_events_traefik.stdout }}
          {% else %}
          Keine Traefik die-Events in den letzten 24 Stunden
          {% endif %}

          Docker Daemon Logs:
          {% if docker_daemon_logs.stdout and 'No Traefik stop events' not in docker_daemon_logs.stdout %}
          {{ docker_daemon_logs.stdout }}
          {% else %}
          Keine Traefik-Stop-Events in Docker-Daemon-Logs
          {% endif %}

          Gefundene Quellen:
          {% if found_user_cron %}
          1. ❌ CRONJOBS (User):
          {{ all_user_crontabs.stdout }}
          {% endif %}
          {% if found_system_cron %}
          2. ❌ SYSTEM CRON:
          {{ system_cron_dirs.stdout }}
          {% endif %}
          {% if found_systemd %}
          3. ❌ SYSTEMD TIMERS/SERVICES:
          {{ systemd_services.stdout }}
          {% endif %}
          {% if found_deploy_scripts %}
          4. ⚠️ DEPLOYMENT SCRIPTS:
          {{ deployment_scripts.stdout }}
          {% endif %}
          {% if found_ansible %}
          5. ⚠️ ANSIBLE AUTO-RESTART:
          {{ ansible_auto_restart.stdout }}
          {% endif %}
          {% if found_workflows %}
          6. ⚠️ GITEA WORKFLOWS:
          {{ gitea_workflows.stdout }}
          {% endif %}
          {% if found_monitoring %}
          7. ⚠️ MONITORING SCRIPTS:
          {{ monitoring_scripts.stdout }}
          {% endif %}
          {% if found_at %}
          8. ❌ AT JOBS:
          {{ at_jobs.stdout }}
          {% endif %}
          {% if found_watch %}
          9. ❌ DOCKER COMPOSE WATCH MODE (PROZESS):
          {{ watch_mode_process.stdout }}
          {% endif %}
          {% if found_reboots %}
          10. ⚠️ SYSTEM REBOOTS:
          {{ reboot_history.stdout }}
          {% endif %}

          Real-Time Monitoring ({{ monitor_duration_seconds }} Sekunden):
          {% if monitoring_result.finished | default(false) and monitoring_result.ansible_job_id | default('') %}
          {{ monitoring_result.stdout | default('', true) | trim | default('Keine Events während Monitoring', true) }}
          {% else %}
          Monitoring läuft noch oder wurde unterbrochen
          {% endif %}

          ================================================================================
          NÄCHSTE SCHRITTE:
          ================================================================================

          {% if found_user_cron %}
          1. ❌ CRONJOBS DEAKTIVIEREN:
          - Prüfe gefundene Cronjobs: {{ all_user_crontabs.stdout }}
          - Entferne oder kommentiere die entsprechenden Einträge
          {% endif %}
          {% if found_system_cron %}
          2. ❌ SYSTEM CRON DEAKTIVIEREN:
          - Prüfe gefundene System-Cronjobs: {{ system_cron_dirs.stdout }}
          - Entferne oder benenne die Dateien um
          {% endif %}
          {% if found_systemd %}
          3. ❌ SYSTEMD TIMERS/SERVICES DEAKTIVIEREN:
          - Prüfe gefundene Services/Timers: {{ systemd_services.stdout }}
          - Deaktiviere mit: systemctl disable
          {% endif %}
          {% if found_deploy_scripts %}
          4. ⚠️ DEPLOYMENT SCRIPTS PRÜFEN:
          - Prüfe gefundene Scripts: {{ deployment_scripts.stdout }}
          - Entferne oder kommentiere Traefik-Restart-Befehle
          {% endif %}
          {% if found_ansible %}
          5. ⚠️ ANSIBLE AUTO-RESTART PRÜFEN:
          - Prüfe gefundene Einstellungen: {{ ansible_auto_restart.stdout }}
          - Setze traefik_auto_restart: false in group_vars
          {% endif %}
          {% if not found_any %}
          ⚠️ KEINE AUTOMATISCHEN RESTART-MECHANISMEN GEFUNDEN!

          Mögliche Ursachen:
          1. Externer Prozess (nicht über Cron/Systemd)
          2. Docker-Service-Restarts (systemctl restart docker)
          3. Host-Reboots
          4. Manuelle Restarts (von außen)
          5. Monitoring-Service (Portainer, Watchtower, etc.)

          Nächste Schritte:
          1. Führe 'docker events --filter container=traefik' manuell aus und beobachte
          2. Prüfe journalctl -u docker.service für Docker-Service-Restarts
          3. Prüfe ob Portainer oder andere Monitoring-Tools laufen
          4. Prüfe ob Watchtower oder andere Auto-Update-Tools installiert sind
          {% endif %}

          ================================================================================