Files
michaelschiemer/deployment/ansible/playbooks/find-traefik-restart-source.yml
Michael Schiemer 36ef2a1e2c
Some checks failed
🚀 Build & Deploy Image / Determine Build Necessity (push) Failing after 10m14s
🚀 Build & Deploy Image / Build Runtime Base Image (push) Has been skipped
🚀 Build & Deploy Image / Build Docker Image (push) Has been skipped
🚀 Build & Deploy Image / Run Tests & Quality Checks (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Staging (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Production (push) Has been skipped
Security Vulnerability Scan / Check for Dependency Changes (push) Failing after 11m25s
Security Vulnerability Scan / Composer Security Audit (push) Has been cancelled
fix: Gitea Traefik routing and connection pool optimization
- Remove middleware reference from Gitea Traefik labels (caused routing issues)
- Optimize Gitea connection pool settings (MAX_IDLE_CONNS=30, authentication_timeout=180s)
- Add explicit service reference in Traefik labels
- Fix intermittent 504 timeouts by improving PostgreSQL connection handling

Fixes Gitea unreachability via git.michaelschiemer.de
2025-11-09 14:46:15 +01:00

329 lines
15 KiB
YAML

---
# Find the source of Traefik restarts.
# Broad diagnosis intended to locate whatever triggers the recurring Traefik restarts.
- name: Find Source of Traefik Restarts
  hosts: production
  become: true
  gather_facts: true
  vars:
    # Length of the live `docker events` watch in seconds: 2 minutes (can be increased).
    monitor_duration_seconds: 120
    # NOTE(review): stacks_base_path is not defined in this play; presumably it
    # comes from inventory/group_vars - confirm.
    traefik_stack_path: "{{ stacks_base_path }}/traefik"
  tasks:
# Read the RestartCount field that `docker inspect` reports for the container
# named "traefik"; prints "0" instead when the inspect call fails.
- name: Check Traefik container restart count
  ansible.builtin.shell:
    cmd: >-
      docker inspect traefik --format '{{ '{{' }}.RestartCount{{ '}}' }}' 2>/dev/null || echo "0"
  changed_when: false
  register: traefik_restart_count
# Read State.StartedAt of the container named "traefik"; prints "UNKNOWN"
# instead when the inspect call fails.
- name: Check Traefik container start time
  ansible.builtin.shell:
    cmd: >-
      docker inspect traefik --format '{{ '{{' }}.State.StartedAt{{ '}}' }}' 2>/dev/null || echo "UNKNOWN"
  changed_when: false
  register: traefik_started_at
# Keep the last 20 compose log lines (stdout and stderr merged) that mention a
# shutdown. Read-only; never marks the play failed or changed.
- name: Analyze Traefik logs for "Stopping server gracefully" messages
  ansible.builtin.shell:
    cmd: |
      cd {{ traefik_stack_path }}
      docker compose logs traefik 2>&1 |
        grep -i "stopping server gracefully\|I have to go" |
        tail -20
  failed_when: false
  changed_when: false
  register: traefik_stop_messages
# Same log filter as the stop-message task, reduced to the distinct
# second-resolution ISO timestamps (YYYY-MM-DDTHH:MM:SS) found in those lines.
- name: Extract timestamps from stop messages
  ansible.builtin.shell:
    cmd: |
      cd {{ traefik_stack_path }}
      docker compose logs traefik 2>&1 |
        grep -i "stopping server gracefully\|I have to go" |
        tail -20 |
        grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}' |
        sort |
        uniq
  failed_when: false
  changed_when: false
  register: stop_timestamps
# Search the last 24 hours of the docker.service journal for stop/kill/die
# lines that mention Traefik and keep the final 30.
# NOTE(review): the `|| echo` fallback is evaluated against `tail`, the last
# command of the pipeline, not against `grep`.
- name: Check Docker daemon logs for Traefik stop events
  ansible.builtin.shell:
    cmd: |
      journalctl -u docker.service --since "24 hours ago" --no-pager |
        grep -iE "traefik.*stop|traefik.*kill|traefik.*die|container.*traefik.*stopped" |
        tail -30 || echo "No Traefik stop events in Docker daemon logs"
  failed_when: false
  changed_when: false
  register: docker_daemon_logs
# List "die" events of the container named "traefik" from the last 24 hours
# (at most the final 20).
#
# `--until` is what makes `docker events` return instead of streaming. It is
# passed as a Unix timestamp because the CLI documents only Unix timestamps,
# date-formatted timestamps and Go duration strings for --since/--until; the
# former literal `now` is none of these, and with stderr discarded the
# resulting error looked exactly like "no events".
# stderr is no longer discarded: it lands in the registered result's `stderr`
# and does not pollute `stdout`.
# The sentinel line is printed from an explicit emptiness test, because a
# `|| echo` placed after `tail` only reacts to tail's exit status.
- name: Check Docker events for Traefik (last 24 hours)
  ansible.builtin.shell: |
    events=$(
      docker events --since 24h --until "$(date +%s)" \
        --filter container=traefik --filter event=die \
        --format "{{ '{{' }}.Time{{ '}}' }} {{ '{{' }}.Action{{ '}}' }} {{ '{{' }}.Actor.Attributes.name{{ '}}' }}" |
        tail -20
    )
    if [ -n "$events" ]; then
      echo "$events"
    else
      echo "No Traefik die events found"
    fi
  register: docker_events_traefik
  changed_when: false
  failed_when: false
# Walk every account listed in /etc/passwd and print the crontab lines that
# mention Traefik, each group preceded by a "=== User: <name> ===" header.
# Accounts without a crontab or without matching lines produce no output.
- name: Check all user crontabs for Traefik/Docker commands
  ansible.builtin.shell:
    cmd: |
      for user in $(cut -f1 -d: /etc/passwd); do
        hits=$(crontab -u "$user" -l 2>/dev/null | grep -E "traefik|docker.*compose.*traefik|docker.*stop.*traefik|docker.*restart.*traefik|docker.*down.*traefik") || continue
        echo "=== User: $user ==="
        echo "$hits"
      done || echo "No user crontabs with Traefik commands found"
  changed_when: false
  register: all_user_crontabs
# Search the system-wide cron locations for entries that mention Traefik.
#
# Output is produced ONLY for locations that contain a match (header line
# followed by the grep hits); when nothing matches anywhere, stdout is empty.
# The earlier version printed "No matches" per directory, so one clean
# directory was enough for the summary's `'No matches' not in stdout` test to
# hide real hits found in another directory.
# /etc/crontab is scanned as well, since it is a system-wide cron source too.
- name: Check system-wide cron directories
  ansible.builtin.shell: |
    pattern='traefik|docker.*compose.*traefik|docker.*stop.*traefik|docker.*restart.*traefik|docker.*down.*traefik'
    for target in /etc/crontab /etc/cron.d /etc/cron.daily /etc/cron.hourly /etc/cron.weekly /etc/cron.monthly; do
      # Skip locations that do not exist on this host.
      [ -e "$target" ] || continue
      # grep exits non-zero when nothing matches: skip silently in that case.
      matches=$(grep -rE "$pattern" "$target" 2>/dev/null) || continue
      echo "=== $target ==="
      echo "$matches"
    done
  register: system_cron_dirs
  changed_when: false
# Look for systemd timers and service units whose listing line mentions
# Traefik or docker compose.
#
# A section header is printed only when that section has hits, and stdout is
# empty when neither has any. The earlier version always printed both headers
# plus a "No Traefik-related ..." placeholder per empty section, so the
# summary's `'No Traefik-related' not in stdout` test suppressed e.g. a
# matching service whenever no timer matched.
# `--no-pager` is passed to both calls for consistent non-interactive output.
- name: Check systemd timers and services
  ansible.builtin.shell: |
    pattern='traefik|docker.*compose'
    timers=$(systemctl list-timers --all --no-pager | grep -E "$pattern")
    services=$(systemctl list-units --type=service --all --no-pager | grep -E "$pattern")
    if [ -n "$timers" ]; then
      echo "=== Active Timers ==="
      echo "$timers"
    fi
    if [ -n "$services" ]; then
      echo "=== Custom Services ==="
      echo "$services"
    fi
  register: systemd_services
  changed_when: false
# List up to 30 shell/YAML files below /home/deploy/deployment whose content
# matches a Traefik restart/down/stop pattern. Only file names are printed
# (grep -l); stdout is empty when nothing matches.
- name: Check for scripts in deployment directory that restart Traefik
  ansible.builtin.shell:
    cmd: |
      find /home/deploy/deployment -type f \( -name "*.sh" -o -name "*.yml" -o -name "*.yaml" \) \
        -exec grep -lE "traefik.*restart|docker.*compose.*traefik.*restart|docker.*compose.*traefik.*down|docker.*compose.*traefik.*stop" {} \; 2>/dev/null |
        head -30
  changed_when: false
  register: deployment_scripts
# Grep the deployed Ansible roles for auto-restart settings or Traefik restart
# commands, drop lines matching ".git", and keep the first 20 hits.
# NOTE(review): the `|| echo` fallback is evaluated against `head`, the last
# command of the pipeline.
- name: Check Ansible roles for traefik_auto_restart or restart tasks
  ansible.builtin.shell:
    cmd: |
      grep -rE "traefik_auto_restart|traefik.*restart|docker.*compose.*traefik.*restart" /home/deploy/deployment/ansible/roles/ 2>/dev/null |
        grep -v ".git" |
        head -20 || echo "No auto-restart settings found"
  changed_when: false
  register: ansible_auto_restart
# Record whether `docker compose ps traefik` lists the service: prints
# "running" when it does and "not_running" otherwise.
#
# The task used to be named "Check Docker Compose watch mode", but nothing here
# inspects watch mode; the name now states what the command actually tests.
# The registered variable keeps its historical name because the summary task
# reads it.
- name: Check whether Traefik service is listed by Docker Compose
  ansible.builtin.shell: |
    cd {{ traefik_stack_path }}
    docker compose ps traefik 2>/dev/null | grep -q "traefik" && echo "running" || echo "not_running"
  register: docker_compose_watch
  changed_when: false
  failed_when: false
# Scan the process table for a docker compose invocation using watch mode.
# Prints the matching `ps aux` lines, or the sentinel text when none remain
# after the grep process itself has been filtered out.
- name: Check if Docker Compose is running in watch mode
  ansible.builtin.shell:
    cmd: |
      ps aux |
        grep -E "docker.*compose.*watch|docker.*compose.*--watch" |
        grep -v grep || echo "No Docker Compose watch mode detected"
  changed_when: false
  register: watch_mode_process
# List up to 10 regular files below /home/deploy whose NAME looks like a
# monitor/watchdog/health script and whose CONTENT mentions Traefik.
#
# Fixes compared with the earlier one-liner:
# - The -name alternatives are grouped with \( ... \). Without the grouping,
#   `-type f` applied only to "*monitor*", so directories and special files
#   named *watchdog* / *health* were handed to grep as well.
# - File names travel NUL-separated (-print0 / xargs -0), so paths containing
#   whitespace are not split; `-r` skips grep entirely when find yields nothing.
# - The sentinel text is printed from an explicit emptiness test; the former
#   `| head -10 || echo ...` could not fire because it followed `head`.
- name: Check for monitoring/watchdog scripts
  ansible.builtin.shell: |
    scripts=$(
      find /home/deploy -type f \( -name "*monitor*" -o -name "*watchdog*" -o -name "*health*" \) -print0 2>/dev/null |
        xargs -0 -r grep -lE "traefik|docker.*compose.*traefik" 2>/dev/null |
        head -10
    )
    if [ -n "$scripts" ]; then
      echo "$scripts"
    else
      echo "No monitoring scripts found"
    fi
  register: monitoring_scripts
  changed_when: false
# List up to 10 Gitea/GitHub workflow files below /home/deploy that contain a
# Traefik restart command.
#
# Fixes compared with the earlier one-liner:
# - The -path alternatives are grouped with \( ... \). Without the grouping,
#   `-type f` applied only to the .gitea pattern.
# - Workflow files ending in .yaml are matched in addition to .yml.
# - File names travel NUL-separated (-print0 / xargs -0), so paths containing
#   whitespace are not split; `-r` skips grep when find yields nothing.
# - The sentinel text is printed from an explicit emptiness test; the former
#   `| head -10 || echo ...` could not fire because it followed `head`.
- name: Check Gitea Workflows for Traefik restarts
  ansible.builtin.shell: |
    workflows=$(
      find /home/deploy -type f \( \
          -path "*/.gitea/workflows/*.yml" -o -path "*/.gitea/workflows/*.yaml" \
          -o -path "*/.github/workflows/*.yml" -o -path "*/.github/workflows/*.yaml" \
        \) -print0 2>/dev/null |
        xargs -0 -r grep -lE "traefik.*restart|docker.*compose.*traefik.*restart" 2>/dev/null |
        head -10
    )
    if [ -n "$workflows" ]; then
      echo "$workflows"
    else
      echo "No Gitea workflows found that restart Traefik"
    fi
  register: gitea_workflows
  changed_when: false
# Start a background `docker events` watch on the container named "traefik".
#
# `timeout` ends the watch after monitor_duration_seconds; the task is launched
# asynchronously (poll: 0) with 10 seconds of headroom, and a later task
# collects the result through the registered job id.
# The task name is templated from the variable: it used to say "5 minutes"
# although the configured duration is 120 seconds.
- name: "Monitor Docker events in real-time ({{ monitor_duration_seconds }} seconds)"
  ansible.builtin.shell: |
    timeout {{ monitor_duration_seconds }} docker events --filter container=traefik --format "{{ '{{' }}.Time{{ '}}' }} {{ '{{' }}.Action{{ '}}' }} {{ '{{' }}.Actor.Attributes.name{{ '}}' }}" 2>&1 || echo "Monitoring completed or timeout"
  register: docker_events_realtime
  changed_when: false
  failed_when: false
  async: "{{ monitor_duration_seconds + 10 }}"
  poll: 0
# Poll the background `docker events` job every 10 seconds until it reports
# `finished`. The retry budget is the monitoring window divided by the delay,
# plus 5 extra attempts. A job that never finishes does not fail the play.
- name: Wait for monitoring to complete
  ansible.builtin.async_status:
    jid: "{{ docker_events_realtime.ansible_job_id }}"
  register: monitoring_result
  until: monitoring_result.finished
  delay: 10
  retries: "{{ (monitor_duration_seconds / 10) | int + 5 }}"
  failed_when: false
# Report host reboots recorded during the last 24 hours (at most 10 entries).
#
# Only lines that start with "reboot" are kept. `last` normally also prints a
# trailing "wtmp begins ..." line, so the unfiltered output was never empty:
# the sentinel could not fire and the summary listed "SYSTEM REBOOTS" on every
# run. The sentinel is now printed from an explicit emptiness test.
- name: Check system reboot history
  ansible.builtin.shell: |
    reboots=$(last reboot --since "24 hours ago" 2>/dev/null | grep -E '^reboot' | head -10)
    if [ -n "$reboots" ]; then
      echo "$reboots"
    else
      echo "No reboots in last 24 hours"
    fi
  register: reboot_history
  changed_when: false
  failed_when: false
# Inspect every queued `at` job (first column of `atq` is the job id) and print
# the lines of its script that mention Traefik, each group preceded by a
# "=== Job ID: <id> ===" header. Jobs without matching lines print nothing.
- name: Check for at jobs
  ansible.builtin.shell:
    cmd: |
      for job_id in $(atq 2>/dev/null | awk '{print $1}'); do
        hits=$(at -c "$job_id" 2>/dev/null | grep -E "traefik|docker.*compose.*traefik") || continue
        echo "=== Job ID: $job_id ==="
        echo "$hits"
      done || echo "No at jobs found or atq not available"
  changed_when: false
  register: at_jobs
# Print the lines of /etc/docker/daemon.json that mention "restart" or
# "live-restore" (case-insensitive); prints the sentinel text when the file is
# missing or has no such line.
- name: Check Docker daemon configuration for auto-restart
  ansible.builtin.shell:
    cmd: >-
      grep -iE "restart|live-restore" /etc/docker/daemon.json 2>/dev/null
      || echo "No restart settings in daemon.json"
  failed_when: false
  changed_when: false
  register: docker_daemon_config
# Report the restart policy of the container named "traefik".
#
# The policy is read from the container itself via `docker inspect`. The
# earlier approach grepped five lines after any line containing "traefik:" in
# `docker compose config`, which only finds the `restart` key if it happens to
# sit inside that window and can pick up lines belonging to other matches.
# Prints "restart: <policy>", or the sentinel text when the container cannot be
# inspected or its policy is empty / "no".
- name: Check if Traefik has restart policy
  ansible.builtin.shell: |
    policy=$(docker inspect traefik --format '{{ '{{' }}.HostConfig.RestartPolicy.Name{{ '}}' }}' 2>/dev/null)
    if [ -n "$policy" ] && [ "$policy" != "no" ]; then
      echo "restart: $policy"
    else
      echo "No explicit restart policy found"
    fi
  register: traefik_restart_policy
  changed_when: false
  failed_when: false
# Print the consolidated diagnosis report (report text is German).
#
# Each "*_found" flag is computed once at the top of the template and reused by
# the "Gefundene Quellen", "NÄCHSTE SCHRITTE" and "nothing found" sections, so
# the three sections cannot contradict each other.
#
# Changes compared with the earlier template:
# - system cron / systemd / reboot results are evaluated line by line (headers,
#   placeholder lines and blank lines are ignored), so a placeholder printed
#   for one location can no longer hide a hit in another.
# - deployment_scripts is tested for non-empty output only; the former
#   `'No' in stdout` test also matched any file path containing "No".
# - The compose "running"/"not_running" state is shown under "Traefik Status"
#   instead of always being listed as restart source no. 9.
# - The collected restart policy and daemon.json findings are reported.
# - "KEINE AUTOMATISCHEN RESTART-MECHANISMEN GEFUNDEN" is printed only when
#   none of the checks found anything, not just the first five.
# - The real-time section tolerates a missing `finished` key and falls back to
#   the placeholder text for an empty stdout as well.
- name: Summary
  ansible.builtin.debug:
    msg: |
      {% set user_cron_found = (all_user_crontabs.stdout | trim | length > 0) and 'No user crontabs' not in all_user_crontabs.stdout %}
      {% set system_cron_found = system_cron_dirs.stdout_lines | reject('match', '^(=== |No matches| *$)') | list | length > 0 %}
      {% set systemd_found = systemd_services.stdout_lines | reject('match', '^(=== |No Traefik-related| *$)') | list | length > 0 %}
      {% set deploy_scripts_found = deployment_scripts.stdout | trim | length > 0 %}
      {% set ansible_restart_found = (ansible_auto_restart.stdout | trim | length > 0) and 'No auto-restart' not in ansible_auto_restart.stdout %}
      {% set workflows_found = (gitea_workflows.stdout | trim | length > 0) and 'No Gitea workflows' not in gitea_workflows.stdout %}
      {% set monitoring_found = (monitoring_scripts.stdout | trim | length > 0) and 'No monitoring scripts' not in monitoring_scripts.stdout %}
      {% set at_jobs_found = (at_jobs.stdout | trim | length > 0) and 'No at jobs' not in at_jobs.stdout %}
      {% set watch_process_found = (watch_mode_process.stdout | trim | length > 0) and 'No Docker Compose watch' not in watch_mode_process.stdout %}
      {% set reboots_found = reboot_history.stdout_lines | select('match', '^reboot') | list | length > 0 %}
      {% set other_found = workflows_found or monitoring_found or at_jobs_found or watch_process_found or reboots_found %}
      {% set any_found = user_cron_found or system_cron_found or systemd_found or deploy_scripts_found or ansible_restart_found or other_found %}
      ================================================================================
      TRAEFIK RESTART SOURCE DIAGNOSE - ZUSAMMENFASSUNG:
      ================================================================================
      Traefik Status:
      - Restart Count: {{ traefik_restart_count.stdout }}
      - Started At: {{ traefik_started_at.stdout }}
      - Stop Messages gefunden: {{ traefik_stop_messages.stdout_lines | length }} (letzte 20)
      - Docker Compose Status: {{ docker_compose_watch.stdout }}
      - Restart Policy: {{ traefik_restart_policy.stdout }}
      - Docker daemon.json: {{ docker_daemon_config.stdout }}
      Stop-Zeitstempel (letzte 20):
      {% if stop_timestamps.stdout %}
      {{ stop_timestamps.stdout }}
      {% else %}
      Keine Stop-Zeitstempel gefunden
      {% endif %}
      Docker Events (letzte 24h):
      {% if docker_events_traefik.stdout and 'No Traefik die events' not in docker_events_traefik.stdout %}
      {{ docker_events_traefik.stdout }}
      {% else %}
      Keine Traefik die-Events in den letzten 24 Stunden
      {% endif %}
      Docker Daemon Logs:
      {% if docker_daemon_logs.stdout and 'No Traefik stop events' not in docker_daemon_logs.stdout %}
      {{ docker_daemon_logs.stdout }}
      {% else %}
      Keine Traefik-Stop-Events in Docker-Daemon-Logs
      {% endif %}
      Gefundene Quellen:
      {% if user_cron_found %}
      1. ❌ CRONJOBS (User):
      {{ all_user_crontabs.stdout }}
      {% endif %}
      {% if system_cron_found %}
      2. ❌ SYSTEM CRON:
      {{ system_cron_dirs.stdout }}
      {% endif %}
      {% if systemd_found %}
      3. ❌ SYSTEMD TIMERS/SERVICES:
      {{ systemd_services.stdout }}
      {% endif %}
      {% if deploy_scripts_found %}
      4. ⚠️ DEPLOYMENT SCRIPTS:
      {{ deployment_scripts.stdout }}
      {% endif %}
      {% if ansible_restart_found %}
      5. ⚠️ ANSIBLE AUTO-RESTART:
      {{ ansible_auto_restart.stdout }}
      {% endif %}
      {% if workflows_found %}
      6. ⚠️ GITEA WORKFLOWS:
      {{ gitea_workflows.stdout }}
      {% endif %}
      {% if monitoring_found %}
      7. ⚠️ MONITORING SCRIPTS:
      {{ monitoring_scripts.stdout }}
      {% endif %}
      {% if at_jobs_found %}
      8. ❌ AT JOBS:
      {{ at_jobs.stdout }}
      {% endif %}
      {% if watch_process_found %}
      9. ❌ DOCKER COMPOSE WATCH MODE (PROZESS):
      {{ watch_mode_process.stdout }}
      {% endif %}
      {% if reboots_found %}
      10. ⚠️ SYSTEM REBOOTS:
      {{ reboot_history.stdout }}
      {% endif %}
      Real-Time Monitoring ({{ monitor_duration_seconds }} Sekunden):
      {% if monitoring_result.finished | default(false) %}
      {{ monitoring_result.stdout | default('Keine Events während Monitoring', true) }}
      {% else %}
      Monitoring läuft noch oder wurde unterbrochen
      {% endif %}
      ================================================================================
      NÄCHSTE SCHRITTE:
      ================================================================================
      {% if user_cron_found %}
      1. ❌ CRONJOBS DEAKTIVIEREN:
      - Prüfe gefundene Cronjobs: {{ all_user_crontabs.stdout }}
      - Entferne oder kommentiere die entsprechenden Einträge
      {% endif %}
      {% if system_cron_found %}
      2. ❌ SYSTEM CRON DEAKTIVIEREN:
      - Prüfe gefundene System-Cronjobs: {{ system_cron_dirs.stdout }}
      - Entferne oder benenne die Dateien um
      {% endif %}
      {% if systemd_found %}
      3. ❌ SYSTEMD TIMERS/SERVICES DEAKTIVIEREN:
      - Prüfe gefundene Services/Timers: {{ systemd_services.stdout }}
      - Deaktiviere mit: systemctl disable <service>
      {% endif %}
      {% if deploy_scripts_found %}
      4. ⚠️ DEPLOYMENT SCRIPTS PRÜFEN:
      - Prüfe gefundene Scripts: {{ deployment_scripts.stdout }}
      - Entferne oder kommentiere Traefik-Restart-Befehle
      {% endif %}
      {% if ansible_restart_found %}
      5. ⚠️ ANSIBLE AUTO-RESTART PRÜFEN:
      - Prüfe gefundene Einstellungen: {{ ansible_auto_restart.stdout }}
      - Setze traefik_auto_restart: false in group_vars
      {% endif %}
      {% if other_found %}
      6. ⚠️ WEITERE FUNDE PRÜFEN:
      - Prüfe die übrigen unter "Gefundene Quellen" aufgeführten Einträge (Workflows, Monitoring-Scripts, at-Jobs, Watch-Modus, Reboots)
      {% endif %}
      {% if not any_found %}
      ⚠️ KEINE AUTOMATISCHEN RESTART-MECHANISMEN GEFUNDEN!
      Mögliche Ursachen:
      1. Externer Prozess (nicht über Cron/Systemd)
      2. Docker-Service-Restarts (systemctl restart docker)
      3. Host-Reboots
      4. Manuelle Restarts (von außen)
      5. Monitoring-Service (Portainer, Watchtower, etc.)
      Nächste Schritte:
      1. Führe 'docker events --filter container=traefik' manuell aus und beobachte
      2. Prüfe journalctl -u docker.service für Docker-Service-Restarts
      3. Prüfe ob Portainer oder andere Monitoring-Tools laufen
      4. Prüfe ob Watchtower oder andere Auto-Update-Tools installiert sind
      {% endif %}
      ================================================================================