Some checks failed
🚀 Build & Deploy Image / Determine Build Necessity (push) Failing after 10m14s
🚀 Build & Deploy Image / Build Runtime Base Image (push) Has been skipped
🚀 Build & Deploy Image / Build Docker Image (push) Has been skipped
🚀 Build & Deploy Image / Run Tests & Quality Checks (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Staging (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Production (push) Has been skipped
Security Vulnerability Scan / Check for Dependency Changes (push) Failing after 11m25s
Security Vulnerability Scan / Composer Security Audit (push) Has been cancelled
- Remove middleware reference from Gitea Traefik labels (caused routing issues) - Optimize Gitea connection pool settings (MAX_IDLE_CONNS=30, authentication_timeout=180s) - Add explicit service reference in Traefik labels - Fix intermittent 504 timeouts by improving PostgreSQL connection handling Fixes Gitea unreachability via git.michaelschiemer.de
151 lines
6.9 KiB
YAML
151 lines
6.9 KiB
YAML
---
# Monitor Traefik for unexpected restarts.
# Scans the Traefik logs for "I have to go..." messages and collects evidence
# (Docker events, shell history, Docker service status, reboots) to help
# identify what caused the stop.
- name: Monitor Traefik Restarts
  hosts: production
  gather_facts: true
  become: false

  vars:
    # Look-back window in hours. Override on the command line with
    # `-e monitor_lookback_hours=48` (extra vars take precedence over play vars).
    # A play var must not be defined in terms of itself: templating
    # "{{ monitor_lookback_hours | default(24) }}" under the same name fails
    # with a recursive-loop error whenever no extra var is supplied.
    monitor_lookback_hours: 24

  tasks:
    # In every shell task below the fallback `echo` sits inside the `{ ...; }`
    # group, directly behind the command whose exit status signals "nothing
    # found". Placed after a trailing `| tail` / `| head` it would never run,
    # because a pipeline reports the exit status of its last command.

    - name: Check Traefik logs for "I have to go..." messages
      ansible.builtin.shell: |
        cd /home/deploy/deployment/stacks/traefik
        { docker compose logs traefik --since {{ monitor_lookback_hours }}h 2>&1 | grep -E "I have to go|Stopping server gracefully" || echo "No stop messages found"; } | tail -20
      register: traefik_stop_messages
      changed_when: false

    - name: Display Traefik stop messages
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Traefik Stop-Meldungen (letzte {{ monitor_lookback_hours }} Stunden):
          ================================================================================
          {{ traefik_stop_messages.stdout }}
          ================================================================================

    # The {{ '{{' }} ... {{ '}}' }} sequences emit literal braces so that the
    # Go template reaches `docker` instead of being evaluated by Jinja2.
    - name: Check Traefik container restart count
      ansible.builtin.shell: |
        docker inspect traefik --format '{{ '{{' }}.RestartCount{{ '}}' }}' 2>/dev/null || echo "0"
      register: traefik_restart_count
      changed_when: false

    - name: Check Traefik container start time
      ansible.builtin.shell: |
        docker inspect traefik --format '{{ '{{' }}.State.StartedAt{{ '}}' }}' 2>/dev/null || echo "UNKNOWN"
      register: traefik_started_at
      changed_when: false

    # `docker events` keeps streaming, so `timeout 5` ends it after the past
    # events have been printed. `grep .` passes non-empty lines through and
    # fails when there are none, which triggers the fallback message.
    - name: Check Docker events for Traefik stops
      ansible.builtin.shell: |
        { timeout 5 docker events --since {{ monitor_lookback_hours }}h --filter container=traefik --filter event=die --format "{{ '{{' }}.Time{{ '}}' }} {{ '{{' }}.Action{{ '}}' }} {{ '{{' }}.Actor.Attributes.name{{ '}}' }}" 2>/dev/null | grep . || echo "No stop events found or docker events not available"; } | tail -20
      register: traefik_stop_events
      changed_when: false

    - name: Display Traefik stop events
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Docker Stop-Events für Traefik (letzte {{ monitor_lookback_hours }} Stunden):
          ================================================================================
          {{ traefik_stop_events.stdout }}
          ================================================================================

    # The `history` builtin has nothing to report in the non-interactive shell
    # Ansible starts, so the connecting user's history file is searched
    # instead. Best effort: a missing file simply yields the fallback message.
    - name: Check for manual docker compose commands in history
      ansible.builtin.shell: |
        { grep -E "docker.*compose.*traefik.*(restart|stop|down|up)" ~/.bash_history 2>/dev/null || echo "No manual docker compose commands found in history"; } | tail -10
      register: manual_commands
      changed_when: false
      failed_when: false

    - name: Display manual docker compose commands
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Manuelle Docker Compose Befehle (aus History):
          ================================================================================
          {{ manual_commands.stdout }}
          ================================================================================

    # stderr is merged into stdout so that a failing `systemctl` call shows up
    # in the report instead of producing an empty section.
    - name: Check systemd docker service status
      ansible.builtin.shell: |
        systemctl status docker.service --no-pager -l 2>&1 | head -20
      register: docker_service_status
      changed_when: false
      failed_when: false

    - name: Display Docker service status
      ansible.builtin.debug:
        msg: |
          ================================================================================
          Docker Service Status:
          ================================================================================
          {{ docker_service_status.stdout }}
          ================================================================================

    # `last` appends a "wtmp begins ..." trailer even when nothing matched, so
    # only real `reboot` records are kept; otherwise the summary would always
    # list reboots as a possible cause.
    - name: Check for system reboots
      ansible.builtin.shell: |
        { last reboot --since "{{ monitor_lookback_hours }} hours ago" 2>/dev/null | grep -E "^reboot" || echo "No reboots in the last {{ monitor_lookback_hours }} hours"; } | head -5
      register: reboots
      changed_when: false
      failed_when: false

    - name: Display reboot history
      ansible.builtin.debug:
        msg: |
          ================================================================================
          System Reboots (letzte {{ monitor_lookback_hours }} Stunden):
          ================================================================================
          {{ reboots.stdout }}
          ================================================================================

    # Every ISO-8601 timestamp found in the matched log lines counts as one
    # stop message; the fallback text contains none, so it counts as zero.
    - name: Analyze stop message timestamps
      ansible.builtin.set_fact:
        stop_timestamps: "{{ traefik_stop_messages.stdout | regex_findall('\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}') }}"

    - name: Count stop messages
      ansible.builtin.set_fact:
        stop_count: "{{ stop_timestamps | length | int }}"

    # Possible causes are listed only when the corresponding check returned
    # something other than its "No ..." fallback message.
    - name: Summary
      ansible.builtin.debug:
        msg: |
          ================================================================================
          ZUSAMMENFASSUNG - Traefik Restart Monitoring:
          ================================================================================

          Überwachungszeitraum: Letzte {{ monitor_lookback_hours }} Stunden

          Traefik Status:
          - Restart Count: {{ traefik_restart_count.stdout }}
          - Gestartet um: {{ traefik_started_at.stdout }}
          - Stop-Meldungen gefunden: {{ stop_count | default(0) }}

          {% if (stop_count | default(0) | int) > 0 %}
          ⚠️ {{ stop_count }} Stop-Meldungen gefunden:
          {{ traefik_stop_messages.stdout }}

          Mögliche Ursachen:
          {% if reboots.stdout and 'No reboots' not in reboots.stdout %}
          1. System-Reboots: {{ reboots.stdout }}
          {% endif %}
          {% if traefik_stop_events.stdout and 'No stop events' not in traefik_stop_events.stdout %}
          2. Docker Stop-Events: {{ traefik_stop_events.stdout }}
          {% endif %}
          {% if manual_commands.stdout and 'No manual' not in manual_commands.stdout %}
          3. Manuelle Befehle: {{ manual_commands.stdout }}
          {% endif %}

          Nächste Schritte:
          - Prüfe ob die Stop-Meldungen mit unseren manuellen Restarts übereinstimmen
          - Prüfe ob System-Reboots die Ursache sind
          - Prüfe Docker-Service-Logs für automatische Stops
          {% else %}
          ✅ Keine Stop-Meldungen in den letzten {{ monitor_lookback_hours }} Stunden
          Traefik läuft stabil!
          {% endif %}

          ================================================================================