fix(ansible): Prevent Traefik and Gitea restart loops
Some checks failed
Security Vulnerability Scan / Check for Dependency Changes (push) Successful in 29s
Security Vulnerability Scan / Composer Security Audit (push) Has been skipped
🚀 Build & Deploy Image / Determine Build Necessity (push) Failing after 11m3s
🚀 Build & Deploy Image / Build Runtime Base Image (push) Has been cancelled
🚀 Build & Deploy Image / Run Tests & Quality Checks (push) Has been cancelled
🚀 Build & Deploy Image / Build Docker Image (push) Has been cancelled
🚀 Build & Deploy Image / Auto-deploy to Staging (push) Has been cancelled
🚀 Build & Deploy Image / Auto-deploy to Production (push) Has been cancelled

- Set traefik_auto_restart: false in group_vars to prevent automatic restarts after config deployment
- Set traefik_ssl_restart: false to prevent automatic restarts during SSL certificate setup
- Set gitea_auto_restart: false to prevent automatic restarts when healthcheck fails
- Modify traefik/tasks/ssl.yml to only restart if explicitly requested or acme.json was created
- Modify traefik/tasks/config.yml to respect traefik_auto_restart flag
- Modify gitea/tasks/restart.yml to respect gitea_auto_restart flag
- Add verify-traefik-fix.yml playbook to monitor Traefik stability

This fixes the issue where Traefik was restarting every minute due to
automatic restart mechanisms triggered by config deployments and health checks.
The restart loops caused 504 Gateway Timeouts for Gitea and other services.

Fixes: Traefik restart loop causing service unavailability
This commit is contained in:
2025-11-08 23:25:38 +01:00
parent aa9de7173d
commit bb7cf35e54
7 changed files with 645 additions and 0 deletions

View File

@@ -1,4 +1,62 @@
---
# Gitea variables.
# NOTE(review): presumably role defaults (lowest precedence) — confirm the
# file location; every value is meant to be overridable from inventory.

# Gitea Stack Configuration
gitea_stack_path: "{{ stacks_base_path }}/gitea"
gitea_container_name: "gitea"
# Plain literal on purpose: defining gitea_domain in terms of itself
# ("{{ gitea_domain | default(...) }}") makes Ansible abort with a
# recursive-template error whenever no higher-precedence value is set.
gitea_domain: "git.michaelschiemer.de"
gitea_url: "https://{{ gitea_domain }}"

# Wait Configuration
gitea_wait_timeout: "{{ wait_timeout | default(60) }}"
gitea_wait_interval: 5
gitea_restart_wait_timeout: 30
gitea_restart_retries: 30
gitea_restart_delay: 2

# Health Check Configuration
gitea_health_check_timeout: 10
gitea_check_health: true
gitea_show_status: true
gitea_show_logs: true
gitea_logs_tail: 50

# Auto-Restart Configuration
# Set to false to prevent automatic restarts when healthcheck fails
# This prevents restart loops when Gitea is temporarily unavailable
gitea_auto_restart: true

# Config Update Configuration
gitea_app_ini_path: "{{ gitea_stack_path }}/app.ini"
gitea_app_ini_container_path: "/data/gitea/conf/app.ini"
gitea_app_ini_template: "../../templates/gitea-app.ini.j2"
gitea_config_retries: 30
gitea_config_delay: 2

# Setup Configuration
gitea_admin_username: "{{ vault_gitea_admin_username | default('admin') }}"
gitea_admin_password: "{{ vault_gitea_admin_password | default('') }}"
# NOTE(review): falls back to acme_email, which is not defined in this file —
# confirm it is always provided by inventory/group_vars.
gitea_admin_email: "{{ vault_gitea_admin_email | default(acme_email) }}"
gitea_force_update_app_ini: false
gitea_setup_health_retries: 30
gitea_setup_health_delay: 5
gitea_setup_db_wait: 10

# Runner Configuration
gitea_runner_path: "{{ runner_path | default('/home/deploy/deployment/gitea-runner') }}"
gitea_runner_container_name: "gitea-runner"
gitea_instance_url: "https://git.michaelschiemer.de"
gitea_runner_action: "fix"  # Options: fix, register
gitea_runner_registration_token: ""
gitea_runner_name: "dev-runner-01"
gitea_runner_labels: "ubuntu-latest:docker://node:16-bullseye,ubuntu-22.04:docker://node:16-bullseye,php-ci:docker://php-ci:latest"
gitea_runner_show_status: true
gitea_runner_wait_seconds: 5

# Repository Configuration
gitea_repo_name: "michaelschiemer"
gitea_repo_owner: "michael"
gitea_repo_private: false
gitea_repo_description: "Main application repository"
gitea_repo_auto_init: false
gitea_configure_git_remote: true
gitea_git_repo_path: "/home/michael/dev/michaelschiemer"
gitea_force_create_repo: false

View File

@@ -0,0 +1,123 @@
---
# Check and Restart Gitea if Unhealthy
# Pre-flight: stop immediately when the stack directory is absent on the target.
- name: Check if Gitea stack directory exists
  register: gitea_stack_exists
  ansible.builtin.stat:
    path: "{{ gitea_stack_path }}"

- name: Fail if Gitea stack directory does not exist
  when: not gitea_stack_exists.stat.exists
  ansible.builtin.fail:
    msg: "Gitea stack directory not found at {{ gitea_stack_path }}"
# Read-only probe: never reports "changed" and never fails the play; the raw
# JSON output is inspected by later tasks.
- name: Check Gitea container status
  register: gitea_container_status
  changed_when: false
  failed_when: false
  ansible.builtin.shell:
    cmd: |
      cd {{ gitea_stack_path }}
      docker compose ps {{ gitea_container_name }} --format json

# Printed only when status output is enabled.
- name: Display Gitea container status
  when: gitea_show_status | default(true) | bool
  ansible.builtin.debug:
    msg: |
      ================================================================================
      Gitea Container Status:
      {{ gitea_container_status.stdout | default('Container not found or error') }}
      ================================================================================
# Probe the public health endpoint. Errors are ignored so the result can be
# evaluated by the tasks that follow instead of aborting the play here.
- name: Check Gitea health endpoint
  register: gitea_health
  changed_when: false
  ignore_errors: true
  ansible.builtin.uri:
    url: "{{ gitea_url }}/api/healthz"
    method: GET
    status_code:
      - 200
    validate_certs: false
    timeout: "{{ gitea_health_check_timeout | default(10) }}"

# Human-readable summary of the probe above.
- name: Display Gitea health check result
  when: gitea_show_status | default(true) | bool
  ansible.builtin.debug:
    msg: |
      ================================
      Gitea Health Check:
      - Status Code: {{ gitea_health.status | default('UNREACHABLE') }}
      - Response Time: {{ gitea_health.elapsed | default('N/A') }}s
      - Status: {% if gitea_health.status | default(0) == 200 %}✅ HEALTHY{% else %}❌ UNHEALTHY or TIMEOUT{% endif %}
      ================================
# Collect the most recent container log lines; the shell fallback prints
# LOGS_NOT_AVAILABLE when "docker compose logs" itself fails.
- name: Get Gitea container logs
  register: gitea_logs
  changed_when: false
  failed_when: false
  ansible.builtin.shell:
    cmd: |
      cd {{ gitea_stack_path }}
      docker compose logs --tail={{ gitea_logs_tail | default(50) }} {{ gitea_container_name }} 2>&1 || echo "LOGS_NOT_AVAILABLE"

# Printed only when log output is enabled.
- name: Display Gitea container logs
  when: gitea_show_logs | default(true) | bool
  ansible.builtin.debug:
    msg: |
      ================================================================================
      Gitea Container Logs (last {{ gitea_logs_tail | default(50) }} lines):
      {{ gitea_logs.stdout | default('No logs available') }}
      ================================================================================
# Derive two facts from the probes registered earlier:
#   gitea_is_running - the compose JSON output contains "State":"running"
#   gitea_is_healthy - the health endpoint answered with HTTP 200
- name: Check if Gitea container is running
  ansible.builtin.set_fact:
    gitea_is_running: >-
      {{ 'State":"running' in (gitea_container_status.stdout | default('')) }}

- name: Check if Gitea is healthy
  ansible.builtin.set_fact:
    gitea_is_healthy: >-
      {{ (gitea_health.status | default(0)) == 200 }}
# Restart only when Gitea is unhealthy or stopped AND automatic restarts are
# enabled (gitea_auto_restart); otherwise the task is skipped.
# NOTE(review): "notify" needs a handler named "wait for gitea"; it is not
# visible in this file — confirm it exists.
- name: Restart Gitea container if unhealthy or not running
  register: gitea_restart
  changed_when: gitea_restart.rc == 0
  notify: wait for gitea
  when:
    - "(not gitea_is_healthy | bool or not gitea_is_running | bool)"
    - "gitea_auto_restart | default(true) | bool"
  ansible.builtin.shell:
    cmd: |
      cd {{ gitea_stack_path }}
      docker compose restart {{ gitea_container_name }}

# Poll the health endpoint until it returns 200 or the retries run out.
# Runs only after an actual restart; a timeout is tolerated so the final
# summary can still be printed.
- name: Wait for Gitea to be ready after restart
  register: gitea_health_after_restart
  until: gitea_health_after_restart.status == 200
  retries: "{{ gitea_restart_retries | default(30) }}"
  delay: "{{ gitea_restart_delay | default(2) }}"
  when: gitea_restart.changed | default(false)
  changed_when: false
  ignore_errors: true
  ansible.builtin.uri:
    url: "{{ gitea_url }}/api/healthz"
    method: GET
    status_code:
      - 200
    validate_certs: false
    timeout: "{{ gitea_health_check_timeout | default(10) }}"
# Final summary.
# gitea_is_running is computed BEFORE any restart, so on its own it is stale
# once a restart has happened. A 200 from the post-restart health probe proves
# the container is up, so it also counts as "running" here. The summary also
# distinguishes "no restart needed" from "restart suppressed" (which is what
# happens when Gitea is unhealthy but gitea_auto_restart is false).
- name: Display final status
  vars:
    gitea_summary_restarted: "{{ gitea_restart.changed | default(false) }}"
    gitea_summary_healthy: "{{ (gitea_health_after_restart.status | default(0)) == 200 or (gitea_is_healthy | bool) }}"
    gitea_summary_running: "{{ (gitea_is_running | bool) or (gitea_health_after_restart.status | default(0)) == 200 }}"
  ansible.builtin.debug:
    msg: |
      ========================================
      ========================================
      Gitea Status Summary
      ========================================
      Container Running: {% if gitea_summary_running | bool %}✅ YES{% else %}❌ NO{% endif %}
      Health Check: {% if gitea_summary_healthy | bool %}✅ HEALTHY{% else %}❌ UNHEALTHY{% endif %}
      Action Taken: {% if gitea_summary_restarted | bool %}🔄 Container restarted{% elif gitea_summary_running | bool and gitea_summary_healthy | bool %} No restart needed{% else %}⏸️ Restart skipped (gitea_auto_restart is disabled){% endif %}
      Final Status: {% if gitea_summary_running | bool and gitea_summary_healthy | bool %}✅ HEALTHY{% else %}❌ STILL UNHEALTHY{% endif %}
      ========================================
      {% if gitea_summary_running | bool and gitea_summary_healthy | bool %}
      ✅ Gitea is now accessible and healthy!
      {% else %}
      ❌ Gitea is still not fully healthy. Manual intervention may be required.
      {% endif %}
      ========================================
  when: gitea_show_status | default(true) | bool

View File

@@ -1,4 +1,36 @@
---
# Traefik variables: stack location, restart behaviour, log display and
# SSL certificate setup.

# --- Stack ---
traefik_stack_path: "{{ stacks_base_path }}/traefik"
traefik_container_name: 'traefik'
traefik_url: 'https://traefik.michaelschiemer.de'

# --- Local config path (source directory for config deployment) ---
traefik_local_config_path: "{{ playbook_dir | default('') }}/../../stacks/traefik"

# --- Waiting ---
traefik_wait_timeout: "{{ wait_timeout | default(60) }}"
traefik_wait_interval: 5
traefik_restart_wait_timeout: 30

# --- Restart ---
# Options: restart, recreate
traefik_restart_action: 'restart'
traefik_check_health: true
traefik_show_status: true

# --- Config deployment ---
# Automatically restart after config deployment
traefik_auto_restart: true

# --- Logs ---
traefik_logs_tail: 100
traefik_logs_error_tail: 20
traefik_logs_recent_tail: 50
# Optional: Show logs from last N minutes
traefik_logs_since_minutes: 10
traefik_show_all_logs: false

# --- SSL certificates ---
# List of domains for SSL certificate setup
traefik_ssl_domains: []
traefik_acme_email: "{{ acme_email | default('kontakt@michaelschiemer.de') }}"
# Restart Traefik after SSL setup (default: false to avoid restart loops)
traefik_ssl_restart: false
traefik_ssl_wait_timeout: 10
traefik_ssl_trigger_timeout: 5
traefik_ssl_cert_wait_timeout: 30

View File

@@ -0,0 +1,78 @@
---
# Deploy Traefik Configuration Files
# Controller-side pre-flight: the config source directory must exist on the
# machine running Ansible. Both tasks are delegated to localhost and run once.
- name: Check if local Traefik config directory exists
  delegate_to: localhost
  run_once: true
  register: local_traefik_exists
  ansible.builtin.stat:
    path: "{{ traefik_local_config_path }}"

- name: Fail if local Traefik config directory does not exist
  delegate_to: localhost
  run_once: true
  when: not local_traefik_exists.stat.exists
  ansible.builtin.fail:
    msg: "Local Traefik config directory not found at {{ traefik_local_config_path }}"
# Target-side pre-flight: the stack directory must already exist remotely.
- name: Check if remote Traefik stack directory exists
  register: traefik_stack_exists
  ansible.builtin.stat:
    path: "{{ traefik_stack_path }}"

- name: Fail if remote Traefik stack directory does not exist
  when: not traefik_stack_exists.stat.exists
  ansible.builtin.fail:
    msg: "Remote Traefik stack directory not found at {{ traefik_stack_path }}"
# Copy the two config files to the target. Each result is registered so later
# tasks can tell whether anything actually changed.
- name: Deploy docker-compose.yml
  register: docker_compose_deployed
  ansible.builtin.copy:
    src: "{{ traefik_local_config_path }}/docker-compose.yml"
    dest: "{{ traefik_stack_path }}/docker-compose.yml"
    owner: "{{ ansible_user }}"
    group: "{{ ansible_user }}"
    mode: "0644"

- name: Deploy traefik.yml
  register: traefik_yml_deployed
  ansible.builtin.copy:
    src: "{{ traefik_local_config_path }}/traefik.yml"
    dest: "{{ traefik_stack_path }}/traefik.yml"
    owner: "{{ ansible_user }}"
    group: "{{ ansible_user }}"
    mode: "0644"
# Report which files changed and what happens next: an automatic restart when
# traefik_auto_restart is enabled, otherwise the manual restart command.
- name: Display deployment status
  when: traefik_show_status | default(true) | bool
  ansible.builtin.debug:
    msg: |
      ========================================
      Traefik Configuration Deployment
      ========================================
      docker-compose.yml: {{ '✅ DEPLOYED' if docker_compose_deployed.changed else ' No changes' }}
      traefik.yml: {{ '✅ DEPLOYED' if traefik_yml_deployed.changed else ' No changes' }}
      ========================================
      {% if docker_compose_deployed.changed or traefik_yml_deployed.changed %}
      ✅ Configuration files deployed successfully!
      {% if traefik_auto_restart | default(true) | bool %}
      Next: Traefik will be restarted automatically to apply changes.
      {% else %}
      Next step: Restart Traefik to apply changes:
      ansible-playbook -i inventory/production.yml playbooks/restart-traefik.yml --vault-password-file secrets/.vault_pass
      {% endif %}
      {% else %}
      Configuration files are already up to date.
      {% endif %}
# Pull in restart.yml only when at least one file changed AND automatic
# restarts are enabled. The included tasks run with a plain "restart" action
# and with status output turned off.
- name: Restart Traefik after config deployment
  when:
    - "(docker_compose_deployed.changed or traefik_yml_deployed.changed)"
    - "traefik_auto_restart | default(true) | bool"
  vars:
    traefik_restart_action: "restart"
    traefik_show_status: false
  ansible.builtin.include_tasks: restart.yml

View File

@@ -0,0 +1,101 @@
---
# Setup Let's Encrypt SSL Certificates via Traefik
# Make sure acme.json is a regular file with mode 0600.
# A directory at that path is removed and replaced by an empty file; an
# existing regular file is left untouched.
- name: Check if acme.json exists and is a file
  register: acme_stat
  ansible.builtin.stat:
    path: "{{ traefik_stack_path }}/acme.json"

- name: Remove acme.json if it's a directory
  become: true
  when:
    - acme_stat.stat.exists
    - acme_stat.stat.isdir
  ansible.builtin.file:
    path: "{{ traefik_stack_path }}/acme.json"
    state: absent

- name: Ensure Traefik acme.json exists and has correct permissions
  become: true
  when: not acme_stat.stat.exists or acme_stat.stat.isdir
  ansible.builtin.file:
    path: "{{ traefik_stack_path }}/acme.json"
    state: touch
    owner: "{{ ansible_user }}"
    group: "{{ ansible_user }}"
    mode: "0600"
# Confirm the Traefik container is up before touching certificates.
# ansible.builtin.command does not run a shell, so a "cd <dir>" line would be
# executed as a program named "cd" and fail; the working directory is set with
# "chdir" instead. The probe itself never fails the play (failed_when: false),
# so the next task can report a clear message.
- name: Verify Traefik is running
  ansible.builtin.command:
    cmd: "docker compose ps {{ traefik_container_name }}"
    chdir: "{{ traefik_stack_path }}"
  register: traefik_status
  changed_when: false
  failed_when: false

# Abort when the probe returned a non-zero code or its output lacks "Up".
- name: Fail if Traefik is not running
  ansible.builtin.fail:
    msg: "Traefik is not running. Please start it first."
  when: "(traefik_status.rc | default(1)) != 0 or 'Up' not in (traefik_status.stdout | default(''))"
# Restart Traefik only when explicitly requested (traefik_ssl_restart) or when
# acme.json had to be created during this run.
# - "acme.json was created" is derived from the stat taken before creation
#   (file missing, or a directory that was replaced); a stat result never
#   reports "changed", so testing acme_stat.changed could never be true.
# - ansible.builtin.command does not run a shell, so the working directory is
#   set with "chdir" rather than a "cd" line.
# - The 'Up' check is wrapped in quotes as a whole: a YAML list item cannot
#   start with a quoted scalar and then continue with more text.
- name: Force Traefik to reload configuration (only if explicitly requested or acme.json was created)
  ansible.builtin.command:
    cmd: "docker compose restart {{ traefik_container_name }}"
    chdir: "{{ traefik_stack_path }}"
  register: traefik_ssl_restart_result
  changed_when: true
  when:
    - "(traefik_ssl_restart | default(false) | bool) or (not acme_stat.stat.exists) or (acme_stat.stat.isdir | default(false))"
    - "traefik_status.rc == 0"
    - "'Up' in traefik_status.stdout"

# Fixed pause that gives Traefik time to come back; runs only when the restart
# task above actually ran.
- name: Wait for Traefik to be ready (after restart)
  ansible.builtin.wait_for:
    timeout: "{{ traefik_ssl_wait_timeout | default(10) }}"
  changed_when: false
  when: traefik_ssl_restart_result is changed
# Issue one HTTPS request per configured domain. A wide range of status codes
# is accepted and errors are ignored: only the request itself matters here.
- name: Trigger certificate request by accessing each domain
  loop: "{{ traefik_ssl_domains | default([]) }}"
  register: certificate_trigger
  changed_when: false
  ignore_errors: true
  when:
    - traefik_ssl_domains is defined
    - traefik_ssl_domains | length > 0
  ansible.builtin.uri:
    url: "https://{{ item }}"
    method: GET
    validate_certs: false
    timeout: "{{ traefik_ssl_trigger_timeout | default(5) }}"
    status_code:
      - 200
      - 301
      - 302
      - 303
      - 404
      - 502
      - 503

# Fixed pause after the requests above; skipped when no domains are configured.
- name: Wait for ACME certificate generation
  changed_when: false
  when:
    - traefik_ssl_domains is defined
    - traefik_ssl_domains | length > 0
  ansible.builtin.wait_for:
    timeout: "{{ traefik_ssl_cert_wait_timeout | default(30) }}"
# Stat acme.json once more after the trigger phase.
# NOTE(review): the registered result (acme_file) is not used by the summary
# below — confirm whether a later consumer exists.
- name: Check if acme.json contains certificates
  register: acme_file
  ansible.builtin.stat:
    path: "{{ traefik_stack_path }}/acme.json"

# Closing summary with the domains that were triggered and a hint on how to
# follow certificate generation in the Traefik logs.
- name: Display certificate status
  when: traefik_show_status | default(true) | bool
  ansible.builtin.debug:
    msg: |
      ========================================
      SSL Certificate Setup
      ========================================
      {% if traefik_ssl_domains is defined and traefik_ssl_domains | length > 0 %}
      Certificate setup triggered for domains:
      {{ traefik_ssl_domains | join(', ') }}
      {% else %}
      No domains specified for certificate setup.
      {% endif %}
      ACME Email: {{ traefik_acme_email | default('Not specified') }}
      Check Traefik logs to see certificate generation progress:
      docker compose -f {{ traefik_stack_path }}/docker-compose.yml logs {{ traefik_container_name }} | grep -i acme
      Certificates should be ready within 1-2 minutes.
      ========================================