fix(ansible): Prevent Traefik and Gitea restart loops
Some checks failed
Security Vulnerability Scan / Check for Dependency Changes (push) Successful in 29s
Security Vulnerability Scan / Composer Security Audit (push) Has been skipped
🚀 Build & Deploy Image / Determine Build Necessity (push) Failing after 11m3s
🚀 Build & Deploy Image / Build Runtime Base Image (push) Has been cancelled
🚀 Build & Deploy Image / Run Tests & Quality Checks (push) Has been cancelled
🚀 Build & Deploy Image / Build Docker Image (push) Has been cancelled
🚀 Build & Deploy Image / Auto-deploy to Staging (push) Has been cancelled
🚀 Build & Deploy Image / Auto-deploy to Production (push) Has been cancelled

- Set traefik_auto_restart: false in group_vars to prevent automatic restarts after config deployment
- Set traefik_ssl_restart: false to prevent automatic restarts during SSL certificate setup
- Set gitea_auto_restart: false to prevent automatic restarts when healthcheck fails
- Modify traefik/tasks/ssl.yml to only restart if explicitly requested or acme.json was created
- Modify traefik/tasks/config.yml to respect traefik_auto_restart flag
- Modify gitea/tasks/restart.yml to respect gitea_auto_restart flag
- Add verify-traefik-fix.yml playbook to monitor Traefik stability

This fixes the issue where Traefik was restarting every minute due to
automatic restart mechanisms triggered by config deployments and health checks.
The restart loops caused 504 Gateway Timeouts for Gitea and other services.

Fixes: Traefik restart loop causing service unavailability
This commit is contained in:
2025-11-08 23:25:38 +01:00
parent aa9de7173d
commit bb7cf35e54
7 changed files with 645 additions and 0 deletions

View File

@@ -0,0 +1,101 @@
---
# Set up Let's Encrypt SSL certificates via Traefik
# Record the current state of acme.json (missing, regular file, or directory)
# in `acme_stat`; the tasks below branch on `acme_stat.stat.exists` and
# `acme_stat.stat.isdir`.
- name: Check if acme.json exists and is a file
  register: acme_stat
  ansible.builtin.stat:
    path: "{{ traefik_stack_path }}/acme.json"
# A directory at the acme.json path cannot be used as the ACME storage file,
# so delete it; the following task recreates the path as a regular file.
- name: Remove acme.json if it's a directory
  become: true
  when:
    # List entries are ANDed and evaluated in order, so `isdir` is only read
    # once the path is known to exist.
    - acme_stat.stat.exists
    - acme_stat.stat.isdir
  ansible.builtin.file:
    path: "{{ traefik_stack_path }}/acme.json"
    state: absent
# Create an empty acme.json owned by the connecting user with mode 0600.
# Runs only when the file was missing or was a directory at stat time, so an
# existing regular file is left untouched (its mode and owner are not changed).
- name: Ensure Traefik acme.json exists and has correct permissions
  become: true
  when: (not acme_stat.stat.exists) or acme_stat.stat.isdir
  ansible.builtin.file:
    path: "{{ traefik_stack_path }}/acme.json"
    state: touch
    owner: "{{ ansible_user }}"
    group: "{{ ansible_user }}"
    mode: "0600"
# Query the compose project for the Traefik service and store the result in
# `traefik_status` (rc + stdout) for the checks that follow.
#
# The command module does not run through a shell, so a leading `cd ...` line
# is not a directory change; the working directory is set with `chdir`.
- name: Verify Traefik is running
  ansible.builtin.command:
    cmd: "docker compose ps {{ traefik_container_name }}"
    chdir: "{{ traefik_stack_path }}"
  register: traefik_status
  # Read-only probe: never report a change.
  changed_when: false
  # Do not abort here on a non-zero exit code; the next task inspects
  # `traefik_status.rc` / `stdout` and fails with a clear message instead.
  failed_when: false
# Abort unless the status probe both succeeded (rc 0) and its output contains
# "Up" for the Traefik service.
- name: Fail if Traefik is not running
  when: >-
    not (traefik_status.rc == 0 and "Up" in traefik_status.stdout)
  ansible.builtin.fail:
    msg: "Traefik is not running. Please start it first."
# Restart Traefik and pause afterwards, but only when
#   * a restart was explicitly requested via `traefik_ssl_restart`, or
#   * acme.json had to be created in this run (it was missing or was a
#     directory when `acme_stat` was collected),
# and only if the earlier status probe showed Traefik as running.
#
# Notes on the conditions:
#   * `acme_stat` is a stat result, which never reports `changed`; the
#     "acme.json was created" case is therefore derived from the stat data.
#   * The "Up" check is wrapped in single quotes: a list item that begins with
#     a double-quoted scalar followed by more text is not valid YAML.
# Both tasks share one condition via the block so they cannot drift apart.
- name: Restart Traefik for SSL setup (explicit request or freshly created acme.json)
  when:
    - >-
      (traefik_ssl_restart | default(false) | bool)
      or not acme_stat.stat.exists
      or (acme_stat.stat.isdir | default(false))
    - traefik_status.rc == 0
    - '"Up" in traefik_status.stdout'
  block:
    # The command module does not use a shell, so the working directory is
    # set with `chdir` rather than a `cd` line.
    - name: Force Traefik to reload configuration (only if explicitly requested or acme.json was created)
      ansible.builtin.command:
        cmd: "docker compose restart {{ traefik_container_name }}"
        chdir: "{{ traefik_stack_path }}"
      changed_when: true

    # wait_for with only `timeout` set acts as a plain sleep of that many
    # seconds (default 10); it does not probe Traefik.
    - name: Wait for Traefik to be ready (after restart)
      ansible.builtin.wait_for:
        timeout: "{{ traefik_ssl_wait_timeout | default(10) }}"
      changed_when: false
# Issue one HTTPS GET per configured domain. Certificate validation is
# disabled, the listed status codes count as success, and any remaining
# errors are ignored, so this task never stops the play.
- name: Trigger certificate request by accessing each domain
  when: traefik_ssl_domains is defined and traefik_ssl_domains | length > 0
  loop: "{{ traefik_ssl_domains | default([]) }}"
  ansible.builtin.uri:
    url: "https://{{ item }}"
    method: GET
    validate_certs: false
    timeout: "{{ traefik_ssl_trigger_timeout | default(5) }}"
    status_code:
      - 200
      - 301
      - 302
      - 303
      - 404
      - 502
      - 503
  register: certificate_trigger
  changed_when: false
  ignore_errors: true
# Fixed pause (default 30 seconds) after the trigger requests; skipped when
# no domains are configured. With only `timeout` set, wait_for just sleeps.
- name: Wait for ACME certificate generation
  when: traefik_ssl_domains is defined and traefik_ssl_domains | length > 0
  changed_when: false
  ansible.builtin.wait_for:
    timeout: "{{ traefik_ssl_cert_wait_timeout | default(30) }}"
# Collect file metadata for acme.json into `acme_file`. Only stat data is
# gathered (the file contents are not read), and the result is not referenced
# by the tasks visible in this file.
- name: Check if acme.json contains certificates
  register: acme_file
  ansible.builtin.stat:
    path: "{{ traefik_stack_path }}/acme.json"
# Print a human-readable summary (domains, ACME email, log command hint).
# Shown by default; set `traefik_show_status` to false to suppress it.
- name: Display certificate status
  when: traefik_show_status | default(true) | bool
  ansible.builtin.debug:
    msg: |
      ========================================
      SSL Certificate Setup
      ========================================
      {% if traefik_ssl_domains is defined and traefik_ssl_domains | length > 0 %}
      Certificate setup triggered for domains:
      {{ traefik_ssl_domains | join(', ') }}
      {% else %}
      No domains specified for certificate setup.
      {% endif %}
      ACME Email: {{ traefik_acme_email | default('Not specified') }}
      Check Traefik logs to see certificate generation progress:
      docker compose -f {{ traefik_stack_path }}/docker-compose.yml logs {{ traefik_container_name }} | grep -i acme
      Certificates should be ready within 1-2 minutes.
      ========================================