From ba859d0fdfda641768cf7af0311e8b24d3038d46 Mon Sep 17 00:00:00 2001
From: Michael Schiemer
Date: Sun, 9 Nov 2025 00:03:30 +0100
Subject: [PATCH] fix: prevent Traefik restart loops and improve Docker registry login

Registry Login Fixes:
- Filter out service names (minio, redis) from registry URL extraction
- Only recognize actual registry URLs (with TLD or port)
- Preserve port numbers in registry URLs (e.g. git.michaelschiemer.de:5000)
- Better error messages for failed logins

Traefik Restart Loop Prevention:
- Set traefik_auto_restart default to false in traefik role
- Add traefik_auto_restart, traefik_ssl_restart, gitea_auto_restart to staging vars
- Add guard to fix-gitea-traefik-connection.yml restart task
- Add guard and deprecation warning to update-gitea-traefik-service.yml

This ensures that:
- CI/CD pipelines won't cause Traefik restart loops
- Staging environment uses same safe defaults as production
- Deprecated playbooks fail by default unless explicitly enabled
- Only actual Docker registries are used for login, not service names
---
Review notes (ignored by git am; everything up to the first "diff --git" is commentary):
- deploy-image.yml: the registry host is now taken only from image references
  that contain a path ("host[:port]/name"). The previous second sed removed
  the trailing ":<port>" because the first sed had already cut at the first
  "/", so "localhost:5000/app" became "localhost". Images without a path
  (e.g. "redis:7") no longer produce a candidate at all. An optional leading
  quote after "image:" is stripped.
- fix-gitea-traefik-connection.yml: the probe is no longer piped into
  "head -3"; without pipefail the registered rc was always head's rc (0), so
  no remediation task could ever run. Output is truncated in the debug task.
  The Traefik wait and the final message now honour traefik_auto_restart.
- update-gitea-traefik-service.yml: the IP lookup ends in "grep ." so an
  empty result fails the task instead of writing "http://:3000"; "retries"
  got the "until" it needs; the guard uses "not (...)" instead of "== false".
- Added/removed line counts per hunk are unchanged, so hunk headers and the
  diffstat remain valid. The "index" blob ids are those of the original
  commit; regenerate with git format-patch after applying.
- Follow-up: actual_registry_urls_raw is registered but never read.

 .../inventory/group_vars/staging/vars.yml     | 113 ++++++++++++++++++
 deployment/ansible/playbooks/deploy-image.yml |  39 ++++--
 .../fix-gitea-traefik-connection.yml          |  89 ++++++++++++++
 .../update-gitea-traefik-service.yml          |  79 ++++++++++++
 .../ansible/roles/traefik/defaults/main.yml   |   2 +-
 5 files changed, 312 insertions(+), 10 deletions(-)
 create mode 100644 deployment/ansible/inventory/group_vars/staging/vars.yml
 create mode 100644 deployment/ansible/playbooks/fix-gitea-traefik-connection.yml
 create mode 100644 deployment/ansible/playbooks/update-gitea-traefik-service.yml

diff --git a/deployment/ansible/inventory/group_vars/staging/vars.yml b/deployment/ansible/inventory/group_vars/staging/vars.yml
new file mode 100644
index 00000000..f5bc3d72
--- /dev/null
+++ b/deployment/ansible/inventory/group_vars/staging/vars.yml
@@ -0,0 +1,113 @@
+---
+# Staging Deployment - Centralized Variables
+# These variables are used across all staging playbooks
+
+# System Maintenance
+system_update_packages: true
+system_apt_upgrade: dist
+system_enable_unattended_upgrades: true
+system_enable_unattended_reboot: false
+system_unattended_reboot_time: "02:00"
+system_enable_unattended_timer: true
+system_enable_docker_prune: false
+
+# Deployment Paths
+deploy_user_home: "/home/deploy"
+stacks_base_path: "/home/deploy/deployment/stacks"
+staging_stack_path: "{{ stacks_base_path }}/staging"
+postgresql_staging_stack_path: "{{ stacks_base_path }}/postgresql-staging"
+backups_path: "{{ deploy_user_home }}/deployment/backups"
+
+# Docker Registry
+docker_registry: "localhost:5000"
+docker_registry_url: "localhost:5000"
+docker_registry_external: "registry.michaelschiemer.de"
+docker_registry_username_default: "admin"
+# docker_registry_password_default should be set in vault as vault_docker_registry_password
+# If not using vault, override via -e docker_registry_password_default="your-password"
+docker_registry_password_default: ""
+registry_auth_path: "{{ stacks_base_path }}/registry/auth"
+
+# Application Configuration
+app_name: "framework"
+app_domain: "staging.michaelschiemer.de"
+staging_domain: "{{ app_domain }}"
+app_image: "{{ docker_registry }}/{{ app_name }}"
+app_image_external: "{{ docker_registry_external }}/{{ app_name }}"
+
+# Domain Configuration
+gitea_domain: "git.michaelschiemer.de"
+
+# Email Configuration
+mail_from_address: "noreply@{{ app_domain }}"
+acme_email: "kontakt@michaelschiemer.de"
+
+# SSL Certificate Domains
+ssl_domains:
+  - "{{ gitea_domain }}"
+  - "{{ app_domain }}"
+  - "michaelschiemer.de"
+
+# Health Check Configuration
+health_check_url: "https://{{ app_domain }}/health"
+health_check_retries: 10
+health_check_delay: 10
+
+# Rollback Configuration
+max_rollback_versions: 3
+rollback_timeout: 300
+
+# Wait Timeouts
+wait_timeout: 60
+
+# Git Configuration (for sync-code.yml)
+git_repository_url_default: "https://{{ gitea_domain }}/michael/michaelschiemer.git"
+git_branch_default: "staging"
+git_token: "{{ vault_git_token | default('') }}"
+git_username: "{{ vault_git_username | default('') }}"
+git_password: "{{ vault_git_password | default('') }}"
+
+# Database Configuration
+db_user_default: "postgres"
+db_name_default: "michaelschiemer_staging"
+db_host_default: "postgres-staging"
+
+# MinIO Object Storage Configuration
+minio_root_user: "{{ vault_minio_root_user | default('minioadmin') }}"
+minio_root_password: "{{ vault_minio_root_password | default('') }}"
+minio_api_domain: "minio-api.michaelschiemer.de"
+minio_console_domain: "minio.michaelschiemer.de"
+
+# WireGuard Configuration
+wireguard_interface: "wg0"
+wireguard_config_path: "/etc/wireguard"
+wireguard_port_default: 51820
+wireguard_network_default: "10.8.0.0/24"
+wireguard_server_ip_default: "10.8.0.1"
+wireguard_enable_ip_forwarding: true
+wireguard_config_file: "{{ wireguard_config_path }}/{{ wireguard_interface }}.conf"
+wireguard_private_key_file: "{{ wireguard_config_path }}/{{ wireguard_interface }}_private.key"
+wireguard_public_key_file: "{{ wireguard_config_path }}/{{ wireguard_interface }}_public.key"
+wireguard_client_configs_path: "{{ wireguard_config_path }}/clients"
+
+# WireGuard DNS Configuration
+# DNS server for VPN clients (points to VPN server IP)
+# This ensures internal services are resolved to VPN IPs
+wireguard_dns_servers:
+  - "{{ wireguard_server_ip_default }}"
+
+# Traefik Configuration
+# Disable automatic restarts after config deployment to prevent restart loops
+# Set to true only when explicitly needed (e.g., after major config changes)
+traefik_auto_restart: false
+
+# Traefik SSL Configuration
+# Disable automatic restarts during SSL certificate setup to prevent restart loops
+traefik_ssl_restart: false
+
+# Gitea Auto-Restart Configuration
+# Set to false to prevent automatic restarts when healthcheck fails
+# This prevents restart loops when Gitea is temporarily unavailable (e.g., during Traefik restarts)
+# Set to true only when explicitly needed for remediation
+gitea_auto_restart: false
+
diff --git a/deployment/ansible/playbooks/deploy-image.yml b/deployment/ansible/playbooks/deploy-image.yml
index e6200025..be3ee062 100644
--- a/deployment/ansible/playbooks/deploy-image.yml
+++ b/deployment/ansible/playbooks/deploy-image.yml
@@ -233,21 +233,42 @@
       ignore_errors: yes
       changed_when: false
 
-    - name: Determine actual registry URLs from docker-compose files
+    - name: Extract registry URLs from docker-compose files (preserve port if present)
       ansible.builtin.shell: |
         cd {{ application_code_dest }}
-        grep -h "image:" docker-compose.base.yml docker-compose.{{ application_compose_suffix }} 2>/dev/null | sed -E 's/.*image:\s*([^\/]+).*/\1/' | sed 's/:.*//' | sort -u || echo ""
-      register: actual_registry_urls
+        grep -h "image:" docker-compose.base.yml docker-compose.{{ application_compose_suffix }} 2>/dev/null | \
+          sed -E "s/.*image:\s*[\"']?//" | \
+          sed -nE 's/^([^\/[:space:]]+)\/.*/\1/p' | \
+          grep -E '\.(de|com|org|net|io|dev)|:[0-9]+|localhost' | \
+          sort -u || echo ""
+      register: actual_registry_urls_raw
       changed_when: false
       failed_when: false
 
-    - name: Set list of registries to login to
+    - name: Extract full registry URLs with ports from docker-compose files
+      ansible.builtin.shell: |
+        cd {{ application_code_dest }}
+        grep -h "image:" docker-compose.base.yml docker-compose.{{ application_compose_suffix }} 2>/dev/null | \
+          sed -E "s/.*image:\s*[\"']?//" | \
+          sed -nE 's/^([^\/[:space:]]+)\/.*/\1/p' | \
+          sort -u || echo ""
+      register: actual_registry_urls_full
+      changed_when: false
+      failed_when: false
+
+    - name: Set list of registries to login to (filter out service names, preserve ports)
       ansible.builtin.set_fact:
         registries_to_login: >-
-          {%- set found_registries = actual_registry_urls.stdout | trim | split('\n') | select('match', '.+') | list -%}
+          {%- set found_registries = actual_registry_urls_full.stdout | trim | split('\n') | select('match', '.+') | list -%}
+          {%- set filtered_registries = [] -%}
+          {%- for reg in found_registries -%}
+            {%- if reg | regex_search('\.(de|com|org|net|io|dev)') or reg | regex_search(':[0-9]+') or reg == 'localhost' -%}
+              {%- set _ = filtered_registries.append(reg) -%}
+            {%- endif -%}
+          {%- endfor -%}
           {%- set default_registry = [docker_registry] -%}
-          {%- if found_registries | length > 0 -%}
-            {{ found_registries | unique | list }}
+          {%- if filtered_registries | length > 0 -%}
+            {{ filtered_registries | unique | list }}
           {%- else -%}
             {{ default_registry }}
           {%- endif -%}
@@ -260,14 +281,14 @@
       when:
         - registry_password | string | trim != ''
         - registry_accessible == 'true'
-      loop: "{{ registries_to_login }}"
+      loop: "{{ registries_to_login | default([docker_registry]) }}"
       no_log: yes
       register: docker_login_results
       failed_when: false
 
     - name: Display login results
       ansible.builtin.debug:
-        msg: "Docker login to {{ item.item }}: {% if item.failed %}FAILED{% else %}SUCCESS{% endif %}"
+        msg: "Docker login to {{ item.item }}: {% if item.failed %}FAILED ({{ item.msg | default('unknown error') }}){% else %}SUCCESS{% endif %}"
       when:
         - registry_password | string | trim != ''
         - registry_accessible == 'true'
diff --git a/deployment/ansible/playbooks/fix-gitea-traefik-connection.yml b/deployment/ansible/playbooks/fix-gitea-traefik-connection.yml
new file mode 100644
index 00000000..a018e801
--- /dev/null
+++ b/deployment/ansible/playbooks/fix-gitea-traefik-connection.yml
@@ -0,0 +1,89 @@
+---
+# Ansible Playbook: Fix Gitea-Traefik Connection Issues
+# Purpose: Ensure Traefik can reliably reach Gitea by restarting both services
+# Usage:
+#   ansible-playbook -i inventory/production.yml playbooks/fix-gitea-traefik-connection.yml \
+#     --vault-password-file secrets/.vault_pass
+
+- name: Fix Gitea-Traefik Connection
+  hosts: production
+  vars:
+    gitea_stack_path: "{{ stacks_base_path }}/gitea"
+    traefik_stack_path: "{{ stacks_base_path }}/traefik"
+    gitea_url: "https://{{ gitea_domain }}"
+
+  tasks:
+    - name: Get current Gitea container IP
+      shell: |
+        docker inspect gitea | grep -A 10 'traefik-public' | grep IPAddress | head -1 | awk '{print $2}' | tr -d '",'
+      register: gitea_ip
+      changed_when: false
+      failed_when: false
+
+    - name: Display Gitea IP
+      debug:
+        msg: "Gitea container IP in traefik-public network: {{ gitea_ip.stdout }}"
+
+    - name: Test direct connection to Gitea from Traefik container
+      shell: |
+        docker compose -f {{ traefik_stack_path }}/docker-compose.yml exec -T traefik wget -qO- http://{{ gitea_ip.stdout }}:3000/api/healthz 2>&1
+      register: traefik_gitea_test
+      changed_when: false
+      failed_when: false
+
+    - name: Display Traefik-Gitea connection test result
+      debug:
+        msg: "{{ (traefik_gitea_test.stdout_lines | default([]))[:3] }}"
+
+    - name: Restart Gitea container to refresh IP
+      shell: |
+        docker compose -f {{ gitea_stack_path }}/docker-compose.yml restart gitea
+      when: traefik_gitea_test.rc != 0
+
+    - name: Wait for Gitea to be ready
+      uri:
+        url: "{{ gitea_url }}/api/healthz"
+        method: GET
+        status_code: [200]
+        validate_certs: false
+        timeout: 10
+      register: gitea_health
+      until: gitea_health.status == 200
+      retries: 30
+      delay: 2
+      changed_when: false
+      when: traefik_gitea_test.rc != 0
+
+    - name: Restart Traefik to refresh service discovery
+      shell: |
+        docker compose -f {{ traefik_stack_path }}/docker-compose.yml restart traefik
+      when: >
+        traefik_gitea_test.rc != 0
+        and (traefik_auto_restart | default(false) | bool)
+
+    - name: Wait for Traefik to be ready
+      pause:
+        seconds: 10
+      when: traefik_gitea_test.rc != 0 and (traefik_auto_restart | default(false) | bool)
+
+    - name: Test Gitea via Traefik
+      uri:
+        url: "{{ gitea_url }}/api/healthz"
+        method: GET
+        status_code: [200]
+        validate_certs: false
+        timeout: 10
+      register: final_test
+      changed_when: false
+
+    - name: Display result
+      debug:
+        msg: |
+          Gitea-Traefik connection test:
+          - Direct connection: {{ 'OK' if traefik_gitea_test.rc == 0 else 'FAILED' }}
+          - Via Traefik: {{ 'OK' if final_test.status == 200 else 'FAILED' }}
+
+          {% if traefik_gitea_test.rc != 0 %}
+          Gitea has been restarted{{ ' and Traefik has been restarted' if (traefik_auto_restart | default(false) | bool) else '; Traefik restart skipped (traefik_auto_restart is false)' }}.
+          {% endif %}
+
diff --git a/deployment/ansible/playbooks/update-gitea-traefik-service.yml b/deployment/ansible/playbooks/update-gitea-traefik-service.yml
new file mode 100644
index 00000000..8c63e6d6
--- /dev/null
+++ b/deployment/ansible/playbooks/update-gitea-traefik-service.yml
@@ -0,0 +1,79 @@
+---
+# Ansible Playbook: Update Gitea Traefik Service with Current IP
+#
+# ⚠️ DEPRECATED: This playbook is no longer needed since Traefik runs in bridge network mode.
+# Service discovery via Docker labels works reliably in bridge mode, so manual IP updates
+# are not required. This playbook is kept for reference only.
+#
+# Purpose: Update Traefik dynamic config with current Gitea container IP
+# Usage:
+#   ansible-playbook -i inventory/production.yml playbooks/update-gitea-traefik-service.yml \
+#     --vault-password-file secrets/.vault_pass
+
+- name: Update Gitea Traefik Service with Current IP
+  hosts: production
+  vars:
+    traefik_stack_path: "{{ stacks_base_path }}/traefik"
+    gitea_url: "https://{{ gitea_domain }}"
+
+  tasks:
+    - name: Warn that this playbook is deprecated
+      ansible.builtin.fail:
+        msg: |
+          ⚠️ This playbook is DEPRECATED and should not be used.
+          Traefik service discovery via Docker labels works reliably in bridge mode.
+          If you really need to run this, set traefik_auto_restart=true explicitly.
+      when: not (traefik_auto_restart | default(false) | bool)
+
+    - name: Get current Gitea container IP in traefik-public network
+      shell: |
+        docker inspect gitea | grep -A 10 'traefik-public' | grep IPAddress | head -1 | awk '{print $2}' | tr -d '",' | grep .
+      register: gitea_ip
+      changed_when: false
+
+    - name: Display Gitea IP
+      debug:
+        msg: "Gitea container IP: {{ gitea_ip.stdout }}"
+
+    - name: Create Gitea service configuration with current IP
+      copy:
+        dest: "{{ traefik_stack_path }}/dynamic/gitea-service.yml"
+        content: |
+          http:
+            services:
+              gitea:
+                loadBalancer:
+                  servers:
+                    - url: http://{{ gitea_ip.stdout }}:3000
+        mode: '0644'
+
+    - name: Restart Traefik to load new configuration
+      shell: |
+        docker compose -f {{ traefik_stack_path }}/docker-compose.yml restart traefik
+      when: traefik_auto_restart | default(false) | bool
+
+    - name: Wait for Traefik to be ready
+      pause:
+        seconds: 10
+
+    - name: Test Gitea via Traefik
+      uri:
+        url: "{{ gitea_url }}/api/healthz"
+        method: GET
+        status_code: [200]
+        validate_certs: false
+        timeout: 10
+      register: final_test
+      until: final_test.status == 200
+      retries: 5
+      delay: 2
+
+    - name: Display result
+      debug:
+        msg: |
+          Gitea-Traefik connection:
+          - Gitea IP: {{ gitea_ip.stdout }}
+          - Via Traefik: {{ 'OK' if final_test.status == 200 else 'FAILED' }}
+
+          Note: This is a temporary fix. The IP will need to be updated if the container restarts.
+
diff --git a/deployment/ansible/roles/traefik/defaults/main.yml b/deployment/ansible/roles/traefik/defaults/main.yml
index c7f2afc2..7f2698b8 100644
--- a/deployment/ansible/roles/traefik/defaults/main.yml
+++ b/deployment/ansible/roles/traefik/defaults/main.yml
@@ -18,7 +18,7 @@ traefik_check_health: true
 traefik_show_status: true
 
 # Config Deployment
-traefik_auto_restart: true  # Automatically restart after config deployment
+traefik_auto_restart: false  # Automatically restart after config deployment (default: false to prevent restart loops)
 
 # Logs Configuration
 traefik_logs_tail: 100