Files
michaelschiemer/deployment/ansible/playbooks/diagnose-gitea-timeout-live.yml
Michael Schiemer 36ef2a1e2c
Some checks failed
🚀 Build & Deploy Image / Determine Build Necessity (push) Failing after 10m14s
🚀 Build & Deploy Image / Build Runtime Base Image (push) Has been skipped
🚀 Build & Deploy Image / Build Docker Image (push) Has been skipped
🚀 Build & Deploy Image / Run Tests & Quality Checks (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Staging (push) Has been skipped
🚀 Build & Deploy Image / Auto-deploy to Production (push) Has been skipped
Security Vulnerability Scan / Check for Dependency Changes (push) Failing after 11m25s
Security Vulnerability Scan / Composer Security Audit (push) Has been cancelled
fix: Gitea Traefik routing and connection pool optimization
- Remove middleware reference from Gitea Traefik labels (caused routing issues)
- Optimize Gitea connection pool settings (MAX_IDLE_CONNS=30, authentication_timeout=180s)
- Add explicit service reference in Traefik labels
- Fix intermittent 504 timeouts by improving PostgreSQL connection handling

Fixes Gitea unreachability via git.michaelschiemer.de
2025-11-09 14:46:15 +01:00

344 lines
16 KiB
YAML

---
# Diagnose Gitea timeouts live, while a request is in flight.
# Runs all checks during an actual request.
- name: Diagnose Gitea Timeout During Request
  hosts: production
  # Facts must be gathered: test_timestamp below reads ansible_date_time.
  gather_facts: true
  become: false
  vars:
    # stacks_base_path and gitea_domain are not defined in this play;
    # presumably they come from inventory/group vars - confirm.
    gitea_stack_path: "{{ stacks_base_path }}/gitea"
    traefik_stack_path: "{{ stacks_base_path }}/traefik"
    gitea_url: "https://{{ gitea_domain }}"
    # Seconds the background collectors are allowed to run (passed to `timeout`).
    test_duration_seconds: 60
    # Epoch of the fact-gathering run; used as suffix for all /tmp capture files.
    test_timestamp: "{{ ansible_date_time.epoch }}"
  tasks:
# Prints a banner describing the five checks this play performs, together with
# the configured test duration and the timestamp used to name the capture files.
# The message text is emitted at runtime and is therefore left untouched.
- name: Display diagnostic plan
ansible.builtin.debug:
msg: |
================================================================================
GITEA TIMEOUT DIAGNOSE - LIVE WÄHREND REQUEST
================================================================================
Diese Diagnose führt alle Checks während eines tatsächlichen Requests durch:
1. Docker Stats (CPU/RAM/IO) während Request
2. Gitea Logs (DB-Timeouts, Panics, "context deadline exceeded")
3. Postgres Logs (Connection issues)
4. Traefik Logs ("backend connection error", "EOF")
5. Direkter Test Traefik → Gitea
Test-Dauer: {{ test_duration_seconds }} Sekunden
Timestamp: {{ test_timestamp }}
================================================================================
# One-shot `docker stats` snapshot of the four containers, taken before the
# request is fired. If docker fails, a placeholder line is printed instead so
# the task itself still succeeds.
- name: Get initial container stats (baseline)
  register: initial_stats
  changed_when: false
  ansible.builtin.shell: |
    if ! docker stats --no-stream --format "table {{ '{{' }}.Name{{ '}}' }}\t{{ '{{' }}.CPUPerc{{ '}}' }}\t{{ '{{' }}.MemUsage{{ '}}' }}\t{{ '{{' }}.NetIO{{ '}}' }}\t{{ '{{' }}.BlockIO{{ '}}' }}" gitea gitea-postgres gitea-redis traefik 2>/dev/null; then
      echo "Stats collection failed"
    fi
# Streams `docker stats` as CSV (name,cpu,mem,netio,blockio) for at most
# test_duration_seconds, prefixes every row with a millisecond timestamp and
# writes it to /tmp/gitea_stats_<timestamp>.log. The pipeline is backgrounded
# and the PID of its last stage is printed so it can be registered.
#
# Rows are read with `IFS= read -r` and written with printf so that
# backslashes and leading whitespace are copied verbatim; plain `read` and
# `echo` may interpret backslash sequences depending on the shell.
- name: Start collecting Docker stats in background
  ansible.builtin.shell: |
    timeout {{ test_duration_seconds }} docker stats --format "{{ '{{' }}.Name{{ '}}' }},{{ '{{' }}.CPUPerc{{ '}}' }},{{ '{{' }}.MemUsage{{ '}}' }},{{ '{{' }}.NetIO{{ '}}' }},{{ '{{' }}.BlockIO{{ '}}' }}" gitea gitea-postgres gitea-redis traefik 2>/dev/null | while IFS= read -r line; do
      printf '%s\n' "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] $line"
    done > /tmp/gitea_stats_{{ test_timestamp }}.log 2>&1 &
    echo $!
  register: stats_pid
  changed_when: false
# Follows the `gitea` service logs from the Gitea stack directory for at most
# test_duration_seconds, timestamps every line and stores the result in
# /tmp/gitea_logs_<timestamp>.log. Runs in the background; the PID of the
# pipeline's last stage is printed for registration.
#
# `IFS= read -r` plus printf keep log lines byte-exact (no backslash
# interpretation, no whitespace trimming). The stack path is quoted so a path
# containing spaces does not split.
- name: Start collecting Gitea logs in background
  ansible.builtin.shell: |
    cd "{{ gitea_stack_path }}"
    timeout {{ test_duration_seconds }} docker compose logs -f gitea 2>&1 | while IFS= read -r line; do
      printf '%s\n' "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] $line"
    done > /tmp/gitea_logs_{{ test_timestamp }}.log 2>&1 &
    echo $!
  register: gitea_logs_pid
  changed_when: false
# Follows the `gitea-postgres` service logs from the Gitea stack directory for
# at most test_duration_seconds, timestamps every line and stores the result
# in /tmp/postgres_logs_<timestamp>.log. Runs in the background; the PID of
# the pipeline's last stage is printed for registration.
#
# `IFS= read -r` plus printf keep log lines byte-exact (no backslash
# interpretation, no whitespace trimming). The stack path is quoted so a path
# containing spaces does not split.
- name: Start collecting Postgres logs in background
  ansible.builtin.shell: |
    cd "{{ gitea_stack_path }}"
    timeout {{ test_duration_seconds }} docker compose logs -f gitea-postgres 2>&1 | while IFS= read -r line; do
      printf '%s\n' "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] $line"
    done > /tmp/postgres_logs_{{ test_timestamp }}.log 2>&1 &
    echo $!
  register: postgres_logs_pid
  changed_when: false
# Follows the `traefik` service logs from the Traefik stack directory for at
# most test_duration_seconds, timestamps every line and stores the result in
# /tmp/traefik_logs_<timestamp>.log. Runs in the background; the PID of the
# pipeline's last stage is printed for registration.
#
# `IFS= read -r` plus printf keep log lines byte-exact; this matters for
# structured log lines that contain backslash escapes. The stack path is
# quoted so a path containing spaces does not split.
- name: Start collecting Traefik logs in background
  ansible.builtin.shell: |
    cd "{{ traefik_stack_path }}"
    timeout {{ test_duration_seconds }} docker compose logs -f traefik 2>&1 | while IFS= read -r line; do
      printf '%s\n' "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] $line"
    done > /tmp/traefik_logs_{{ test_timestamp }}.log 2>&1 &
    echo $!
  register: traefik_logs_pid
  changed_when: false
# Short fixed delay between launching the background collectors and firing
# the test request.
- name: Wait a moment for log collection to start
  ansible.builtin.pause:
    seconds: 2
# Calls <gitea_url>/api/healthz through the public URL with curl, capped at 35
# seconds by `timeout`. The response body goes to
# /tmp/gitea_response_<timestamp>.log; curl's verbose trace and the -w timing
# summary (HTTP_CODE / TIME_*) are shown and copied to
# /tmp/gitea_curl_<timestamp>.log. The task never fails, whatever curl returns.
- name: Trigger Gitea request via Traefik (with timeout)
  register: gitea_request
  changed_when: false
  failed_when: false
  ansible.builtin.shell: |
    body_file="/tmp/gitea_response_{{ test_timestamp }}.log"
    trace_file="/tmp/gitea_curl_{{ test_timestamp }}.log"
    echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Starting request to {{ gitea_url }}/api/healthz"
    timeout 35 curl -k -v -s -o "$body_file" -w "\nHTTP_CODE:%{http_code}\nTIME_TOTAL:%{time_total}\nTIME_CONNECT:%{time_connect}\nTIME_STARTTRANSFER:%{time_starttransfer}\n" "{{ gitea_url }}/api/healthz" 2>&1 | tee "$trace_file"
    echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Request completed"
# Calls Gitea's health endpoint from inside the traefik container
# (http://gitea:3000/api/healthz), bypassing the public router, and stores the
# outcome in /tmp/traefik_gitea_direct_<timestamp>.log.
#
# The exit status of the docker/wget command is tested directly. Chaining
# `cmd | tee file || echo DIRECT_TEST_FAILED` would test tee's status instead,
# so the failure marker would never be written. On failure the marker is
# written first, followed by whatever the command printed.
#
# NOTE(review): despite "(parallel)" in the name, this task runs after the
# curl task has finished.
- name: Test direct connection Traefik → Gitea (parallel)
  ansible.builtin.shell: |
    result_file="/tmp/traefik_gitea_direct_{{ test_timestamp }}.log"
    echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Starting direct test Traefik → Gitea"
    cd "{{ traefik_stack_path }}"
    if response=$(timeout 35 docker compose exec -T traefik wget -qO- --timeout=30 http://gitea:3000/api/healthz 2>&1); then
      printf '%s\n' "$response" | tee "$result_file"
    else
      printf '%s\n%s\n' "DIRECT_TEST_FAILED" "$response" | tee "$result_file"
    fi
    echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Direct test completed"
  register: traefik_direct_test
  changed_when: false
  failed_when: false
# Waits for the remainder of the collection window (test duration minus 5 s).
# The value is cast to int because a duration supplied as `-e key=value`
# arrives as a string, and it is clamped at 0 so a duration below 5 does not
# produce a negative pause.
- name: Wait for log collection to complete
  ansible.builtin.pause:
    seconds: "{{ [test_duration_seconds | int - 5, 0] | max }}"
# Stops any collector that is still running.
#
# 1. Signals the PIDs printed by the four "Start collecting ..." tasks.
# 2. Falls back to pattern-based pkill for the docker commands themselves.
#
# `pkill -f` matches against full command lines, and the shell that executes
# this script carries the script text in its own command line. The patterns
# are therefore written as "[d]ocker ..." so they cannot match this shell and
# terminate it before the remaining commands run. Keep the literal word out of
# the script body for the same reason.
- name: Stop background processes
  ansible.builtin.shell: |
    for pid in {{ stats_pid.stdout | default('') }} {{ gitea_logs_pid.stdout | default('') }} {{ postgres_logs_pid.stdout | default('') }} {{ traefik_logs_pid.stdout | default('') }}; do
      kill "$pid" 2>/dev/null || true
    done
    pkill -f "[d]ocker.*stats.*gitea" || true
    pkill -f "[d]ocker compose logs.*gitea" || true
    pkill -f "[d]ocker compose logs.*postgres" || true
    pkill -f "[d]ocker compose logs.*traefik" || true
    sleep 2
  changed_when: false
  failed_when: false
# Reads the Docker stats capture file from the target host into
# stats_results; a missing file is tolerated.
- name: Collect stats results
  register: stats_results
  failed_when: false
  changed_when: false
  ansible.builtin.slurp:
    src: "/tmp/gitea_stats_{{ test_timestamp }}.log"
# Reads the Gitea log capture file from the target host into
# gitea_logs_results; a missing file is tolerated.
- name: Collect Gitea logs results
  register: gitea_logs_results
  failed_when: false
  changed_when: false
  ansible.builtin.slurp:
    src: "/tmp/gitea_logs_{{ test_timestamp }}.log"
# Reads the Postgres log capture file from the target host into
# postgres_logs_results; a missing file is tolerated.
- name: Collect Postgres logs results
  register: postgres_logs_results
  failed_when: false
  changed_when: false
  ansible.builtin.slurp:
    src: "/tmp/postgres_logs_{{ test_timestamp }}.log"
# Reads the Traefik log capture file from the target host into
# traefik_logs_results; a missing file is tolerated.
- name: Collect Traefik logs results
  register: traefik_logs_results
  failed_when: false
  changed_when: false
  ansible.builtin.slurp:
    src: "/tmp/traefik_logs_{{ test_timestamp }}.log"
# Reads the curl trace/timing file written by the request task into
# request_result; the diagnosis output later base64-decodes its content.
# A missing file is tolerated.
- name: Get request result
  register: request_result
  failed_when: false
  changed_when: false
  ansible.builtin.slurp:
    src: "/tmp/gitea_curl_{{ test_timestamp }}.log"
# Reads the direct Traefik → Gitea test output into direct_test_result; the
# diagnosis output later base64-decodes its content. A missing file is
# tolerated.
- name: Get direct test result
  register: direct_test_result
  failed_when: false
  changed_when: false
  ansible.builtin.slurp:
    src: "/tmp/traefik_gitea_direct_{{ test_timestamp }}.log"
# Summarises the Docker stats capture (rows look like
# "[timestamp] name,cpu,mem,netio,blockio"):
#   - number of samples
#   - up to 10 rows whose CPU column exceeds 80 %
#   - last 5 memory readings  (timestamp+name and MemUsage column)
#   - last 5 network readings (timestamp+name and NetIO column)
#
# Each section's matches are captured first and the fallback text is printed
# only when the capture is empty. Appending `|| echo ...` to a pipeline tests
# the last stage (head/tail), which succeeds even with no input, so that form
# never prints the fallback. Memory and network sections select their own
# column instead of both printing the same full rows.
- name: Analyze stats for high CPU/Memory/IO
  ansible.builtin.shell: |
    stats_file="/tmp/gitea_stats_{{ test_timestamp }}.log"
    print_or() {
      if [ -n "$1" ]; then printf '%s\n' "$1"; else echo "$2"; fi
    }
    if [ -f "$stats_file" ]; then
      echo "=== STATS SUMMARY ==="
      echo "Total samples: $(wc -l < "$stats_file")"
      echo ""
      echo "=== HIGH CPU (>80%) ==="
      print_or "$(grep -E "gitea|gitea-postgres" "$stats_file" | awk -F',' '{cpu=$2; gsub(/%/, "", cpu); if (cpu+0 > 80) print $0}' | head -10)" "No high CPU usage found"
      echo ""
      echo "=== MEMORY USAGE ==="
      print_or "$(grep -E "gitea" "$stats_file" | awk -F',' '{print $1 "," $3}' | tail -5)" "No memory stats"
      echo ""
      echo "=== NETWORK IO ==="
      print_or "$(grep -E "gitea" "$stats_file" | awk -F',' '{print $1 "," $4}' | tail -5)" "No network activity"
    else
      echo "Stats file not found"
    fi
  register: stats_analysis
  changed_when: false
  failed_when: false
# Greps the Gitea log capture for three classes of problems (DB timeouts and
# connection errors, panics/fatal errors, slow operations) and appends the
# last 10 captured lines.
#
# Matches are captured first and the "No ... found" text is printed only when
# the capture is empty. `grep ... | tail || echo ...` tests tail's exit status,
# which is 0 even without input, so that form never prints the fallback.
- name: Analyze Gitea logs for errors
  ansible.builtin.shell: |
    log_file="/tmp/gitea_logs_{{ test_timestamp }}.log"
    print_or() {
      if [ -n "$1" ]; then printf '%s\n' "$1"; else echo "$2"; fi
    }
    if [ -f "$log_file" ]; then
      echo "=== DB-TIMEOUTS / CONNECTION ERRORS ==="
      print_or "$(grep -iE "timeout|deadline exceeded|connection.*failed|database.*error|postgres.*error|context.*deadline" "$log_file" | tail -20)" "No DB-timeouts found"
      echo ""
      echo "=== PANICS / FATAL ERRORS ==="
      print_or "$(grep -iE "panic|fatal|error.*fatal" "$log_file" | tail -10)" "No panics found"
      echo ""
      echo "=== SLOW QUERIES / PERFORMANCE ==="
      print_or "$(grep -iE "slow|performance|took.*ms|duration" "$log_file" | tail -10)" "No slow queries found"
      echo ""
      echo "=== RECENT LOG ENTRIES (last 10) ==="
      print_or "$(tail -10 "$log_file")" "No recent logs"
    else
      echo "Gitea logs file not found"
    fi
  register: gitea_logs_analysis
  changed_when: false
  failed_when: false
# Greps the Postgres log capture for errors (timeouts, deadlocks, refused or
# exhausted connections) and slow statements, then appends the last 10
# captured lines.
#
# Matches are captured first and the "No ... found" text is printed only when
# the capture is empty. `grep ... | tail || echo ...` tests tail's exit status,
# which is 0 even without input, so that form never prints the fallback.
- name: Analyze Postgres logs for errors
  ansible.builtin.shell: |
    log_file="/tmp/postgres_logs_{{ test_timestamp }}.log"
    print_or() {
      if [ -n "$1" ]; then printf '%s\n' "$1"; else echo "$2"; fi
    }
    if [ -f "$log_file" ]; then
      echo "=== POSTGRES ERRORS ==="
      print_or "$(grep -iE "error|timeout|deadlock|connection.*refused|too many connections" "$log_file" | tail -20)" "No Postgres errors found"
      echo ""
      echo "=== SLOW QUERIES ==="
      print_or "$(grep -iE "slow|duration|statement.*took" "$log_file" | tail -10)" "No slow queries found"
      echo ""
      echo "=== RECENT LOG ENTRIES (last 10) ==="
      print_or "$(tail -10 "$log_file")" "No recent logs"
    else
      echo "Postgres logs file not found"
    fi
  register: postgres_logs_analysis
  changed_when: false
  failed_when: false
# Greps the Traefik log capture for backend/connection errors and for
# timeout/504 entries, then appends the last 10 captured lines.
#
# Matches are captured first and the "No ... found" text is printed only when
# the capture is empty. `grep ... | tail || echo ...` tests tail's exit status,
# which is 0 even without input, so that form never prints the fallback.
- name: Analyze Traefik logs for backend errors
  ansible.builtin.shell: |
    log_file="/tmp/traefik_logs_{{ test_timestamp }}.log"
    print_or() {
      if [ -n "$1" ]; then printf '%s\n' "$1"; else echo "$2"; fi
    }
    if [ -f "$log_file" ]; then
      echo "=== BACKEND CONNECTION ERRORS ==="
      print_or "$(grep -iE "backend.*error|connection.*error|EOF|gitea.*error|git\.michaelschiemer\.de.*error" "$log_file" | tail -20)" "No backend errors found"
      echo ""
      echo "=== TIMEOUT ERRORS ==="
      print_or "$(grep -iE "timeout|504|gateway.*timeout" "$log_file" | tail -10)" "No timeout errors found"
      echo ""
      echo "=== RECENT LOG ENTRIES (last 10) ==="
      print_or "$(tail -10 "$log_file")" "No recent logs"
    else
      echo "Traefik logs file not found"
    fi
  register: traefik_logs_analysis
  changed_when: false
  failed_when: false
# Prints the collected results (baseline stats, request trace, direct test,
# the four analysis outputs) followed by an automatic interpretation.
#
# Interpretation rules:
#   - The request counts as timed out when the curl trace has no HTTP_CODE
#     line (curl was killed before printing its summary, or the trace is
#     missing) or reports HTTP_CODE:000 / HTTP_CODE:504. A bare '504'
#     substring is not used because it also matches timing digits.
#   - Before keyword checks, the analysis outputs are stripped of their own
#     "=== ... ===" section headers and "No ... found" / "... logs file not
#     found" placeholder lines. Those lines contain the very keywords being
#     searched (BACKEND, TIMEOUTS), so matching the raw output is always true.
#   - 'connection.*error' is evaluated as a regular expression via the
#     `search` test; with the `in` operator it is only a literal substring.
#   - High CPU is reported only when the section between the HIGH CPU header
#     and the MEMORY USAGE header actually contains rows.
- name: Display comprehensive diagnosis
  ansible.builtin.debug:
    msg: |
      ================================================================================
      GITEA TIMEOUT DIAGNOSE - ERGEBNISSE
      ================================================================================
      BASELINE STATS (vor Request):
      {{ initial_stats.stdout }}
      REQUEST ERGEBNIS:
      {% if request_result.content is defined and request_result.content != '' %}
      {{ request_result.content | b64decode }}
      {% else %}
      Request-Ergebnis nicht verfügbar
      {% endif %}
      DIREKTER TEST TRAEFIK → GITEA:
      {% if direct_test_result.content is defined and direct_test_result.content != '' %}
      {{ direct_test_result.content | b64decode }}
      {% else %}
      Direkter Test-Ergebnis nicht verfügbar
      {% endif %}
      ================================================================================
      STATS-ANALYSE (während Request):
      ================================================================================
      {{ stats_analysis.stdout }}
      ================================================================================
      GITEA LOGS-ANALYSE:
      ================================================================================
      {{ gitea_logs_analysis.stdout }}
      ================================================================================
      POSTGRES LOGS-ANALYSE:
      ================================================================================
      {{ postgres_logs_analysis.stdout }}
      ================================================================================
      TRAEFIK LOGS-ANALYSE:
      ================================================================================
      {{ traefik_logs_analysis.stdout }}
      ================================================================================
      INTERPRETATION:
      ================================================================================
      {% set noise_lines = '(?m)^(=== .* ===|No .*|[A-Za-z]+ logs file not found)$' %}
      {% set cpu_header = '=== HIGH CPU (>80%) ===' %}
      {% set request_content = request_result.content | default('') | b64decode %}
      {% set direct_content = direct_test_result.content | default('') | b64decode | trim %}
      {% set traefik_errors = traefik_logs_analysis.stdout | default('') | regex_replace(noise_lines, '') %}
      {% set gitea_errors = gitea_logs_analysis.stdout | default('') | regex_replace(noise_lines, '') | lower %}
      {% set postgres_errors = postgres_logs_analysis.stdout | default('') | regex_replace(noise_lines, '') | lower %}
      {% set stats_content = stats_analysis.stdout | default('') %}
      {% set high_cpu_rows = ((stats_content.split(cpu_header) | last).split('=== MEMORY USAGE ===') | first | trim) if cpu_header in stats_content else '' %}
      {% set request_timed_out = 'HTTP_CODE:' not in request_content or request_content is search('HTTP_CODE:(000|504)') %}
      {% if request_timed_out %}
      ⚠️ REQUEST HAT TIMEOUT/504:
      {% if 'EOF' in traefik_errors or (traefik_errors | lower) is search('backend|connection.*error') %}
      → Traefik meldet Backend-Connection-Error
      → Gitea antwortet nicht auf Traefik's Verbindungsversuche
      {% endif %}
      {% if 'timeout' in gitea_errors or 'deadline exceeded' in gitea_errors %}
      → Gitea hat DB-Timeouts oder Context-Deadline-Exceeded
      → Postgres könnte blockieren oder zu langsam sein
      {% endif %}
      {% if 'too many connections' in postgres_errors %}
      → Postgres hat zu viele Verbindungen
      → Connection Pool könnte überlastet sein
      {% endif %}
      {% if high_cpu_rows | length > 0 and 'No high CPU usage found' not in high_cpu_rows %}
      → Gitea oder Postgres haben hohe CPU-Last
      → Performance-Problem, nicht Timeout-Konfiguration
      {% endif %}
      {% if 'DIRECT_TEST_FAILED' in direct_content or direct_content == '' %}
      → Direkter Test Traefik → Gitea schlägt fehl
      → Problem liegt bei Gitea selbst, nicht bei Traefik-Routing
      {% endif %}
      {% else %}
      ✅ REQUEST WAR ERFOLGREICH:
      → Problem tritt nur intermittierend auf
      → Prüfe Logs auf sporadische Fehler
      {% endif %}
      ================================================================================
      NÄCHSTE SCHRITTE:
      ================================================================================
      1. Prüfe ob hohe CPU/Memory bei Gitea oder Postgres
      2. Prüfe ob DB-Timeouts in Gitea-Logs
      3. Prüfe ob Postgres "too many connections" meldet
      4. Prüfe ob Traefik "backend connection error" oder "EOF" meldet
      5. Prüfe ob direkter Test Traefik → Gitea funktioniert
      ================================================================================
# Removes every capture file this play writes under /tmp. Each entry is the
# file-name prefix used by one of the tasks above; the single path
# /tmp/gitea_<timestamp>.log is never created by any task, so deleting only
# that path leaves all captures behind. Failures are ignored (best effort).
- name: Cleanup temporary files
  ansible.builtin.file:
    path: "/tmp/{{ item }}_{{ test_timestamp }}.log"
    state: absent
  loop:
    - gitea_stats
    - gitea_logs
    - postgres_logs
    - traefik_logs
    - gitea_response
    - gitea_curl
    - traefik_gitea_direct
  failed_when: false