---
# Diagnose Gitea timeout - deep analysis during a live request
# Runs every check while an actual request is in flight, incl. pg_stat_activity, Redis, and backpressure tests
- name: Diagnose Gitea Timeout Deep Analysis During Request
  hosts: production
  gather_facts: yes
  become: no
  vars:
    gitea_stack_path: "{{ stacks_base_path }}/gitea"
    traefik_stack_path: "{{ stacks_base_path }}/traefik"
    gitea_url: "https://{{ gitea_domain }}"
    test_duration_seconds: 60  # how long the test window runs
    test_timestamp: "{{ ansible_date_time.epoch }}"
    postgres_max_connections: 300

  tasks:
    - name: Display diagnostic plan
      ansible.builtin.debug:
        msg: |
          ================================================================================
          GITEA TIMEOUT DEEP DIAGNOSIS - LIVE DURING REQUEST
          ================================================================================
          This extended diagnosis runs every check while an actual request is in flight:
          1. Docker stats (CPU/RAM/IO) during the request
          2. pg_stat_activity: connection count vs. max_connections ({{ postgres_max_connections }})
          3. Redis ping check (session store blockage)
          4. Gitea localhost test (backpressure analysis)
          5. Gitea logs (DB timeouts, panics, "context deadline exceeded", SESSION: context canceled)
          6. Postgres logs (connection issues, authentication timeouts)
          7. Traefik logs ("backend connection error", "EOF")
          8. Runner status and git-upload-pack/git gc jobs
          Test duration: {{ test_duration_seconds }} seconds
          Timestamp: {{ test_timestamp }}
          ================================================================================

    - name: Get initial container stats (baseline)
      ansible.builtin.shell: |
        docker stats --no-stream --format "table {{ '{{' }}.Name{{ '}}' }}\t{{ '{{' }}.CPUPerc{{ '}}' }}\t{{ '{{' }}.MemUsage{{ '}}' }}\t{{ '{{' }}.NetIO{{ '}}' }}\t{{ '{{' }}.BlockIO{{ '}}' }}" gitea gitea-postgres gitea-redis traefik 2>/dev/null || echo "Stats collection failed"
      register: initial_stats
      changed_when: false

    - name: Get initial PostgreSQL connection count
      ansible.builtin.shell: |
        cd {{ gitea_stack_path }}
        docker compose exec -T postgres psql -U gitea -d gitea -c "SELECT count(*) as connection_count FROM pg_stat_activity;" 2>&1 | grep -E "^[[:space:]]*[0-9]+" | head -1 || echo "0"
      register: initial_pg_connections
      changed_when: false
      failed_when: false

    - name: Start collecting Docker stats in background
      ansible.builtin.shell: |
        timeout {{ test_duration_seconds }} docker stats --format "{{ '{{' }}.Name{{ '}}' }},{{ '{{' }}.CPUPerc{{ '}}' }},{{ '{{' }}.MemUsage{{ '}}' }},{{ '{{' }}.NetIO{{ '}}' }},{{ '{{' }}.BlockIO{{ '}}' }}" gitea gitea-postgres gitea-redis traefik 2>/dev/null | while read line; do
          echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] $line"
        done > /tmp/gitea_stats_{{ test_timestamp }}.log 2>&1 &
        STATS_PID=$!
        echo $STATS_PID
      register: stats_pid
      changed_when: false

    - name: Start collecting Gitea logs in background
      ansible.builtin.shell: |
        cd {{ gitea_stack_path }}
        timeout {{ test_duration_seconds }} docker compose logs -f gitea 2>&1 | while read line; do
          echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] $line"
        done > /tmp/gitea_logs_{{ test_timestamp }}.log 2>&1 &
        echo $!
      register: gitea_logs_pid
      changed_when: false

    - name: Start collecting Postgres logs in background
      ansible.builtin.shell: |
        cd {{ gitea_stack_path }}
        timeout {{ test_duration_seconds }} docker compose logs -f postgres 2>&1 | while read line; do
          echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] $line"
        done > /tmp/postgres_logs_{{ test_timestamp }}.log 2>&1 &
        echo $!
      register: postgres_logs_pid
      changed_when: false

    - name: Start collecting Traefik logs in background
      ansible.builtin.shell: |
        cd {{ traefik_stack_path }}
        timeout {{ test_duration_seconds }} docker compose logs -f traefik 2>&1 | while read line; do
          echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] $line"
        done > /tmp/traefik_logs_{{ test_timestamp }}.log 2>&1 &
        echo $!
      register: traefik_logs_pid
      changed_when: false

    - name: Start monitoring pg_stat_activity in background
      ansible.builtin.shell: |
        cd {{ gitea_stack_path }}
        for i in $(seq 1 {{ (test_duration_seconds / 5) | int }}); do
          echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] $(docker compose exec -T postgres psql -U gitea -d gitea -t -c 'SELECT count(*) FROM pg_stat_activity;' 2>&1 | tr -d ' ' || echo 'ERROR')"
          sleep 5
        done > /tmp/pg_stat_activity_{{ test_timestamp }}.log 2>&1 &
        echo $!
      register: pg_stat_pid
      changed_when: false

    - name: Wait a moment for log collection to start
      ansible.builtin.pause:
        seconds: 2

    - name: Trigger Gitea request via Traefik (with timeout)
      ansible.builtin.shell: |
        echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Starting request to {{ gitea_url }}/api/healthz"
        timeout 35 curl -k -v -s -o /tmp/gitea_response_{{ test_timestamp }}.log -w "\nHTTP_CODE:%{http_code}\nTIME_TOTAL:%{time_total}\nTIME_CONNECT:%{time_connect}\nTIME_STARTTRANSFER:%{time_starttransfer}\n" "{{ gitea_url }}/api/healthz" 2>&1 | tee /tmp/gitea_curl_{{ test_timestamp }}.log
        echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Request completed"
      register: gitea_request
      changed_when: false
      failed_when: false

    - name: Test Gitea localhost (backpressure test)
      ansible.builtin.shell: |
        echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Starting localhost test"
        cd {{ gitea_stack_path }}
        timeout 35 docker compose exec -T gitea curl -f -s -w "\nHTTP_CODE:%{http_code}\nTIME_TOTAL:%{time_total}\n" http://localhost:3000/api/healthz 2>&1 | tee /tmp/gitea_localhost_{{ test_timestamp }}.log || echo "LOCALHOST_TEST_FAILED" > /tmp/gitea_localhost_{{ test_timestamp }}.log
        echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Localhost test completed"
      register: gitea_localhost_test
      changed_when: false
      failed_when: false

    - name: Test direct connection Traefik → Gitea (parallel)
      ansible.builtin.shell: |
        echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Starting direct test Traefik → Gitea"
        cd {{ traefik_stack_path }}
        timeout 35 docker compose exec -T traefik wget -qO- --timeout=30 http://gitea:3000/api/healthz 2>&1 | tee /tmp/traefik_gitea_direct_{{ test_timestamp }}.log || echo "DIRECT_TEST_FAILED" > /tmp/traefik_gitea_direct_{{ test_timestamp }}.log
        echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Direct test completed"
      register: traefik_direct_test
      changed_when: false
      failed_when: false

    - name: Test Redis connection during request
      ansible.builtin.shell: |
        echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Testing Redis connection"
        cd {{ gitea_stack_path }}
        docker compose exec -T redis redis-cli ping 2>&1 | tee /tmp/redis_ping_{{ test_timestamp }}.log || echo "REDIS_PING_FAILED" > /tmp/redis_ping_{{ test_timestamp }}.log
        echo "[$(date '+%Y-%m-%d %H:%M:%S.%3N')] Redis ping completed"
      register: redis_ping_test
      changed_when: false
      failed_when: false

    - name: Check Gitea Runner status
      ansible.builtin.shell: |
        docker ps --format "{{ '{{' }}.Names{{ '}}' }}" | grep -q "gitea-runner" && echo "RUNNING" || echo "STOPPED"
      register: runner_status
      changed_when: false
      failed_when: false

    - name: Wait for log collection to complete
      ansible.builtin.pause:
        seconds: "{{ test_duration_seconds - 5 }}"

    - name: Stop background processes
      ansible.builtin.shell: |
        pkill -f "docker.*stats.*gitea" || true
        pkill -f "docker compose logs.*gitea" || true
        pkill -f "docker compose logs.*postgres" || true
        pkill -f "docker compose logs.*traefik" || true
        pkill -f "pg_stat_activity" || true
        sleep 2
      changed_when: false
      failed_when: false

    - name: Get final PostgreSQL connection count
      ansible.builtin.shell: |
        cd {{ gitea_stack_path }}
        docker compose exec -T postgres psql -U gitea -d gitea -c "SELECT count(*) as connection_count FROM pg_stat_activity;" 2>&1 | grep -E "^[[:space:]]*[0-9]+" | head -1 || echo "0"
      register: final_pg_connections
      changed_when: false
      failed_when: false

    - name: Collect stats results
      ansible.builtin.slurp:
        src: "/tmp/gitea_stats_{{ test_timestamp }}.log"
      register: stats_results
      changed_when: false
      failed_when: false

    - name: Collect pg_stat_activity results
      ansible.builtin.slurp:
        src: "/tmp/pg_stat_activity_{{ test_timestamp }}.log"
      register: pg_stat_results
      changed_when: false
      failed_when: false

    - name: Collect Gitea logs results
      ansible.builtin.slurp:
        src: "/tmp/gitea_logs_{{ test_timestamp }}.log"
      register: gitea_logs_results
      changed_when: false
      failed_when: false

    - name: Collect Postgres logs results
      ansible.builtin.slurp:
        src: "/tmp/postgres_logs_{{ test_timestamp }}.log"
      register: postgres_logs_results
      changed_when: false
      failed_when: false

    - name: Collect Traefik logs results
      ansible.builtin.slurp:
        src: "/tmp/traefik_logs_{{ test_timestamp }}.log"
      register: traefik_logs_results
      changed_when: false
      failed_when: false

    - name: Get request result
      ansible.builtin.slurp:
        src: "/tmp/gitea_curl_{{ test_timestamp }}.log"
      register: request_result
      changed_when: false
      failed_when: false

    - name: Get localhost test result
      ansible.builtin.slurp:
        src: "/tmp/gitea_localhost_{{ test_timestamp }}.log"
      register: localhost_result
      changed_when: false
      failed_when: false

    - name: Get direct test result
      ansible.builtin.slurp:
        src: "/tmp/traefik_gitea_direct_{{ test_timestamp }}.log"
      register: direct_test_result
      changed_when: false
      failed_when: false

    - name: Get Redis ping result
      ansible.builtin.slurp:
        src: "/tmp/redis_ping_{{ test_timestamp }}.log"
      register: redis_ping_result
      changed_when: false
      failed_when: false

    - name: Analyze pg_stat_activity for connection count
      ansible.builtin.shell: |
        if [ -f /tmp/pg_stat_activity_{{ test_timestamp }}.log ]; then
          echo "=== POSTGRES CONNECTION COUNT ANALYSIS ==="
          echo "Initial connections: {{ initial_pg_connections.stdout }}"
          echo "Final connections: {{ final_pg_connections.stdout }}"
          echo "Max connections: {{ postgres_max_connections }}"
          echo ""
          echo "=== CONNECTION COUNT TIMELINE ==="
          cat /tmp/pg_stat_activity_{{ test_timestamp }}.log | tail -20 || echo "No connection count data"
          echo ""
          echo "=== CONNECTION COUNT ANALYSIS ==="
          MAX_COUNT=$(cat /tmp/pg_stat_activity_{{ test_timestamp }}.log | grep -E "^\[.*\] [0-9]+" | awk -F'] ' '{print $2}' | sort -n | tail -1 || echo "0")
          if [ "$MAX_COUNT" != "0" ] && [ "$MAX_COUNT" != "" ]; then
            echo "Maximum connections during test: $MAX_COUNT"
            WARNING_THRESHOLD=$(({{ postgres_max_connections }} * 80 / 100))
            if [ "$MAX_COUNT" -gt "$WARNING_THRESHOLD" ]; then
              echo "⚠️ WARNING: Connection count ($MAX_COUNT) is above 80% of max_connections ({{ postgres_max_connections }})"
              echo " Consider reducing MAX_OPEN_CONNS or increasing max_connections"
            else
              echo "✅ Connection count is within safe limits"
            fi
          fi
        else
          echo "pg_stat_activity log file not found"
        fi
      register: pg_stat_analysis
      changed_when: false
      failed_when: false

    - name: Analyze stats for high CPU/Memory/IO
      ansible.builtin.shell: |
        if [ -f /tmp/gitea_stats_{{ test_timestamp }}.log ]; then
          echo "=== STATS SUMMARY ==="
          echo "Total samples: $(wc -l < /tmp/gitea_stats_{{ test_timestamp }}.log)"
          echo ""
          echo "=== HIGH CPU (>80%) ==="
          # "grep ." fails on empty input, so the fallback message actually prints when nothing matches
          grep -E "gitea|gitea-postgres" /tmp/gitea_stats_{{ test_timestamp }}.log | awk -F',' '{cpu=$2; gsub(/%/, "", cpu); if (cpu+0 > 80) print $0}' | head -10 | grep . || echo "No high CPU usage found"
          echo ""
          echo "=== MEMORY USAGE ==="
          grep -E "gitea" /tmp/gitea_stats_{{ test_timestamp }}.log | tail -5 | grep . || echo "No memory stats"
        else
          echo "Stats file not found"
        fi
      register: stats_analysis
      changed_when: false
      failed_when: false

    - name: Analyze Gitea logs for errors (including SESSION context canceled, panic, git-upload-pack)
      ansible.builtin.shell: |
        if [ -f /tmp/gitea_logs_{{ test_timestamp }}.log ]; then
          echo "=== DB-TIMEOUTS / CONNECTION ERRORS ==="
          grep -iE "timeout|deadline exceeded|connection.*failed|database.*error|postgres.*error|context.*deadline" /tmp/gitea_logs_{{ test_timestamp }}.log | tail -20 | grep . || echo "No DB-timeouts found"
          echo ""
          echo "=== SESSION: CONTEXT CANCELED ==="
          grep -iE "SESSION.*context canceled|session.*release.*context canceled" /tmp/gitea_logs_{{ test_timestamp }}.log | tail -10 | grep . || echo "No SESSION: context canceled found"
          echo ""
          echo "=== PANICS / FATAL ERRORS ==="
          grep -iE "panic|fatal|error.*fatal" /tmp/gitea_logs_{{ test_timestamp }}.log | tail -10 | grep . || echo "No panics found"
          echo ""
          echo "=== GIT-UPLOAD-PACK REQUESTS (can block) ==="
          grep -iE "git-upload-pack|ServiceUploadPack" /tmp/gitea_logs_{{ test_timestamp }}.log | tail -10 | grep . || echo "No git-upload-pack requests found"
          echo ""
          echo "=== GIT GC JOBS (can hold connections) ==="
          grep -iE "git.*gc|garbage.*collect" /tmp/gitea_logs_{{ test_timestamp }}.log | tail -10 | grep . || echo "No git gc jobs found"
          echo ""
          echo "=== SLOW QUERIES / PERFORMANCE ==="
          grep -iE "slow|performance|took.*ms|duration" /tmp/gitea_logs_{{ test_timestamp }}.log | tail -10 | grep . || echo "No slow queries found"
        else
          echo "Gitea logs file not found"
        fi
      register: gitea_logs_analysis
      changed_when: false
      failed_when: false

    - name: Analyze Postgres logs for errors
      ansible.builtin.shell: |
        if [ -f /tmp/postgres_logs_{{ test_timestamp }}.log ]; then
          echo "=== POSTGRES ERRORS ==="
          grep -iE "error|timeout|deadlock|connection.*refused|too many connections|authentication.*timeout" /tmp/postgres_logs_{{ test_timestamp }}.log | tail -20 | grep . || echo "No Postgres errors found"
          echo ""
          echo "=== SLOW QUERIES ==="
          grep -iE "slow|duration|statement.*took" /tmp/postgres_logs_{{ test_timestamp }}.log | tail -10 | grep . || echo "No slow queries found"
        else
          echo "Postgres logs file not found"
        fi
      register: postgres_logs_analysis
      changed_when: false
      failed_when: false

    - name: Analyze Traefik logs for backend errors
      ansible.builtin.shell: |
        if [ -f /tmp/traefik_logs_{{ test_timestamp }}.log ]; then
          echo "=== BACKEND CONNECTION ERRORS ==="
          grep -iE "backend.*error|connection.*error|EOF|gitea.*error|git\.michaelschiemer\.de.*error" /tmp/traefik_logs_{{ test_timestamp }}.log | tail -20 | grep . || echo "No backend errors found"
          echo ""
          echo "=== TIMEOUT ERRORS ==="
          grep -iE "timeout|504|gateway.*timeout" /tmp/traefik_logs_{{ test_timestamp }}.log | tail -10 | grep . || echo "No timeout errors found"
        else
          echo "Traefik logs file not found"
        fi
      register: traefik_logs_analysis
      changed_when: false
      failed_when: false

    - name: Display comprehensive diagnosis
      ansible.builtin.debug:
        msg: |
          ================================================================================
          GITEA TIMEOUT DEEP DIAGNOSIS - RESULTS
          ================================================================================

          BASELINE STATS (before request):
          {{ initial_stats.stdout }}

          POSTGRES CONNECTION COUNT:
          {{ pg_stat_analysis.stdout }}

          REQUEST RESULT (Traefik → Gitea):
          {% if request_result.content is defined and request_result.content != '' %}
          {{ request_result.content | b64decode }}
          {% else %}
          Request result not available
          {% endif %}

          BACKPRESSURE TEST - GITEA LOCALHOST:
          {% if localhost_result.content is defined and localhost_result.content != '' %}
          {{ localhost_result.content | b64decode }}
          {% else %}
          Localhost test result not available
          {% endif %}

          DIRECT TEST TRAEFIK → GITEA:
          {% if direct_test_result.content is defined and direct_test_result.content != '' %}
          {{ direct_test_result.content | b64decode }}
          {% else %}
          Direct test result not available
          {% endif %}

          REDIS PING TEST:
          {% if redis_ping_result.content is defined and redis_ping_result.content != '' %}
          {{ redis_ping_result.content | b64decode }}
          {% else %}
          Redis ping result not available
          {% endif %}

          RUNNER STATUS:
          - Status: {{ runner_status.stdout }}

          ================================================================================
          STATS ANALYSIS (during request):
          ================================================================================
          {{ stats_analysis.stdout }}

          ================================================================================
          GITEA LOG ANALYSIS:
          ================================================================================
          {{ gitea_logs_analysis.stdout }}

          ================================================================================
          POSTGRES LOG ANALYSIS:
          ================================================================================
          {{ postgres_logs_analysis.stdout }}

          ================================================================================
          TRAEFIK LOG ANALYSIS:
          ================================================================================
          {{ traefik_logs_analysis.stdout }}

          ================================================================================
          INTERPRETATION:
          ================================================================================
          {% set request_content = request_result.content | default('') | b64decode %}
          {% set localhost_content = localhost_result.content | default('') | b64decode %}
          {% set direct_content = direct_test_result.content | default('') | b64decode %}
          {% set redis_content = redis_ping_result.content | default('') | b64decode %}
          {% set traefik_errors = traefik_logs_analysis.stdout | default('') %}
          {% set gitea_errors = gitea_logs_analysis.stdout | default('') %}
          {% set postgres_errors = postgres_logs_analysis.stdout | default('') %}
          {% set stats_content = stats_analysis.stdout | default('') %}
          {% if 'timeout' in request_content or '504' in request_content or 'HTTP_CODE:504' in request_content %}
          ⚠️ REQUEST TIMED OUT / RETURNED 504:

          BACKPRESSURE ANALYSIS:
          {% if 'LOCALHOST_TEST_FAILED' in localhost_content or localhost_content == '' %}
          → Gitea localhost test fails or blocks
          → The problem is INSIDE Gitea/DB itself, not between Traefik and Gitea
          {% elif 'HTTP_CODE:200' in localhost_content or '200 OK' in localhost_content %}
          → Gitea localhost test responds quickly
          → The problem lies BETWEEN Traefik and Gitea (network, firewall, limits)
          {% endif %}

          {% if 'REDIS_PING_FAILED' in redis_content or redis_content == '' or 'PONG' not in redis_content %}
          → Redis is not reachable
          → Session store is blocked; Gitea runs into "context canceled"
          {% else %}
          → Redis is reachable
          {% endif %}

          {% if 'No SESSION: context canceled found' not in gitea_errors %}
          → Gitea reports SESSION: context canceled errors
          → The session store (Redis) may be blocking, or session locks may be stuck
          {% endif %}

          {% if 'No git-upload-pack requests found' not in gitea_errors %}
          → git-upload-pack requests found (can block)
          → Check whether the runner is active and performing many Git operations
          {% endif %}

          {% if 'No git gc jobs found' not in gitea_errors %}
          → git gc jobs found (can hold connections)
          → Check whether git gc jobs are hanging
          {% endif %}

          {% if 'No backend errors found' not in traefik_errors %}
          → Traefik reports backend connection errors (EOF)
          → Gitea does not respond to Traefik's connection attempts
          {% endif %}

          {% if 'No DB-timeouts found' not in gitea_errors %}
          → Gitea reports DB timeouts or context deadline exceeded
          → Postgres may be blocking or too slow
          {% endif %}

          {% if 'too many connections' in postgres_errors | lower %}
          → Postgres has too many connections
          → The connection pool may be exhausted
          {% endif %}

          {% if 'No high CPU usage found' not in stats_content %}
          → Gitea or Postgres is under high CPU load
          → Performance problem, not a timeout configuration issue
          {% endif %}
          {% else %}
          ✅ REQUEST SUCCEEDED:
          → The problem only occurs intermittently
          → Check the logs for sporadic errors
          {% endif %}

          ================================================================================
          NEXT STEPS:
          ================================================================================
          1. Check pg_stat_activity: is the connection count close to max_connections?
          2. Check whether Redis is reachable (session store blockage)
          3. Check backpressure: localhost fast but Traefik slow = network problem
          4. Check for SESSION: context canceled errors (session locks)
          5. Check git-upload-pack requests (runner overload)
          6. Check git gc jobs (hanging jobs hold connections)
          ================================================================================

    - name: Cleanup temporary files
      ansible.builtin.file:
        path: "/tmp/{{ item }}_{{ test_timestamp }}.log"
        state: absent
      loop:
        - gitea_stats
        - gitea_logs
        - postgres_logs
        - traefik_logs
        - pg_stat_activity
        - gitea_response
        - gitea_curl
        - gitea_localhost
        - traefik_gitea_direct
        - redis_ping
      failed_when: false
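
    # Optional follow-up sketch (not executed by the diagnosis above): if the reported
    # connection count approaches max_connections, a per-query view of pg_stat_activity
    # shows which statements are holding connections. This assumes the same compose
    # service/user/database names as the tasks above; uncomment and adjust before use.
    # - name: List longest-running non-idle PostgreSQL queries
    #   ansible.builtin.shell: |
    #     cd {{ gitea_stack_path }}
    #     docker compose exec -T postgres psql -U gitea -d gitea -c \
    #       "SELECT pid, state, now() - query_start AS runtime, left(query, 80) AS query
    #        FROM pg_stat_activity
    #        WHERE state <> 'idle'
    #        ORDER BY runtime DESC NULLS LAST
    #        LIMIT 20;"
    #   register: pg_long_running_queries
    #   changed_when: false
    #   failed_when: false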
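
    # Optional follow-up sketch for the session-store path: beyond the plain PING above,
    # Redis' client list and slowlog can reveal blocked clients or slow commands. Both
    # redis-cli subcommands are standard Redis commands; uncomment and adjust as needed.
    # - name: Inspect Redis clients and slowlog
    #   ansible.builtin.shell: |
    #     cd {{ gitea_stack_path }}
    #     docker compose exec -T redis redis-cli info clients
    #     docker compose exec -T redis redis-cli slowlog get 10
    #   register: redis_deep_dive
    #   changed_when: false
    #   failed_when: false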