diff --git a/.gitea/workflows/build-image.yml b/.gitea/workflows/build-image.yml index c8005a6b..66d4b1c6 100644 --- a/.gitea/workflows/build-image.yml +++ b/.gitea/workflows/build-image.yml @@ -44,6 +44,11 @@ on: type: boolean required: false default: false + deploy: + description: 'Deploy to staging/production after build (default: false)' + type: boolean + required: false + default: false env: REGISTRY: registry.michaelschiemer.de @@ -85,6 +90,13 @@ jobs: fi chmod +x /tmp/ci-tools/clone_repo.sh + - name: Upload CI helpers as artifact + uses: actions/upload-artifact@v4 + with: + name: ci-helpers + path: /tmp/ci-tools/clone_repo.sh + retention-days: 1 + - name: Analyse changed files id: filter shell: bash @@ -224,6 +236,13 @@ jobs: echo "changed_files=$PRETTY_CHANGES" >> "$GITHUB_OUTPUT" echo "needs_runtime_build=$RUNTIME_BUILD" >> "$GITHUB_OUTPUT" + - name: Upload repository as artifact + uses: actions/upload-artifact@v4 + with: + name: repository + path: /workspace/repo + retention-days: 1 + runtime-base: name: Build Runtime Base Image needs: changes @@ -244,8 +263,17 @@ jobs: echo "should_build=false" >> "$GITHUB_OUTPUT" fi - - name: Download CI helpers + - name: Download CI helpers from artifact if: ${{ steps.decision.outputs.should_build == 'true' }} + uses: actions/download-artifact@v4 + with: + name: ci-helpers + path: /tmp/ci-tools + continue-on-error: true + id: download_helpers # failure() stays false under continue-on-error, so the fallback checks this step's outcome + + - name: Download CI helpers (fallback if artifact missing) + if: ${{ steps.decision.outputs.should_build == 'true' && steps.download_helpers.outcome == 'failure' }} shell: bash env: CI_TOKEN: ${{ secrets.CI_TOKEN }} @@ -457,7 +485,16 @@ jobs: name: Run Tests & Quality Checks runs-on: php-ci steps: - - name: Download CI helpers + - name: Download CI helpers from artifact + uses: actions/download-artifact@v4 + with: + name: ci-helpers + path: /tmp/ci-tools + continue-on-error: true + id: download_helpers # failure() stays false under continue-on-error, so the fallback checks this step's outcome + + - name: Download CI helpers (fallback if artifact missing) + if: steps.download_helpers.outcome == 'failure' shell: bash env: CI_TOKEN: ${{ secrets.CI_TOKEN }} @@ -479,7 +516,16 @@ 
jobs: fi chmod +x /tmp/ci-tools/clone_repo.sh - - name: Checkout code + - name: Download repository artifact + uses: actions/download-artifact@v4 + with: + name: repository + path: /workspace/repo # the artifact root is the contents of /workspace/repo, so extract back into that directory + continue-on-error: true + id: download_repo + + - name: Checkout code (fallback if artifact missing) + if: steps.download_repo.outcome == 'failure' run: | REF_NAME="${{ github.ref_name }}" INPUT_BRANCH="${{ inputs.branch }}" @@ -545,8 +589,17 @@ jobs: bash --version git --version - - name: Download CI helpers + - name: Download CI helpers from artifact if: ${{ env.SHOULD_BUILD == 'true' }} + uses: actions/download-artifact@v4 + with: + name: ci-helpers + path: /tmp/ci-tools + continue-on-error: true + id: download_helpers # failure() stays false under continue-on-error, so the fallback checks this step's outcome + + - name: Download CI helpers (fallback if artifact missing) + if: ${{ env.SHOULD_BUILD == 'true' && steps.download_helpers.outcome == 'failure' }} shell: bash env: CI_TOKEN: ${{ secrets.CI_TOKEN }} @@ -568,8 +621,17 @@ jobs: fi chmod +x /tmp/ci-tools/clone_repo.sh - - name: Checkout code + - name: Download repository artifact if: ${{ env.SHOULD_BUILD == 'true' }} + uses: actions/download-artifact@v4 + with: + name: repository + path: /workspace/repo # the artifact root is the contents of /workspace/repo, so extract back into that directory + continue-on-error: true + id: download_repo + + - name: Checkout code (fallback if artifact missing) + if: ${{ env.SHOULD_BUILD == 'true' && steps.download_repo.outcome == 'failure' }} shell: bash run: | REF_NAME="${{ github.ref_name }}" @@ -910,12 +972,23 @@ jobs: echo " Run the 'Deploy to Production' or 'Deploy to Staging' workflow to deploy this image." 
fi - # Job 3: Auto-deploy to Staging (only for staging branch) + - name: Upload repository as artifact + if: ${{ env.SHOULD_BUILD == 'true' }} + uses: actions/upload-artifact@v4 + with: + name: repository + path: /workspace/repo + retention-days: 1 + + # Job 3: Auto-deploy to Staging (staging branch only: on push, or on manual dispatch with deploy=true) deploy-staging: name: Auto-deploy to Staging needs: [changes, build] - if: ${{ always() && (github.ref_name == 'staging' || github.head_ref == 'staging' || (github.ref_name == '' && contains(github.ref, 'staging'))) && needs.build.result != 'failure' && needs.build.result != 'cancelled' && needs.changes.result != 'failure' && needs.changes.result != 'cancelled' }} + if: ${{ always() && ((github.event_name == 'push' && (github.ref_name == 'staging' || github.head_ref == 'staging' || (github.ref_name == '' && contains(github.ref, 'staging')))) || (github.event_name == 'workflow_dispatch' && inputs.deploy == true && (inputs.branch || github.ref_name) == 'staging')) && needs.build.result != 'failure' && needs.build.result != 'cancelled' && needs.changes.result != 'failure' && needs.changes.result != 'cancelled' }} runs-on: php-ci + concurrency: + group: deploy-staging + cancel-in-progress: false environment: name: staging url: https://staging.michaelschiemer.de @@ -936,7 +1008,16 @@ jobs: echo "BRANCH=$REF_NAME" >> $GITHUB_OUTPUT echo "📋 Branch: $REF_NAME" - - name: Checkout deployment scripts + - name: Download repository artifact + uses: actions/download-artifact@v4 + with: + name: repository + path: /workspace/repo # the artifact root is the contents of /workspace/repo, so extract back into that directory + continue-on-error: true + id: download_repo + + - name: Checkout deployment scripts (fallback if artifact missing) + if: steps.download_repo.outcome == 'failure' run: | REF_NAME="${{ steps.branch.outputs.BRANCH }}" REPO="${{ github.repository }}" @@ -956,6 +1037,11 @@ jobs: cd /workspace/repo + - name: Set skip_git_update flag if repository artifact was used + if: steps.download_repo.outcome == 'success' + run: | + echo "SKIP_GIT_UPDATE=true" >> $GITHUB_ENV + - name: Setup SSH 
key run: | mkdir -p ~/.ssh @@ -975,41 +1061,19 @@ jobs: chmod 600 /tmp/vault_pass fi - - name: Deploy Application Code to Staging + - name: Deploy to Staging (Complete) run: | cd /workspace/repo/deployment/ansible ansible-playbook -i inventory/production.yml \ - playbooks/deploy-application-code.yml \ + playbooks/deploy-complete.yml \ -e "deployment_environment=staging" \ -e "deployment_hosts=production" \ -e "git_branch=staging" \ - -e "traefik_auto_restart=false" \ - -e "gitea_auto_restart=false" \ - --vault-password-file /tmp/vault_pass \ - --private-key ~/.ssh/production - - - name: Deploy Docker Image to Staging - run: | - cd /workspace/repo/deployment/ansible - ansible-playbook -i inventory/production.yml \ - playbooks/deploy-image.yml \ - -e "deployment_environment=staging" \ - -e "deployment_hosts=production" \ -e "image_tag=latest" \ -e "docker_registry=${{ env.REGISTRY }}" \ -e "docker_registry_username=${{ secrets.REGISTRY_USER }}" \ -e "docker_registry_password=${{ secrets.REGISTRY_PASSWORD }}" \ - -e "traefik_auto_restart=false" \ - -e "gitea_auto_restart=false" \ - --vault-password-file /tmp/vault_pass \ - --private-key ~/.ssh/production - - - name: Install Composer Dependencies - run: | - cd /workspace/repo/deployment/ansible - ansible-playbook -i inventory/production.yml \ - playbooks/install-composer-dependencies.yml \ - -e "deployment_environment=staging" \ + -e "application_skip_git_update=${{ env.SKIP_GIT_UPDATE || 'false' }}" \ -e "traefik_auto_restart=false" \ -e "gitea_auto_restart=false" \ --vault-password-file /tmp/vault_pass \ @@ -1021,22 +1085,30 @@ jobs: - name: Health check id: health run: | - echo "🔍 Performing health checks..." + echo "🔍 Performing health checks with exponential backoff..." 
- # Basic health check + # Basic health check with exponential backoff BASIC_HEALTH_OK=false - for i in {1..10}; do + DELAY=2 + MAX_DELAY=60 + MAX_ATTEMPTS=5 + + for i in $(seq 1 $MAX_ATTEMPTS); do if curl -f -k -s https://staging.michaelschiemer.de/health > /dev/null 2>&1; then - echo "✅ Basic health check passed" + echo "✅ Basic health check passed (attempt $i/$MAX_ATTEMPTS)" BASIC_HEALTH_OK=true break fi - echo "⏳ Waiting for staging service... (attempt $i/10)" - sleep 10 + if [ $i -lt $MAX_ATTEMPTS ]; then + echo "⏳ Waiting for staging service... (attempt $i/$MAX_ATTEMPTS, delay ${DELAY}s)" + sleep $DELAY + DELAY=$((DELAY * 2)) + [ $DELAY -gt $MAX_DELAY ] && DELAY=$MAX_DELAY + fi done if [ "$BASIC_HEALTH_OK" != "true" ]; then - echo "❌ Basic health check failed" + echo "❌ Basic health check failed after $MAX_ATTEMPTS attempts" exit 1 fi @@ -1065,12 +1137,15 @@ jobs: echo "URL: https://staging.michaelschiemer.de" echo "Image: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest" - # Job 4: Auto-deploy to Production (only for main branch) + # Job 4: Auto-deploy to Production (main branch only: on push, or on manual dispatch with deploy=true) deploy-production: name: Auto-deploy to Production needs: [changes, build] - if: always() && (github.ref_name == 'main' || github.head_ref == 'main' || (github.ref_name == '' && contains(github.ref, 'main'))) && needs.changes.outputs.needs_build == 'true' + if: always() && ((github.event_name == 'push' && (github.ref_name == 'main' || github.head_ref == 'main' || (github.ref_name == '' && contains(github.ref, 'main')))) || (github.event_name == 'workflow_dispatch' && inputs.deploy == true && (inputs.branch || github.ref_name) == 'main')) && needs.changes.outputs.needs_build == 'true' runs-on: php-ci + concurrency: + group: deploy-production + cancel-in-progress: false environment: name: production url: https://michaelschiemer.de @@ -1091,7 +1166,16 @@ jobs: echo "BRANCH=$REF_NAME" >> $GITHUB_OUTPUT echo "📋 Branch: $REF_NAME" - - name: Checkout deployment scripts + - name: Download repository 
artifact + uses: actions/download-artifact@v4 + with: + name: repository + path: /workspace/repo # the artifact root is the contents of /workspace/repo, so extract back into that directory + continue-on-error: true + id: download_repo + + - name: Checkout deployment scripts (fallback if artifact missing) + if: steps.download_repo.outcome == 'failure' run: | REF_NAME="${{ steps.branch.outputs.BRANCH }}" REPO="${{ github.repository }}" @@ -1111,6 +1195,11 @@ jobs: cd /workspace/repo + - name: Set skip_git_update flag if repository artifact was used + if: steps.download_repo.outcome == 'success' + run: | + echo "SKIP_GIT_UPDATE=true" >> $GITHUB_ENV + - name: Setup SSH key run: | mkdir -p ~/.ssh @@ -1153,41 +1242,19 @@ jobs: echo "IMAGE_TAG=${IMAGE_TAG}" >> $GITHUB_OUTPUT echo "📦 Image Tag: ${IMAGE_TAG}" - - name: Deploy Application Code to Production + - name: Deploy to Production (Complete) run: | cd /workspace/repo/deployment/ansible ansible-playbook -i inventory/production.yml \ - playbooks/deploy-application-code.yml \ + playbooks/deploy-complete.yml \ -e "deployment_environment=production" \ -e "deployment_hosts=production" \ -e "git_branch=main" \ - -e "traefik_auto_restart=false" \ - -e "gitea_auto_restart=false" \ - --vault-password-file /tmp/vault_pass \ - --private-key ~/.ssh/production - - - name: Deploy Docker Image to Production - run: | - cd /workspace/repo/deployment/ansible - ansible-playbook -i inventory/production.yml \ - playbooks/deploy-image.yml \ - -e "deployment_environment=production" \ - -e "deployment_hosts=production" \ -e "image_tag=${{ steps.image_tag.outputs.IMAGE_TAG }}" \ -e "docker_registry=${{ env.REGISTRY }}" \ -e "docker_registry_username=${{ secrets.REGISTRY_USER }}" \ -e "docker_registry_password=${{ secrets.REGISTRY_PASSWORD }}" \ - -e "traefik_auto_restart=false" \ - -e "gitea_auto_restart=false" \ - --vault-password-file /tmp/vault_pass \ - --private-key ~/.ssh/production - - - name: Install Composer Dependencies - run: | - cd /workspace/repo/deployment/ansible - ansible-playbook -i inventory/production.yml \ - 
playbooks/install-composer-dependencies.yml \ - -e "deployment_environment=production" \ + -e "application_skip_git_update=${{ env.SKIP_GIT_UPDATE || 'false' }}" \ -e "traefik_auto_restart=false" \ -e "gitea_auto_restart=false" \ --vault-password-file /tmp/vault_pass \ @@ -1199,22 +1266,30 @@ jobs: - name: Health check id: health run: | - echo "🔍 Performing health checks..." + echo "🔍 Performing health checks with exponential backoff..." - # Basic health check + # Basic health check with exponential backoff BASIC_HEALTH_OK=false - for i in {1..10}; do + DELAY=2 + MAX_DELAY=60 + MAX_ATTEMPTS=5 + + for i in $(seq 1 $MAX_ATTEMPTS); do if curl -f -k -s https://michaelschiemer.de/health > /dev/null 2>&1; then - echo "✅ Basic health check passed" + echo "✅ Basic health check passed (attempt $i/$MAX_ATTEMPTS)" BASIC_HEALTH_OK=true break fi - echo "⏳ Waiting for production service... (attempt $i/10)" - sleep 10 + if [ $i -lt $MAX_ATTEMPTS ]; then + echo "⏳ Waiting for production service... (attempt $i/$MAX_ATTEMPTS, delay ${DELAY}s)" + sleep $DELAY + DELAY=$((DELAY * 2)) + [ $DELAY -gt $MAX_DELAY ] && DELAY=$MAX_DELAY + fi done if [ "$BASIC_HEALTH_OK" != "true" ]; then - echo "❌ Basic health check failed" + echo "❌ Basic health check failed after $MAX_ATTEMPTS attempts" exit 1 fi diff --git a/.gitea/workflows/monitor-performance.yml b/.gitea/workflows/monitor-performance.yml new file mode 100644 index 00000000..da3fd095 --- /dev/null +++ b/.gitea/workflows/monitor-performance.yml @@ -0,0 +1,89 @@ +name: 📊 Monitor Workflow Performance + +on: + schedule: + # Run every 6 hours + - cron: '0 */6 * * *' + workflow_dispatch: + inputs: + lookback_hours: + description: 'Hours to look back for metrics' + required: false + default: '24' + type: string + +env: + DEPLOYMENT_HOST: 94.16.110.151 + +jobs: + monitor: + name: Monitor Workflow Performance + runs-on: php-ci + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Setup SSH key + run: | 
+ mkdir -p ~/.ssh + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/production + chmod 600 ~/.ssh/production + ssh-keyscan -H ${{ env.DEPLOYMENT_HOST }} >> ~/.ssh/known_hosts + + - name: Create Ansible Vault password file + run: | + if [ -n "${{ secrets.ANSIBLE_VAULT_PASSWORD }}" ]; then + echo "${{ secrets.ANSIBLE_VAULT_PASSWORD }}" > /tmp/vault_pass + chmod 600 /tmp/vault_pass + echo "✅ Vault password file created" + else + echo "⚠️ ANSIBLE_VAULT_PASSWORD secret not set, using empty password file" + touch /tmp/vault_pass + chmod 600 /tmp/vault_pass + fi + + - name: Run performance monitoring + run: | + cd "${GITHUB_WORKSPACE}/deployment/ansible" # actions/checkout@v4 puts the repository in $GITHUB_WORKSPACE + ansible-playbook -i inventory/production.yml \ + playbooks/monitor-workflow-performance.yml \ + -e "monitoring_lookback_hours=${{ github.event.inputs.lookback_hours || '24' }}" \ + --vault-password-file /tmp/vault_pass \ + --private-key ~/.ssh/production + + - name: Collect metrics files + run: | + ssh -i ~/.ssh/production deploy@${{ env.DEPLOYMENT_HOST }} \ + "find /home/deploy/monitoring/workflow-metrics -name 'workflow_metrics_*.json' -mtime -1 -exec cat {} \; | jq -s '.'" \ + > /tmp/combined_metrics.json || echo "[]" > /tmp/combined_metrics.json + + - name: Display metrics summary + run: | + if [ -f /tmp/combined_metrics.json ] && [ -s /tmp/combined_metrics.json ]; then + echo "📊 Performance Metrics Summary:" + echo "==================================" + cat /tmp/combined_metrics.json | jq -r ' + .[] | + "Timestamp: \(.timestamp)", + "System Load: \(.system_metrics.load_average)", + "CPU Usage: \(.system_metrics.cpu_usage_percent)%", + "Memory: \(.system_metrics.memory_usage)", + "Gitea Runner: \(.gitea_metrics.runner_status)", + "Gitea API Response: \(.gitea_metrics.api_response_time_ms)ms", + "Workflow Log Entries: \(.gitea_metrics.workflow_log_entries_last_24h)", + "---" + ' || echo "⚠️ Could not parse metrics" + else + echo "⚠️ No metrics collected" + fi + + - name: Upload metrics as artifact + uses: 
actions/upload-artifact@v4 + with: + name: workflow-metrics + path: /tmp/combined_metrics.json + retention-days: 30 + if: always() + diff --git a/deployment/ansible/playbooks/monitor-workflow-performance.yml b/deployment/ansible/playbooks/monitor-workflow-performance.yml new file mode 100644 index 00000000..1352b765 --- /dev/null +++ b/deployment/ansible/playbooks/monitor-workflow-performance.yml @@ -0,0 +1,192 @@ +--- +# Monitor Workflow Performance +# Collects comprehensive metrics about workflow execution, Gitea load, and system resources +- name: Monitor Workflow Performance + hosts: production + gather_facts: yes + become: no + vars: + monitoring_output_dir: "/home/deploy/monitoring/workflow-metrics" + monitoring_lookback_hours: 24 + gitea_stack_path: "{{ stacks_base_path }}/gitea" + traefik_stack_path: "{{ stacks_base_path }}/traefik" + + tasks: + - name: Create monitoring output directory + ansible.builtin.file: + path: "{{ monitoring_output_dir }}" + state: directory + mode: '0755' + + - name: Get system load average + ansible.builtin.shell: | + uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ' ' + register: system_load + changed_when: false + + - name: Get Docker container count + ansible.builtin.shell: | + docker ps --format '{{ '{{' }}.Names{{ '}}' }}' | wc -l + register: docker_container_count + changed_when: false + + - name: Get Gitea Runner status + ansible.builtin.shell: | + if docker ps --format '{{ '{{' }}.Names{{ '}}' }}' | grep -q "gitea-runner"; then + echo "running" + else + echo "stopped" + fi + register: gitea_runner_status + changed_when: false + + - name: Get Gitea container resource usage + ansible.builtin.shell: | + docker stats gitea --no-stream --format "{{ '{{' }}.CPUPerc{{ '}}' }},{{ '{{' }}.MemUsage{{ '}}' }},{{ '{{' }}.MemPerc{{ '}}' }}" 2>/dev/null || echo "N/A,N/A,N/A" + register: gitea_stats + changed_when: false + failed_when: false + + - name: Get Traefik container resource usage + ansible.builtin.shell: 
| + docker stats traefik --no-stream --format "{{ '{{' }}.CPUPerc{{ '}}' }},{{ '{{' }}.MemUsage{{ '}}' }},{{ '{{' }}.MemPerc{{ '}}' }}" 2>/dev/null || echo "N/A,N/A,N/A" + register: traefik_stats + changed_when: false + failed_when: false + + - name: Check Gitea API response time + ansible.builtin.uri: + url: "https://{{ gitea_domain }}/api/healthz" + method: GET + status_code: [200] + validate_certs: false + timeout: 10 + register: gitea_api_test + changed_when: false + failed_when: false + + - name: Get Gitea logs for workflow activity (last {{ monitoring_lookback_hours }} hours) + ansible.builtin.shell: | + cd {{ gitea_stack_path }} + docker compose logs gitea --since "{{ monitoring_lookback_hours }}h" 2>&1 | \ + grep -iE "workflow|action|runner" | \ + tail -50 || echo "No workflow activity found" + register: gitea_workflow_logs + changed_when: false + failed_when: false + + - name: Count workflow-related log entries + ansible.builtin.shell: | + cd {{ gitea_stack_path }} + docker compose logs gitea --since "{{ monitoring_lookback_hours }}h" 2>&1 | \ + grep -iE "workflow|action|runner" | \ + wc -l + register: workflow_log_count + changed_when: false + failed_when: false + + - name: Get disk usage for Gitea data + ansible.builtin.shell: | + du -sh {{ gitea_stack_path }}/data 2>/dev/null | awk '{print $1}' || echo "N/A" + register: gitea_data_size + changed_when: false + failed_when: false + + - name: Get Docker system disk usage + ansible.builtin.shell: | + docker system df --format "{{ '{{' }}.Size{{ '}}' }}" 2>/dev/null | head -1 || echo "N/A" + register: docker_disk_usage + changed_when: false + failed_when: false + + - name: Get memory usage + ansible.builtin.shell: | + free -h | grep Mem | awk '{print $3 "/" $2}' + register: memory_usage + changed_when: false + + - name: Get CPU usage (1 minute average) + ansible.builtin.shell: | + top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}' + register: cpu_usage + changed_when: 
false + failed_when: false + + - name: Generate metrics JSON # api_response_time_ms falls back to 'N/A' when the health probe reported no elapsed time + ansible.builtin.copy: + dest: "{{ monitoring_output_dir }}/workflow_metrics_{{ ansible_date_time.epoch }}.json" + content: | + { + "timestamp": "{{ ansible_date_time.iso8601 }}", + "system_metrics": { + "load_average": "{{ system_load.stdout }}", + "cpu_usage_percent": "{{ cpu_usage.stdout | default('N/A') }}", + "memory_usage": "{{ memory_usage.stdout }}", + "docker_containers": "{{ docker_container_count.stdout }}", + "docker_disk_usage": "{{ docker_disk_usage.stdout }}", + "gitea_data_size": "{{ gitea_data_size.stdout }}" + }, + "gitea_metrics": { + "runner_status": "{{ gitea_runner_status.stdout }}", + "api_response_time_ms": "{{ ((gitea_api_test.elapsed * 1000) | int) if gitea_api_test.elapsed is defined else 'N/A' }}", + "workflow_log_entries_last_{{ monitoring_lookback_hours }}h": {{ workflow_log_count.stdout | int }}, + "container_stats": { + "cpu_percent": "{{ gitea_stats.stdout.split(',')[0] if gitea_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }}", + "memory_usage": "{{ gitea_stats.stdout.split(',')[1] if gitea_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }}", + "memory_percent": "{{ gitea_stats.stdout.split(',')[2] if gitea_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }}" + } + }, + "traefik_metrics": { + "container_stats": { + "cpu_percent": "{{ traefik_stats.stdout.split(',')[0] if traefik_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }}", + "memory_usage": "{{ traefik_stats.stdout.split(',')[1] if traefik_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }}", + "memory_percent": "{{ traefik_stats.stdout.split(',')[2] if traefik_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }}" + } + }, + "optimizations": { + "repository_artifact_enabled": true, + "helper_script_caching_enabled": true, + "combined_deployment_playbook": true, + "exponential_backoff_health_checks": true, + "concurrency_groups": true + } + } + mode: '0644' + + - name: Display monitoring summary + ansible.builtin.debug: + msg: | + 
================================================================================ + WORKFLOW PERFORMANCE MONITORING - SUMMARY + ================================================================================ + + System Metrics: + - Load Average: {{ system_load.stdout }} + - CPU Usage: {{ cpu_usage.stdout | default('N/A') }}% + - Memory Usage: {{ memory_usage.stdout }} + - Docker Containers: {{ docker_container_count.stdout }} + - Docker Disk Usage: {{ docker_disk_usage.stdout }} + - Gitea Data Size: {{ gitea_data_size.stdout }} + + Gitea Metrics: + - Runner Status: {{ gitea_runner_status.stdout }} + - API Response Time: {{ ((gitea_api_test.elapsed * 1000) | int) if gitea_api_test.elapsed is defined else 'N/A' }}ms + - Workflow Log Entries (last {{ monitoring_lookback_hours }}h): {{ workflow_log_count.stdout }} + - Container CPU: {{ gitea_stats.stdout.split(',')[0] if gitea_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }} + - Container Memory: {{ gitea_stats.stdout.split(',')[1] if gitea_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }} + + Traefik Metrics: + - Container CPU: {{ traefik_stats.stdout.split(',')[0] if traefik_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }} + - Container Memory: {{ traefik_stats.stdout.split(',')[1] if traefik_stats.stdout != 'N/A,N/A,N/A' else 'N/A' }} + + Optimizations Enabled: + ✅ Repository Artifact Caching + ✅ Helper Script Caching + ✅ Combined Deployment Playbook + ✅ Exponential Backoff Health Checks + ✅ Concurrency Groups + + Metrics saved to: {{ monitoring_output_dir }}/workflow_metrics_{{ ansible_date_time.epoch }}.json + + ================================================================================ + diff --git a/monitoring/README.md b/monitoring/README.md new file mode 100644 index 00000000..193b15b1 --- /dev/null +++ b/monitoring/README.md @@ -0,0 +1,173 @@ +# Workflow Performance Monitoring + +Dieses Verzeichnis enthält Tools und Metriken zur Überwachung der Workflow-Performance und Systemressourcen. 
+ +## Übersicht + +Das Monitoring-System sammelt Metriken über: +- Workflow-Ausführungszeiten +- Gitea-Last und API-Antwortzeiten +- Systemressourcen (CPU, Memory, Load) +- Docker-Container-Status +- Workflow-Optimierungen + +## Komponenten + +### 1. Monitoring-Script (`scripts/ci/monitor-workflow-performance.sh`) + +Lokales Script zur Sammlung von Workflow-Metriken über die Gitea API. + +**Verwendung:** +```bash +export GITEA_TOKEN="your-token" +export GITEA_URL="https://git.michaelschiemer.de" +export GITHUB_REPOSITORY="michael/michaelschiemer" + +./scripts/ci/monitor-workflow-performance.sh +``` + +**Ausgabe:** +- JSON-Datei mit Metriken in `monitoring/workflow-metrics/` +- Konsolen-Zusammenfassung + +### 2. Ansible Playbook (`deployment/ansible/playbooks/monitor-workflow-performance.yml`) + +Server-seitiges Monitoring. + +**Verwendung:** +```bash +cd deployment/ansible +ansible-playbook -i inventory/production.yml \ + playbooks/monitor-workflow-performance.yml \ + -e "monitoring_lookback_hours=24" +``` + +**Gesammelte Metriken:** +- System Load Average +- CPU- und Memory-Nutzung +- Docker-Container-Status +- Gitea Runner-Status +- Gitea API-Antwortzeiten +- Workflow-Log-Einträge +- Container-Ressourcennutzung (Gitea, Traefik) + +**Ausgabe:** +- JSON-Datei auf dem Server: `/home/deploy/monitoring/workflow-metrics/workflow_metrics_<epoch>.json` +- Konsolen-Zusammenfassung + +### 3. Gitea Workflow (`.gitea/workflows/monitor-performance.yml`) + +Automatischer Monitoring-Workflow, der alle 6 Stunden läuft. 
+ +**Manuelle Ausführung:** +- Über Gitea UI: Actions → Monitor Workflow Performance → Run workflow +- Optional: `lookback_hours` Parameter anpassen + +**Ausgabe:** +- Artifact mit kombinierten Metriken (30 Tage Retention) +- Workflow-Logs mit Zusammenfassung + +## Metriken-Format + +### System-Metriken +```json +{ + "system_metrics": { + "load_average": "0.5", + "cpu_usage_percent": "15.2", + "memory_usage": "2.1G/8.0G", + "docker_containers": "12", + "docker_disk_usage": "5.2GB", + "gitea_data_size": "1.2G" + } +} +``` + +### Gitea-Metriken +```json +{ + "gitea_metrics": { + "runner_status": "running", + "api_response_time_ms": 45, + "workflow_log_entries_last_24h": 150, + "container_stats": { + "cpu_percent": "2.5%", + "memory_usage": "512MiB / 2GiB", + "memory_percent": "25.0%" + } + } +} +``` + +### Workflow-Metriken +```json +{ + "workflow_metrics": { + "build_image": { + "average_duration_seconds": 420, + "recent_runs": 20 + }, + "manual_deploy": { + "average_duration_seconds": 180, + "recent_runs": 10 + } + } +} +``` + +## Optimierungen + +Das Monitoring-System trackt folgende Optimierungen: + +- ✅ **Repository Artifact Caching**: Repository wird als Artifact zwischen Jobs geteilt +- ✅ **Helper Script Caching**: CI-Helper-Scripts werden als Artifact gecacht +- ✅ **Combined Deployment Playbook**: Einzelnes Playbook für alle Deployment-Schritte +- ✅ **Exponential Backoff Health Checks**: Intelligente Retry-Strategie +- ✅ **Concurrency Groups**: Verhindert parallele Deployments + +## Interpretation der Metriken + +### Gute Werte +- **Load Average**: < 1.0 (für Single-Core), < Anzahl Cores (für Multi-Core) +- **Gitea API Response**: < 100ms +- **Workflow Duration**: < 10 Minuten (Build), < 5 Minuten (Deploy) +- **Memory Usage**: < 80% des verfügbaren Speichers + +### Warnzeichen +- **Load Average**: > 2.0 (kann auf Überlastung hinweisen) +- **Gitea API Response**: > 500ms (kann auf Gitea-Überlastung hinweisen) +- **Workflow Duration**: > 20 Minuten (kann auf 
Ineffizienzen hinweisen) +- **Workflow Log Entries**: > 1000 pro Stunde (kann auf zu viele Workflows hinweisen) + +## Troubleshooting + +### Keine Metriken gesammelt +1. Prüfe Gitea API-Zugriff (Token, URL) +2. Prüfe SSH-Zugriff auf Server (für Ansible Playbook) +3. Prüfe ob Monitoring-Verzeichnis existiert + +### Hohe System-Last +1. Prüfe laufende Workflows +2. Prüfe Gitea Runner-Status +3. Prüfe Docker-Container-Ressourcennutzung +4. Prüfe ob zu viele parallele Deployments laufen + +### Langsame Workflows +1. Prüfe ob Repository-Artifacts verwendet werden +2. Prüfe ob Helper-Scripts gecacht werden +3. Prüfe Docker Build Cache +4. Prüfe Netzwerk-Latenz zu Registry + +## Nächste Schritte + +1. **Baseline etablieren**: Sammle Metriken über 1-2 Wochen +2. **Trends analysieren**: Identifiziere langfristige Trends +3. **Alerts einrichten**: Warnungen bei kritischen Werten +4. **Weitere Optimierungen**: Basierend auf Metriken + +## Weitere Ressourcen + +- [Gitea Actions Documentation](https://docs.gitea.com/usage/actions) +- [Ansible Best Practices](https://docs.ansible.com/ansible/latest/user_guide/playbooks_best_practices.html) +- [Docker Monitoring](https://docs.docker.com/config/containers/logging/) + diff --git a/scripts/ci/monitor-workflow-performance.sh b/scripts/ci/monitor-workflow-performance.sh new file mode 100755 index 00000000..b9730aaf --- /dev/null +++ b/scripts/ci/monitor-workflow-performance.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# Monitor Workflow Performance +# Collects metrics about workflow execution times, Gitea load, and resource usage + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" +OUTPUT_DIR="${REPO_ROOT}/monitoring/workflow-metrics" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +METRICS_FILE="${OUTPUT_DIR}/workflow_metrics_${TIMESTAMP}.json" + +# Create output directory +mkdir -p "$OUTPUT_DIR" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${BLUE}📊 Workflow Performance Monitor${NC}" +echo "==================================" +echo "" + +# Check if Gitea API credentials are available +GITEA_URL="${GITEA_URL:-https://git.michaelschiemer.de}" +GITEA_TOKEN="${GITEA_TOKEN:-${CI_TOKEN:-}}" +REPO="${GITHUB_REPOSITORY:-michael/michaelschiemer}" + +if [ -z "$GITEA_TOKEN" ]; then + echo -e "${YELLOW}⚠️ GITEA_TOKEN not set, some metrics will be unavailable${NC}" +fi + +# Function to get workflow runs from Gitea API +get_workflow_runs() { + local workflow_name="$1" + local limit="${2:-10}" + + if [ -z "$GITEA_TOKEN" ]; then + echo "[]" + return + fi + + local api_url="${GITEA_URL}/api/v1/repos/${REPO}/actions/runs" + if [ -n "$workflow_name" ]; then + api_url="${api_url}?workflow=${workflow_name}&limit=${limit}" + else + api_url="${api_url}?limit=${limit}" + fi + + curl -sfL \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Accept: application/json" \ + "$api_url" 2>/dev/null || echo "[]" +} + +# Function to calculate average duration +calculate_average_duration() { + local runs_json="$1" + local total=0 + local count=0 + + if [ "$runs_json" = "[]" ] || [ -z "$runs_json" ]; then + echo "0" + return + fi + + # Extract durations (in seconds) from workflow runs (simplified parser - in production, use jq) + # The loop reads from a process substitution so total/count are updated in this shell, not in a pipeline subshell + while read -r duration; do + if [ -n "$duration" ] && [ "$duration" -gt 0 ]; then + total=$((total + duration)) + count=$((count + 1)) + fi + done < <(echo "$runs_json" | grep -o '"duration":[0-9]*' | grep -o '[0-9]*') + + if [ "$count" -eq 0 ]; then + echo "0" + else + echo "$((total / count))" + fi +} + +# Collect metrics +echo -e "${BLUE}📥 
Collecting workflow metrics...${NC}" + +# Get recent workflow runs +BUILD_WORKFLOW_RUNS=$(get_workflow_runs "build-image.yml" 20) +DEPLOY_WORKFLOW_RUNS=$(get_workflow_runs "manual-deploy.yml" 10) + +# Calculate metrics +BUILD_AVG_DURATION=$(calculate_average_duration "$BUILD_WORKFLOW_RUNS") +DEPLOY_AVG_DURATION=$(calculate_average_duration "$DEPLOY_WORKFLOW_RUNS") + +# Get system metrics (if running on server) +SYSTEM_LOAD="unknown" +DOCKER_CONTAINERS="unknown" +GITEA_RUNNER_STATUS="unknown" + +if command -v uptime >/dev/null 2>&1; then + SYSTEM_LOAD=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ' ') +fi + +if command -v docker >/dev/null 2>&1; then + DOCKER_CONTAINERS=$(docker ps --format '{{.Names}}' | wc -l) + + if docker ps --format '{{.Names}}' | grep -q "gitea-runner"; then + GITEA_RUNNER_STATUS="running" + else + GITEA_RUNNER_STATUS="stopped" + fi +fi + +# Create metrics JSON +cat > "$METRICS_FILE" <