feat: CI/CD pipeline setup complete - Ansible playbooks updated, secrets configured, workflow ready
This commit is contained in:
40
deployment/stacks/application/.env.example
Normal file
40
deployment/stacks/application/.env.example
Normal file
@@ -0,0 +1,40 @@
|
||||
# Application Stack Configuration
|
||||
# Copy this file to .env and adjust values
|
||||
|
||||
# Timezone
|
||||
TZ=Europe/Berlin
|
||||
|
||||
# Application Domain
|
||||
APP_DOMAIN=michaelschiemer.de
|
||||
|
||||
# Application Settings
|
||||
APP_ENV=production
|
||||
APP_DEBUG=false
|
||||
APP_URL=https://michaelschiemer.de
|
||||
|
||||
# Database Configuration
|
||||
# Note: The database runs in Stack 5 (PostgreSQL), Stack 2 (Gitea MySQL), or on an external server. The defaults below target MySQL.
|
||||
DB_HOST=mysql
|
||||
DB_PORT=3306
|
||||
DB_NAME=michaelschiemer
|
||||
DB_USER=appuser
|
||||
DB_PASS=<generate-with-openssl-rand-base64-32>
|
||||
|
||||
# Redis Configuration
|
||||
# Generate password with: openssl rand -base64 32
|
||||
REDIS_PASSWORD=<generate-with-openssl-rand-base64-32>
|
||||
|
||||
# Cache Configuration
|
||||
CACHE_DRIVER=redis
|
||||
CACHE_PREFIX=app
|
||||
|
||||
# Session Configuration
|
||||
SESSION_DRIVER=redis
|
||||
SESSION_LIFETIME=120
|
||||
|
||||
# Queue Worker Configuration
|
||||
QUEUE_DRIVER=redis
|
||||
QUEUE_CONNECTION=default
|
||||
QUEUE_WORKER_SLEEP=3
|
||||
QUEUE_WORKER_TRIES=3
|
||||
QUEUE_WORKER_TIMEOUT=60
|
||||
918
deployment/stacks/application/README.md
Normal file
918
deployment/stacks/application/README.md
Normal file
@@ -0,0 +1,918 @@
|
||||
# Application Stack - PHP Application with Nginx, Redis, Queue & Scheduler
|
||||
|
||||
## Overview
|
||||
|
||||
Production-ready PHP application stack with multi-service architecture for high-performance web applications.
|
||||
|
||||
**Features**:
|
||||
- PHP-FPM 8.3+ application runtime
|
||||
- Nginx web server with optimized configuration
|
||||
- Redis for caching, sessions, and queue backend
|
||||
- Dedicated queue worker for background job processing
|
||||
- Scheduler for cron job execution
|
||||
- SSL via Traefik with automatic Let's Encrypt certificates
|
||||
- Private Docker Registry integration
|
||||
- Health checks and automatic restart policies
|
||||
|
||||
## Services
|
||||
|
||||
- **app** (PHP-FPM) - Application runtime handling PHP code execution
|
||||
- **nginx** (Nginx 1.25) - Web server proxying requests to PHP-FPM
|
||||
- **redis** (Redis 7) - Cache, session, and queue backend
|
||||
- **queue-worker** - Background job processor
|
||||
- **scheduler** - Cron job executor
|
||||
|
||||
## Prerequisites
|
||||
|
||||
### 1. Traefik Stack Running
|
||||
```bash
|
||||
cd ../traefik
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### 2. DNS Configuration
|
||||
Point `michaelschiemer.de` to your server IP (94.16.110.151)
|
||||
|
||||
### 3. Docker Registry Access
|
||||
```bash
|
||||
# Login to private registry
|
||||
docker login registry.michaelschiemer.de
|
||||
|
||||
# Verify access
|
||||
docker pull registry.michaelschiemer.de/michaelschiemer-app:latest
|
||||
```
|
||||
|
||||
### 4. Application Image Built
|
||||
```bash
|
||||
# Build and push application image
|
||||
docker build -t registry.michaelschiemer.de/michaelschiemer-app:latest .
|
||||
docker push registry.michaelschiemer.de/michaelschiemer-app:latest
|
||||
```
|
||||
|
||||
### 5. Database Available
|
||||
Stack 5 (PostgreSQL/MySQL) must be running or external database configured.
|
||||
|
||||
## Configuration
|
||||
|
||||
### 1. Create Environment File
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
### 2. Generate Passwords
|
||||
|
||||
```bash
|
||||
# Database password
|
||||
openssl rand -base64 32
|
||||
|
||||
# Redis password
|
||||
openssl rand -base64 32
|
||||
```
|
||||
|
||||
Update `.env`:
|
||||
```env
|
||||
DB_PASS=<generated-database-password>
|
||||
REDIS_PASSWORD=<generated-redis-password>
|
||||
```
|
||||
|
||||
### 3. Configure Database Connection
|
||||
|
||||
**If using Stack 5 (PostgreSQL)**:
|
||||
```env
|
||||
DB_HOST=postgres
|
||||
DB_PORT=5432
|
||||
DB_NAME=michaelschiemer
|
||||
DB_USER=appuser
|
||||
```
|
||||
|
||||
**If using Stack 2 Gitea MySQL**:
|
||||
```env
|
||||
DB_HOST=mysql
|
||||
DB_PORT=3306
|
||||
DB_NAME=michaelschiemer
|
||||
DB_USER=appuser
|
||||
```
|
||||
|
||||
**If using external database**:
|
||||
```env
|
||||
DB_HOST=<external-host>
|
||||
DB_PORT=<port>
|
||||
DB_NAME=<database>
|
||||
DB_USER=<username>
|
||||
```
|
||||
|
||||
### 4. Adjust Queue Worker Settings (Optional)
|
||||
|
||||
```env
|
||||
# Queue worker configuration
|
||||
QUEUE_WORKER_SLEEP=3 # Sleep between job checks (seconds)
|
||||
QUEUE_WORKER_TRIES=3 # Max attempts per job
|
||||
QUEUE_WORKER_TIMEOUT=60 # Max execution time per job (seconds)
|
||||
```
|
||||
|
||||
### 5. Configure Application Settings
|
||||
|
||||
```env
|
||||
APP_ENV=production # production, staging, or development
|
||||
APP_DEBUG=false # Enable debug mode (false for production)
|
||||
APP_URL=https://michaelschiemer.de
|
||||
```
|
||||
|
||||
## Deployment
|
||||
|
||||
### Initial Setup
|
||||
|
||||
```bash
|
||||
# Ensure Traefik is running
|
||||
docker network inspect traefik-public
|
||||
|
||||
# Create .env file
|
||||
cp .env.example .env
|
||||
# Edit .env with generated passwords
|
||||
|
||||
# Start application stack
|
||||
docker compose up -d
|
||||
|
||||
# Check logs
|
||||
docker compose logs -f
|
||||
|
||||
# Verify health
|
||||
docker compose ps
|
||||
```
|
||||
|
||||
### Verify Deployment
|
||||
|
||||
```bash
|
||||
# Test application endpoint
|
||||
curl https://michaelschiemer.de/health
|
||||
# Expected: HTTP 200 "healthy"
|
||||
|
||||
# Check service status
|
||||
docker compose ps
|
||||
# All services should show "healthy" status
|
||||
|
||||
# View logs
|
||||
docker compose logs app # Application logs
|
||||
docker compose logs nginx # Web server logs
|
||||
docker compose logs redis # Redis logs
|
||||
docker compose logs queue-worker # Queue worker logs
|
||||
docker compose logs scheduler # Scheduler logs
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Accessing the Application
|
||||
|
||||
**Main Application**:
|
||||
- URL: https://michaelschiemer.de
|
||||
- SSL: Automatic via Traefik + Let's Encrypt
|
||||
- Auth: Configured in application
|
||||
|
||||
### Managing Services
|
||||
|
||||
```bash
|
||||
# Start stack
|
||||
docker compose up -d
|
||||
|
||||
# Stop stack
|
||||
docker compose down
|
||||
|
||||
# Restart specific service
|
||||
docker compose restart app
|
||||
docker compose restart nginx
|
||||
|
||||
# View logs (follow mode)
|
||||
docker compose logs -f app
|
||||
docker compose logs -f queue-worker
|
||||
|
||||
# Execute commands in app container
|
||||
docker compose exec app php console.php db:migrate
|
||||
docker compose exec app php console.php cache:clear
|
||||
|
||||
# Access Redis CLI
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD>
|
||||
```
|
||||
|
||||
### Queue Management
|
||||
|
||||
```bash
|
||||
# Monitor queue worker
|
||||
docker compose logs -f queue-worker
|
||||
|
||||
# Restart queue worker (e.g., after code changes)
|
||||
docker compose restart queue-worker
|
||||
|
||||
# Check queue status
|
||||
docker compose exec app php console.php queue:status
|
||||
|
||||
# Process specific queue
|
||||
docker compose exec app php console.php queue:work --queue=emails
|
||||
|
||||
# Retry failed jobs
|
||||
docker compose exec app php console.php queue:retry-failed
|
||||
```
|
||||
|
||||
### Scheduler Management
|
||||
|
||||
```bash
|
||||
# View scheduler logs
|
||||
docker compose logs -f scheduler
|
||||
|
||||
# List scheduled tasks
|
||||
docker compose exec app php console.php scheduler:list
|
||||
|
||||
# Run scheduler manually (for testing)
|
||||
docker compose exec app php console.php scheduler:run
|
||||
|
||||
# Restart scheduler
|
||||
docker compose restart scheduler
|
||||
```
|
||||
|
||||
### Cache Management
|
||||
|
||||
```bash
|
||||
# Clear application cache
|
||||
docker compose exec app php console.php cache:clear
|
||||
|
||||
# Clear specific cache tags
|
||||
docker compose exec app php console.php cache:forget user:123
|
||||
|
||||
# View cache statistics
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> INFO stats
|
||||
|
||||
# Monitor cache in real-time
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> MONITOR
|
||||
```
|
||||
|
||||
## Integration with Other Stacks
|
||||
|
||||
### Stack 1: Traefik (Reverse Proxy)
|
||||
|
||||
**Automatic SSL & Routing**:
|
||||
- Traefik labels in docker-compose.yml configure routing
|
||||
- SSL certificates automatically obtained via Let's Encrypt
|
||||
- Middleware chain applies security headers, rate limiting, etc.
|
||||
|
||||
**Verify Integration**:
|
||||
```bash
|
||||
# Check Traefik's own health (requires the ping endpoint to be enabled; this does not validate individual routers)
|
||||
docker exec traefik traefik healthcheck
|
||||
|
||||
# View Traefik logs
|
||||
docker logs traefik | grep michaelschiemer.de
|
||||
```
|
||||
|
||||
### Stack 3: Docker Registry
|
||||
|
||||
**Image Pulling**:
|
||||
- Stack pulls application image from private registry
|
||||
- Credentials configured via docker login
|
||||
|
||||
**Update Application**:
|
||||
```bash
|
||||
# Pull latest image
|
||||
docker compose pull app queue-worker scheduler
|
||||
|
||||
# Recreate containers with new image
|
||||
docker compose up -d --force-recreate app queue-worker scheduler
|
||||
```
|
||||
|
||||
### Stack 5: Database (PostgreSQL or MySQL)
|
||||
|
||||
**Connection**:
|
||||
- Database service must be on same Docker network or externally accessible
|
||||
- Connection configured via DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASS
|
||||
|
||||
**Run Migrations**:
|
||||
```bash
|
||||
# Apply database migrations
|
||||
docker compose exec app php console.php db:migrate
|
||||
|
||||
# Check migration status
|
||||
docker compose exec app php console.php db:status
|
||||
|
||||
# Rollback migration
|
||||
docker compose exec app php console.php db:rollback
|
||||
```
|
||||
|
||||
### Stack 2: Gitea (Optional - Shared MySQL)
|
||||
|
||||
**If using Gitea's MySQL**:
|
||||
```env
|
||||
DB_HOST=mysql
|
||||
DB_PORT=3306
|
||||
DB_NAME=michaelschiemer # Create separate database
|
||||
DB_USER=appuser # Create dedicated user
|
||||
```
|
||||
|
||||
**Create Database**:
|
||||
```bash
|
||||
docker exec -it mysql mysql -u root -p
|
||||
CREATE DATABASE michaelschiemer CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
||||
CREATE USER 'appuser'@'%' IDENTIFIED BY '<password>';
|
||||
GRANT ALL PRIVILEGES ON michaelschiemer.* TO 'appuser'@'%';
|
||||
FLUSH PRIVILEGES;
|
||||
```
|
||||
|
||||
## Backup & Recovery
|
||||
|
||||
### Manual Backup
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# backup-application.sh
|
||||
|
||||
BACKUP_DIR="/backups/application"
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
|
||||
mkdir -p $BACKUP_DIR
|
||||
|
||||
# Backup application storage volume
|
||||
docker run --rm \
|
||||
-v app-storage:/data \
|
||||
-v $BACKUP_DIR:/backup \
|
||||
alpine tar czf /backup/app-storage-$DATE.tar.gz -C /data .
|
||||
|
||||
# Backup application logs
|
||||
docker run --rm \
|
||||
-v app-logs:/data \
|
||||
-v $BACKUP_DIR:/backup \
|
||||
alpine tar czf /backup/app-logs-$DATE.tar.gz -C /data .
|
||||
|
||||
# Backup Redis data (if persistence enabled)
|
||||
docker run --rm \
|
||||
-v redis-data:/data \
|
||||
-v $BACKUP_DIR:/backup \
|
||||
alpine tar czf /backup/redis-data-$DATE.tar.gz -C /data .
|
||||
|
||||
# Backup .env file
|
||||
cp .env $BACKUP_DIR/env-$DATE
|
||||
|
||||
echo "Backup completed: $BACKUP_DIR/*-$DATE.*"
|
||||
```
|
||||
|
||||
### Restore from Backup
|
||||
|
||||
```bash
|
||||
# Stop stack
|
||||
docker compose down
|
||||
|
||||
# Restore storage volume
|
||||
docker run --rm \
|
||||
-v app-storage:/data \
|
||||
-v /backups/application:/backup \
|
||||
alpine tar xzf /backup/app-storage-YYYYMMDD_HHMMSS.tar.gz -C /data
|
||||
|
||||
# Restore logs
|
||||
docker run --rm \
|
||||
-v app-logs:/data \
|
||||
-v /backups/application:/backup \
|
||||
alpine tar xzf /backup/app-logs-YYYYMMDD_HHMMSS.tar.gz -C /data
|
||||
|
||||
# Restore Redis data
|
||||
docker run --rm \
|
||||
-v redis-data:/data \
|
||||
-v /backups/application:/backup \
|
||||
alpine tar xzf /backup/redis-data-YYYYMMDD_HHMMSS.tar.gz -C /data
|
||||
|
||||
# Restore .env
|
||||
cp /backups/application/env-YYYYMMDD_HHMMSS .env
|
||||
|
||||
# Start stack
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### Automated Backups
|
||||
|
||||
Add to crontab:
|
||||
```bash
|
||||
# Daily backup at 2 AM
|
||||
0 2 * * * /path/to/backup-application.sh
|
||||
|
||||
# Keep only last 14 days
|
||||
0 3 * * * find /backups/application -type f -mtime +14 -delete
|
||||
```
|
||||
|
||||
### Redis Persistence
|
||||
|
||||
**Automatic Persistence** (configured in docker-compose.yml):
|
||||
- RDB snapshots: save 900 1, save 300 10, save 60 10000
|
||||
- AOF (Append-Only File): appendonly yes, appendfsync everysec
|
||||
|
||||
**Manual Redis Backup**:
|
||||
```bash
|
||||
# Trigger manual save
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> SAVE
|
||||
|
||||
# Export RDB file
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> BGSAVE
|
||||
docker cp redis:/data/dump.rdb ./redis-backup.rdb
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Health Checks
|
||||
|
||||
```bash
|
||||
# Check all service health
|
||||
docker compose ps
|
||||
|
||||
# Application health
|
||||
curl https://michaelschiemer.de/health
|
||||
|
||||
# Redis health
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> PING
|
||||
# Expected: PONG
|
||||
|
||||
# PHP-FPM health
|
||||
docker compose exec app php-fpm-healthcheck
|
||||
```
|
||||
|
||||
### Log Management
|
||||
|
||||
```bash
|
||||
# View all logs
|
||||
docker compose logs
|
||||
|
||||
# Follow logs for specific service
|
||||
docker compose logs -f app
|
||||
docker compose logs -f nginx
|
||||
docker compose logs -f queue-worker
|
||||
docker compose logs -f scheduler
|
||||
|
||||
# View last 100 lines
|
||||
docker compose logs --tail=100 app
|
||||
|
||||
# View logs since specific time
|
||||
docker compose logs --since 2024-01-01T00:00:00 app
|
||||
|
||||
# Search logs
|
||||
docker compose logs app | grep ERROR
|
||||
docker compose logs nginx | grep 404
|
||||
```
|
||||
|
||||
### Performance Metrics
|
||||
|
||||
```bash
|
||||
# Container resource usage
|
||||
docker stats app nginx redis queue-worker scheduler
|
||||
|
||||
# Redis statistics
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> INFO stats
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> INFO memory
|
||||
|
||||
# Redis slow log
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> SLOWLOG GET 10
|
||||
|
||||
# Nginx status (if enabled)
|
||||
docker compose exec nginx curl http://localhost/nginx_status
|
||||
```
|
||||
|
||||
### Application Monitoring
|
||||
|
||||
```bash
|
||||
# Application metrics endpoint (if implemented)
|
||||
curl https://michaelschiemer.de/metrics
|
||||
|
||||
# Queue statistics
|
||||
docker compose exec app php console.php queue:stats
|
||||
|
||||
# Cache hit rate
|
||||
docker compose exec app php console.php cache:stats
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Application Not Accessible
|
||||
|
||||
```bash
|
||||
# Check service status
|
||||
docker compose ps
|
||||
|
||||
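# Check Traefik's own health (requires the ping endpoint to be enabled; routing problems show up in the Traefik logs/dashboard instead)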
|
||||
docker exec traefik traefik healthcheck
|
||||
|
||||
# View Nginx logs
|
||||
docker compose logs nginx
|
||||
|
||||
# Test internal connectivity
|
||||
docker compose exec nginx nc -zv app 9000   # PHP-FPM speaks FastCGI on port 9000, not HTTP, so check the TCP port instead of using curl
|
||||
```
|
||||
|
||||
### PHP-FPM Errors
|
||||
|
||||
```bash
|
||||
# View PHP-FPM logs
|
||||
docker compose logs app
|
||||
|
||||
# Check PHP-FPM pool status
|
||||
docker compose exec app php-fpm-healthcheck
|
||||
|
||||
# Restart PHP-FPM
|
||||
docker compose restart app
|
||||
|
||||
# Check PHP configuration
|
||||
docker compose exec app php -i | grep error
|
||||
```
|
||||
|
||||
### Redis Connection Issues
|
||||
|
||||
```bash
|
||||
# Test Redis connection
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> PING
|
||||
|
||||
# Check Redis logs
|
||||
docker compose logs redis
|
||||
|
||||
# Verify Redis password
|
||||
grep REDIS_PASSWORD .env
|
||||
|
||||
# Test connection from app
|
||||
docker compose exec app php console.php redis:test
|
||||
```
|
||||
|
||||
### Queue Worker Not Processing Jobs
|
||||
|
||||
```bash
|
||||
# Check queue worker status
|
||||
docker compose ps queue-worker
|
||||
|
||||
# View queue worker logs
|
||||
docker compose logs queue-worker
|
||||
|
||||
# Check if queue worker process is running
|
||||
docker compose exec queue-worker pgrep -f queue:work
|
||||
|
||||
# Restart queue worker
|
||||
docker compose restart queue-worker
|
||||
|
||||
# Check queue size
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> LLEN queues:default
|
||||
```
|
||||
|
||||
### Scheduler Not Running
|
||||
|
||||
```bash
|
||||
# Check scheduler status
|
||||
docker compose ps scheduler
|
||||
|
||||
# View scheduler logs
|
||||
docker compose logs scheduler
|
||||
|
||||
# Verify scheduler process
|
||||
docker compose exec scheduler pgrep -f scheduler:run
|
||||
|
||||
# Restart scheduler
|
||||
docker compose restart scheduler
|
||||
|
||||
# Test scheduler manually
|
||||
docker compose exec scheduler php console.php scheduler:run --once
|
||||
```
|
||||
|
||||
### High Memory Usage
|
||||
|
||||
```bash
|
||||
# Check container memory usage
|
||||
docker stats --no-stream
|
||||
|
||||
# Redis memory usage
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> INFO memory
|
||||
|
||||
# Adjust Redis max memory (the --maxmemory flag of the redis service command in docker-compose.yml)
|
||||
# --maxmemory 512mb (default)
|
||||
|
||||
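# Clear Redis if needed (WARNING: FLUSHDB also deletes sessions and queued jobs when they share this Redis database with the cache)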
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> FLUSHDB
|
||||
```
|
||||
|
||||
### Slow Response Times
|
||||
|
||||
```bash
|
||||
# Check Nginx access logs
|
||||
docker compose logs nginx | grep "request_time"
|
||||
|
||||
# Enable PHP slow log (in php.ini or PHP-FPM pool config)
|
||||
# slowlog = /var/log/php-fpm-slow.log
|
||||
# request_slowlog_timeout = 5s
|
||||
|
||||
# Check database query performance
|
||||
docker compose exec app php console.php db:explain
|
||||
|
||||
# Check cache hit ratio
|
||||
docker compose exec redis redis-cli -a <REDIS_PASSWORD> INFO stats
|
||||
```
|
||||
|
||||
### SSL Certificate Issues
|
||||
|
||||
```bash
|
||||
# Check Traefik certificate
|
||||
docker exec traefik cat /acme.json | grep michaelschiemer.de
|
||||
|
||||
# Restart Traefik to re-run its certificate check (this does not force renewal of certificates that are still valid)
|
||||
docker restart traefik
|
||||
|
||||
# Test SSL
|
||||
openssl s_client -connect michaelschiemer.de:443 -servername michaelschiemer.de < /dev/null
|
||||
```
|
||||
|
||||
### Database Connection Errors
|
||||
|
||||
```bash
|
||||
# Test database connection from app
|
||||
docker compose exec app php console.php db:test
|
||||
|
||||
# Verify database is accessible
|
||||
# If Stack 5 (PostgreSQL):
|
||||
docker exec postgres pg_isready
|
||||
|
||||
# If Stack 2 (MySQL):
|
||||
docker exec mysql mysqladmin ping
|
||||
|
||||
# Check database credentials in .env
|
||||
grep DB_ .env
|
||||
|
||||
# Test connection manually
|
||||
docker compose exec app php -r "new PDO('mysql:host=mysql;dbname=michaelschiemer', 'appuser', 'password');"
|
||||
```
|
||||
|
||||
## Security
|
||||
|
||||
### Security Best Practices
|
||||
|
||||
1. **Environment Variables**: Never commit .env to version control
|
||||
2. **Strong Passwords**: Use `openssl rand -base64 32` for all passwords
|
||||
3. **Redis Password**: Always set REDIS_PASSWORD in production
|
||||
4. **Database Access**: Use dedicated database user with minimal privileges
|
||||
5. **File Permissions**: Ensure storage directories have correct ownership
|
||||
6. **Updates**: Regularly update Docker images and dependencies
|
||||
7. **Network Isolation**: app-internal network isolates services from external access
|
||||
|
||||
### Security Headers
|
||||
|
||||
**Nginx Configuration** (`nginx/conf.d/default.conf`):
|
||||
- X-Frame-Options: SAMEORIGIN
|
||||
- X-Content-Type-Options: nosniff
|
||||
- X-XSS-Protection: 1; mode=block
|
||||
- Referrer-Policy: strict-origin-when-cross-origin
|
||||
|
||||
**Traefik Middleware** (via default-chain@file):
|
||||
- HSTS
|
||||
- Additional security headers
|
||||
- Rate limiting
|
||||
|
||||
### Rate Limiting
|
||||
|
||||
**Nginx Rate Limits** (configured in nginx/conf.d/default.conf):
|
||||
- API endpoints: 10 requests/second (burst 20)
|
||||
- General requests: 30 requests/second (burst 50)
|
||||
|
||||
**Adjust Rate Limits**:
|
||||
```nginx
|
||||
# Edit nginx/conf.d/default.conf
|
||||
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
|
||||
limit_req_zone $binary_remote_addr zone=general_limit:10m rate=30r/s;
|
||||
```
|
||||
|
||||
### File Upload Security
|
||||
|
||||
```nginx
|
||||
# Client body size limit (nginx/conf.d/default.conf)
|
||||
client_max_body_size 100M; # Adjust based on requirements
|
||||
```
|
||||
|
||||
### Sensitive File Protection
|
||||
|
||||
**Automatically Protected** (nginx/conf.d/default.conf):
|
||||
- .env, .git, .gitignore, .gitattributes
|
||||
- composer.json, composer.lock
|
||||
- package.json, package-lock.json
|
||||
- /storage (except /storage/public)
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Redis Optimization
|
||||
|
||||
**Memory Management**:
|
||||
```yaml
|
||||
# docker-compose.yml
|
||||
--maxmemory 512mb # Adjust based on available RAM
|
||||
--maxmemory-policy allkeys-lru # Eviction policy
|
||||
```
|
||||
|
||||
**Persistence**:
|
||||
```yaml
|
||||
# Adjust RDB snapshot frequency
|
||||
--save 900 1 # After 900 sec if 1 key changed
|
||||
--save 300 10 # After 300 sec if 10 keys changed
|
||||
--save 60 10000 # After 60 sec if 10000 keys changed
|
||||
|
||||
# AOF settings
|
||||
--appendonly yes
|
||||
--appendfsync everysec # or: always, no
|
||||
```
|
||||
|
||||
**Connection Pooling**:
|
||||
```php
|
||||
// In application code
|
||||
$redis = new Redis();
|
||||
$redis->pconnect('redis', 6379); // Persistent connection
|
||||
```
|
||||
|
||||
### Nginx Optimization
|
||||
|
||||
**Worker Processes** (add to nginx.conf if needed):
|
||||
```nginx
|
||||
worker_processes auto;
|
||||
worker_connections 1024;  # must be placed inside the events { } block of nginx.conf
|
||||
```
|
||||
|
||||
**Gzip Compression** (configured):
|
||||
- Level: 6 (balance between compression ratio and CPU usage)
|
||||
- Types: text/plain, text/css, text/javascript, application/json, etc.
|
||||
- Min length: 1024 bytes
|
||||
|
||||
**Static File Caching** (configured):
|
||||
- Expires: 1 year for immutable assets
|
||||
- Cache-Control: public, immutable
|
||||
- Access log: disabled for static files
|
||||
|
||||
**Buffer Tuning** (configured):
|
||||
```nginx
|
||||
fastcgi_buffer_size 128k;
|
||||
fastcgi_buffers 256 16k;
|
||||
fastcgi_busy_buffers_size 256k;
|
||||
fastcgi_temp_file_write_size 256k;
|
||||
```
|
||||
|
||||
### PHP-FPM Optimization
|
||||
|
||||
**Pool Configuration** (adjust in Dockerfile or php-fpm.conf):
|
||||
```ini
|
||||
pm = dynamic
|
||||
pm.max_children = 50
|
||||
pm.start_servers = 5
|
||||
pm.min_spare_servers = 5
|
||||
pm.max_spare_servers = 35
|
||||
pm.max_requests = 500
|
||||
```
|
||||
|
||||
**OPcache** (enable in php.ini):
|
||||
```ini
|
||||
opcache.enable=1
|
||||
opcache.memory_consumption=128
|
||||
opcache.interned_strings_buffer=8
|
||||
opcache.max_accelerated_files=10000
|
||||
opcache.revalidate_freq=2
|
||||
```
|
||||
|
||||
**Timeout Settings** (configured in nginx):
|
||||
```nginx
|
||||
fastcgi_connect_timeout 60s;
|
||||
fastcgi_send_timeout 180s;
|
||||
fastcgi_read_timeout 180s;
|
||||
```
|
||||
|
||||
### Queue Worker Optimization
|
||||
|
||||
**Worker Count**:
|
||||
```bash
|
||||
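# Run multiple queue workers for parallel processing (first remove `container_name: queue-worker` from docker-compose.yml; a service with a fixed container name cannot be scaled beyond one container)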
|
||||
docker compose up -d --scale queue-worker=3
|
||||
```
|
||||
|
||||
**Queue Configuration** (.env):
|
||||
```env
|
||||
QUEUE_WORKER_SLEEP=3 # Lower = more responsive, higher = less CPU
|
||||
QUEUE_WORKER_TRIES=3 # Retry failed jobs
|
||||
QUEUE_WORKER_TIMEOUT=60 # Increase for long-running jobs
|
||||
```
|
||||
|
||||
**Separate Queues**:
|
||||
```bash
|
||||
# Start workers for specific queues
|
||||
docker compose exec app php console.php queue:work --queue=high-priority
|
||||
docker compose exec app php console.php queue:work --queue=emails
|
||||
docker compose exec app php console.php queue:work --queue=default
|
||||
```
|
||||
|
||||
### Database Query Optimization
|
||||
|
||||
```bash
|
||||
# Analyze slow queries
|
||||
docker compose exec app php console.php db:explain
|
||||
|
||||
# Enable query logging
|
||||
docker compose exec app php console.php db:log enable
|
||||
|
||||
# Cache query results
|
||||
docker compose exec app php console.php cache:queries
|
||||
```
|
||||
|
||||
## Scheduler Configuration
|
||||
|
||||
### Cron Jobs
|
||||
|
||||
**Scheduler runs** `php console.php scheduler:run` continuously.
|
||||
|
||||
**Define Scheduled Tasks** (in application code):
|
||||
```php
|
||||
// Example: app/Console/Kernel.php
|
||||
protected function schedule(Schedule $schedule): void
|
||||
{
|
||||
// Run every minute
|
||||
$schedule->command('cache:clear')->everyMinute();
|
||||
|
||||
// Run hourly
|
||||
$schedule->command('reports:generate')->hourly();
|
||||
|
||||
// Run daily at 2 AM
|
||||
$schedule->command('cleanup:old-logs')->dailyAt('02:00');
|
||||
|
||||
// Run weekly on Sundays
|
||||
$schedule->command('backup:database')->weekly();
|
||||
}
|
||||
```
|
||||
|
||||
### Scheduler Monitoring
|
||||
|
||||
```bash
|
||||
# View scheduler logs
|
||||
docker compose logs -f scheduler
|
||||
|
||||
# List scheduled tasks
|
||||
docker compose exec app php console.php scheduler:list
|
||||
|
||||
# Run scheduler manually (for testing)
|
||||
docker compose exec app php console.php scheduler:run --once
|
||||
```
|
||||
|
||||
## Update Stack
|
||||
|
||||
### Update Application Code
|
||||
|
||||
```bash
|
||||
# Build new image
|
||||
docker build -t registry.michaelschiemer.de/michaelschiemer-app:latest .
|
||||
|
||||
# Push to registry
|
||||
docker push registry.michaelschiemer.de/michaelschiemer-app:latest
|
||||
|
||||
# Pull and recreate containers
|
||||
docker compose pull
|
||||
docker compose up -d --force-recreate app queue-worker scheduler
|
||||
|
||||
# Run migrations if needed
|
||||
docker compose exec app php console.php db:migrate
|
||||
```
|
||||
|
||||
### Update Nginx Configuration
|
||||
|
||||
```bash
|
||||
# Edit nginx configuration
|
||||
nano nginx/conf.d/default.conf
|
||||
|
||||
# Test configuration
|
||||
docker compose exec nginx nginx -t
|
||||
|
||||
# Reload Nginx
|
||||
docker compose exec nginx nginx -s reload
|
||||
|
||||
# Or restart Nginx
|
||||
docker compose restart nginx
|
||||
```
|
||||
|
||||
### Update Stack Configuration
|
||||
|
||||
```bash
|
||||
# Pull latest images
|
||||
docker compose pull
|
||||
|
||||
# Recreate containers
|
||||
docker compose up -d
|
||||
|
||||
# Verify
|
||||
docker compose ps
|
||||
```
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **Docker Compose Documentation**: https://docs.docker.com/compose/
|
||||
- **Nginx Documentation**: https://nginx.org/en/docs/
|
||||
- **Redis Documentation**: https://redis.io/documentation
|
||||
- **PHP-FPM Documentation**: https://www.php.net/manual/en/install.fpm.php
|
||||
- **Traefik v3 Documentation**: https://doc.traefik.io/traefik/
|
||||
|
||||
## Stack Integration Summary
|
||||
|
||||
**Depends On**:
|
||||
- Stack 1 (Traefik) - SSL and reverse proxy
|
||||
- Stack 3 (Docker Registry) - Application image storage
|
||||
- Stack 5 (Database) - Data persistence
|
||||
|
||||
**Provides**:
|
||||
- PHP application runtime
|
||||
- Web server with SSL
|
||||
- Background job processing
|
||||
- Scheduled task execution
|
||||
- Caching infrastructure
|
||||
215
deployment/stacks/application/docker-compose.yml
Normal file
215
deployment/stacks/application/docker-compose.yml
Normal file
@@ -0,0 +1,215 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
# PHP-FPM Application Runtime
|
||||
app:
|
||||
image: git.michaelschiemer.de:5000/framework:latest
|
||||
container_name: app
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- app-internal
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
- APP_ENV=${APP_ENV:-production}
|
||||
- APP_DEBUG=${APP_DEBUG:-false}
|
||||
- APP_URL=${APP_URL:-https://michaelschiemer.de}
|
||||
# Database
|
||||
- DB_HOST=${DB_HOST:-mysql}
|
||||
- DB_PORT=${DB_PORT:-3306}
|
||||
- DB_NAME=${DB_NAME}
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASS=${DB_PASS}
|
||||
# Redis
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||
# Cache
|
||||
- CACHE_DRIVER=redis
|
||||
- CACHE_PREFIX=${CACHE_PREFIX:-app}
|
||||
# Session
|
||||
- SESSION_DRIVER=redis
|
||||
- SESSION_LIFETIME=${SESSION_LIFETIME:-120}
|
||||
# Queue
|
||||
- QUEUE_DRIVER=redis
|
||||
- QUEUE_CONNECTION=default
|
||||
volumes:
|
||||
- app-storage:/var/www/html/storage
|
||||
- app-logs:/var/www/html/storage/logs
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "php-fpm-healthcheck"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
|
||||
# Nginx Web Server
|
||||
nginx:
|
||||
image: nginx:1.25-alpine
|
||||
container_name: nginx
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- traefik-public
|
||||
- app-internal
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
volumes:
|
||||
- ./nginx/conf.d:/etc/nginx/conf.d:ro
|
||||
- app-storage:/var/www/html/storage:ro
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
# HTTP Router
|
||||
- "traefik.http.routers.app.rule=Host(`${APP_DOMAIN:-michaelschiemer.de}`)"
|
||||
- "traefik.http.routers.app.entrypoints=websecure"
|
||||
- "traefik.http.routers.app.tls=true"
|
||||
- "traefik.http.routers.app.tls.certresolver=letsencrypt"
|
||||
# Service
|
||||
- "traefik.http.services.app.loadbalancer.server.port=80"
|
||||
# Middleware
|
||||
- "traefik.http.routers.app.middlewares=default-chain@file"
|
||||
# Network
|
||||
- "traefik.docker.network=traefik-public"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
depends_on:
|
||||
app:
|
||||
condition: service_healthy
|
||||
|
||||
# Redis Cache/Session/Queue Backend
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: redis
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- app-internal
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
command: >
|
||||
redis-server
|
||||
--requirepass ${REDIS_PASSWORD:?REDIS_PASSWORD must be set in .env}
|
||||
--maxmemory 512mb
|
||||
--maxmemory-policy allkeys-lru
|
||||
--save 900 1
|
||||
--save 300 10
|
||||
--save 60 10000
|
||||
--appendonly yes
|
||||
--appendfsync everysec
|
||||
volumes:
|
||||
- redis-data:/data
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "redis-cli --no-auth-warning -a \"${REDIS_PASSWORD}\" ping | grep -q PONG"]  # authenticate (server uses --requirepass) and require a real PONG; plain PING does not write a key
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
|
||||
# Queue Worker (Background Jobs)
|
||||
queue-worker:
|
||||
image: git.michaelschiemer.de:5000/framework:latest
|
||||
container_name: queue-worker
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- app-internal
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
- APP_ENV=${APP_ENV:-production}
|
||||
- APP_DEBUG=${APP_DEBUG:-false}
|
||||
# Database
|
||||
- DB_HOST=${DB_HOST:-mysql}
|
||||
- DB_PORT=${DB_PORT:-3306}
|
||||
- DB_NAME=${DB_NAME}
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASS=${DB_PASS}
|
||||
# Redis
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||
# Queue
|
||||
- QUEUE_DRIVER=redis
|
||||
- QUEUE_CONNECTION=default
|
||||
- QUEUE_WORKER_SLEEP=${QUEUE_WORKER_SLEEP:-3}
|
||||
- QUEUE_WORKER_TRIES=${QUEUE_WORKER_TRIES:-3}
|
||||
- QUEUE_WORKER_TIMEOUT=${QUEUE_WORKER_TIMEOUT:-60}
|
||||
volumes:
|
||||
- app-storage:/var/www/html/storage
|
||||
- app-logs:/var/www/html/storage/logs
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
command: php console.php queue:work --queue=default --timeout=${QUEUE_WORKER_TIMEOUT:-60}
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pgrep -f '[q]ueue:work' || exit 1"]  # [q] keeps the pattern from matching the healthcheck shell's own command line
|
||||
interval: 60s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
depends_on:
|
||||
app:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
|
||||
# Scheduler (Cron Jobs)
|
||||
scheduler:
|
||||
image: git.michaelschiemer.de:5000/framework:latest
|
||||
container_name: scheduler
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- app-internal
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
- APP_ENV=${APP_ENV:-production}
|
||||
- APP_DEBUG=${APP_DEBUG:-false}
|
||||
# Database
|
||||
- DB_HOST=${DB_HOST:-mysql}
|
||||
- DB_PORT=${DB_PORT:-3306}
|
||||
- DB_NAME=${DB_NAME}
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASS=${DB_PASS}
|
||||
# Redis
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||
volumes:
|
||||
- app-storage:/var/www/html/storage
|
||||
- app-logs:/var/www/html/storage/logs
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
command: php console.php scheduler:run
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pgrep -f '[s]cheduler:run' || exit 1"]  # [s] keeps the pattern from matching the healthcheck shell's own command line
|
||||
interval: 60s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
depends_on:
|
||||
app:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
|
||||
volumes:
|
||||
app-storage:
|
||||
name: app-storage
|
||||
app-logs:
|
||||
name: app-logs
|
||||
redis-data:
|
||||
name: redis-data
|
||||
|
||||
networks:
|
||||
traefik-public:
|
||||
external: true
|
||||
app-internal:
|
||||
name: app-internal
|
||||
driver: bridge
|
||||
129
deployment/stacks/application/nginx/conf.d/default.conf
Normal file
129
deployment/stacks/application/nginx/conf.d/default.conf
Normal file
@@ -0,0 +1,129 @@
|
||||
# Nginx Configuration for PHP-FPM Application
|
||||
# Optimized for production with security headers and performance tuning
|
||||
|
||||
# Upstream PHP-FPM
|
||||
upstream php-fpm {
|
||||
server app:9000;
|
||||
}
|
||||
|
||||
# Rate limiting zones
|
||||
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
|
||||
limit_req_zone $binary_remote_addr zone=general_limit:10m rate=30r/s;
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
server_name _;
|
||||
|
||||
root /var/www/html/public;
|
||||
index index.php index.html;
|
||||
|
||||
# Security Headers (additional to Traefik middleware)
|
||||
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||
add_header X-Content-Type-Options "nosniff" always;
|
||||
add_header X-XSS-Protection "1; mode=block" always;
|
||||
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
|
||||
|
||||
# Client body size limit (adjust for file uploads)
|
||||
client_max_body_size 100M;
|
||||
|
||||
# Gzip compression
|
||||
gzip on;
|
||||
gzip_vary on;
|
||||
gzip_min_length 1024;
|
||||
gzip_comp_level 6;
|
||||
gzip_types
|
||||
text/plain
|
||||
text/css
|
||||
text/javascript
|
||||
application/json
|
||||
application/javascript
|
||||
application/x-javascript
|
||||
text/xml
|
||||
application/xml
|
||||
application/xml+rss
|
||||
image/svg+xml;
|
||||
|
||||
# Logging
|
||||
access_log /var/log/nginx/access.log combined;
|
||||
error_log /var/log/nginx/error.log warn;
|
||||
|
||||
# Health check endpoint (for Docker healthcheck)
|
||||
location /health {
|
||||
access_log off;
|
||||
return 200 "healthy\n";
|
||||
default_type text/plain;
|
||||
}
|
||||
|
||||
# Deny access to sensitive files
|
||||
location ~ /\. {
|
||||
deny all;
|
||||
access_log off;
|
||||
log_not_found off;
|
||||
}
|
||||
|
||||
location ~ ^/(\.env|\.git|\.gitignore|\.gitattributes|composer\.(json|lock)|package(-lock)?\.json) {
|
||||
deny all;
|
||||
access_log off;
|
||||
log_not_found off;
|
||||
}
|
||||
|
||||
# Static files - serve directly for performance
|
||||
location ~* \.(jpg|jpeg|png|gif|ico|css|js|svg|woff|woff2|ttf|eot|webp)$ {
|
||||
expires 1y;
|
||||
add_header Cache-Control "public, immutable";
|
||||
access_log off;
|
||||
try_files $uri =404;
|
||||
}
|
||||
|
||||
# API endpoints - rate limited
|
||||
location ^~ /api/ {
|
||||
limit_req zone=api_limit burst=20 nodelay;
|
||||
limit_req_status 429;
|
||||
|
||||
try_files $uri $uri/ /index.php?$query_string;
|
||||
}
|
||||
|
||||
# PHP-FPM processing
|
||||
location ~ \.php$ {
|
||||
limit_req zone=general_limit burst=50 nodelay;
|
||||
|
||||
try_files $uri =404;
|
||||
fastcgi_split_path_info ^(.+\.php)(/.+)$;
|
||||
|
||||
fastcgi_pass php-fpm;
|
||||
fastcgi_index index.php;
|
||||
fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name;
|
||||
fastcgi_param PATH_INFO $fastcgi_path_info;
|
||||
|
||||
# FastCGI parameters
|
||||
include fastcgi_params;
|
||||
|
||||
# PHP-FPM timeouts
|
||||
fastcgi_connect_timeout 60s;
|
||||
fastcgi_send_timeout 180s;
|
||||
fastcgi_read_timeout 180s;
|
||||
|
||||
# Buffer settings
|
||||
fastcgi_buffer_size 128k;
|
||||
fastcgi_buffers 256 16k;
|
||||
fastcgi_busy_buffers_size 256k;
|
||||
fastcgi_temp_file_write_size 256k;
|
||||
|
||||
# Hide PHP version
|
||||
fastcgi_hide_header X-Powered-By;
|
||||
}
|
||||
|
||||
# Fallback to index.php for non-existent files (framework routing)
|
||||
location / {
|
||||
try_files $uri $uri/ /index.php?$query_string;
|
||||
}
|
||||
|
||||
# Deny access to storage directory (except public subdirectory)
|
||||
location ^~ /storage {
|
||||
deny all;
|
||||
}
|
||||
|
||||
location ^~ /storage/public {
|
||||
allow all;
|
||||
}
|
||||
}
|
||||
20
deployment/stacks/gitea/.env.example
Normal file
20
deployment/stacks/gitea/.env.example
Normal file
@@ -0,0 +1,20 @@
|
||||
# Gitea Configuration
|
||||
# Copy this file to .env and adjust values
|
||||
|
||||
# Timezone
|
||||
TZ=Europe/Berlin
|
||||
|
||||
# Gitea Domain
|
||||
GITEA_DOMAIN=git.michaelschiemer.de
|
||||
|
||||
# MySQL Configuration
|
||||
MYSQL_ROOT_PASSWORD=<generate-strong-password>
|
||||
MYSQL_DATABASE=gitea
|
||||
MYSQL_USER=gitea
|
||||
MYSQL_PASSWORD=<generate-strong-password>
|
||||
|
||||
# Redis Configuration
|
||||
REDIS_PASSWORD=<generate-strong-password>
|
||||
|
||||
# Gitea Settings
|
||||
# Set to false to allow user registration
DISABLE_REGISTRATION=true
|
||||
452
deployment/stacks/gitea/README.md
Normal file
452
deployment/stacks/gitea/README.md
Normal file
@@ -0,0 +1,452 @@
|
||||
# Gitea Stack - Self-Hosted Git Server
|
||||
|
||||
## Overview
|
||||
|
||||
Gitea acts as the central Git server with integrated CI/CD capabilities through Gitea Actions, handling:
|
||||
- Git repository hosting
|
||||
- User and organization management
|
||||
- Pull requests and code reviews
|
||||
- Issue tracking
|
||||
- Gitea Actions for CI/CD (runner runs on development machine)
|
||||
- API for automation
|
||||
|
||||
## Services
|
||||
|
||||
- **git.michaelschiemer.de** - Gitea Web Interface
|
||||
- **git.michaelschiemer.de:2222** - SSH for Git operations
|
||||
- **MySQL 8.0** - Database backend
|
||||
- **Redis 7** - Cache, session, and queue storage
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Traefik Stack Running**
|
||||
```bash
|
||||
cd ../traefik
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
2. **DNS Configuration**
|
||||
Point `git.michaelschiemer.de` to your server IP (94.16.110.151)
|
||||
|
||||
3. **SSH Port Availability**
|
||||
Ensure port 2222 is open in your firewall for Git SSH operations
|
||||
|
||||
## Configuration
|
||||
|
||||
### 1. Create Environment File
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
### 2. Generate Strong Passwords
|
||||
|
||||
```bash
|
||||
# MySQL root password
|
||||
openssl rand -base64 32
|
||||
|
||||
# MySQL gitea password
|
||||
openssl rand -base64 32
|
||||
|
||||
# Redis password
|
||||
openssl rand -base64 32
|
||||
```
|
||||
|
||||
Update `.env` with generated passwords:
|
||||
```env
|
||||
MYSQL_ROOT_PASSWORD=<generated-password-1>
|
||||
MYSQL_PASSWORD=<generated-password-2>
|
||||
REDIS_PASSWORD=<generated-password-3>
|
||||
```
|
||||
|
||||
### 3. Adjust Configuration (Optional)
|
||||
|
||||
Edit `.env` for:
|
||||
- Domain customization
|
||||
- User registration settings
|
||||
- Database configuration
|
||||
|
||||
## Deployment
|
||||
|
||||
### Initial Setup
|
||||
|
||||
```bash
|
||||
# Deploy stack
|
||||
docker compose up -d
|
||||
|
||||
# Check logs
|
||||
docker compose logs -f
|
||||
|
||||
# Wait for MySQL initialization (30-60 seconds)
|
||||
docker compose logs mysql | grep "ready for connections"
|
||||
|
||||
# Verify services are healthy
|
||||
docker compose ps
|
||||
```
|
||||
|
||||
### First Time Configuration
|
||||
|
||||
1. **Access Gitea**: https://git.michaelschiemer.de
|
||||
|
||||
2. **Initial Setup Wizard**:
|
||||
- Database settings are pre-configured via environment variables
|
||||
- Set up admin account:
|
||||
- Username: `admin` (or your preference)
|
||||
- Email: `kontakt@michaelschiemer.de`
|
||||
- Password: Strong password
|
||||
- Server and third-party settings: Use defaults
|
||||
- Click "Install Gitea"
|
||||
|
||||
3. **Verify SSH Access**:
|
||||
```bash
|
||||
# Test SSH connection (keep the user 'git'; Gitea identifies your account by your SSH key)
|
||||
ssh -T -p 2222 git@git.michaelschiemer.de
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Creating a Repository
|
||||
|
||||
1. Log in to https://git.michaelschiemer.de
|
||||
2. Click "+" → "New Repository"
|
||||
3. Fill in repository details
|
||||
4. Clone via HTTPS or SSH:
|
||||
```bash
|
||||
# HTTPS
|
||||
git clone https://git.michaelschiemer.de/username/repo.git
|
||||
|
||||
# SSH
|
||||
git clone ssh://git@git.michaelschiemer.de:2222/username/repo.git
|
||||
```
|
||||
|
||||
### Gitea Actions
|
||||
|
||||
Gitea Actions (GitHub Actions compatible) are enabled by default. To use them:
|
||||
|
||||
1. **Create `.gitea/workflows/` directory** in your repository
|
||||
2. **Add workflow YAML files** (e.g., `deploy.yml`)
|
||||
3. **Register a Runner** (see the Stack 9 documentation for runner setup)
|
||||
|
||||
**Note**: The Gitea Actions Runner should run on your **development machine**, not on the production server. See Stack 9 documentation for runner setup.
|
||||
|
||||
### User Management
|
||||
|
||||
**Disable Registration** (Default):
|
||||
- Set `DISABLE_REGISTRATION=true` in `.env` (already default)
|
||||
- Create users via Admin Panel
|
||||
|
||||
**Enable Registration**:
|
||||
- Set `DISABLE_REGISTRATION=false` in `.env`
|
||||
- Restart: `docker compose restart gitea`
|
||||
|
||||
### Organizations and Teams
|
||||
|
||||
1. Navigate to Organizations
|
||||
2. Create organization
|
||||
3. Add repositories to organization
|
||||
4. Manage teams and permissions
|
||||
|
||||
## API Access
|
||||
|
||||
Gitea provides a comprehensive API:
|
||||
|
||||
```bash
|
||||
# Generate API token
|
||||
# Settings → Applications → Generate New Token
|
||||
|
||||
# Example: List repositories
|
||||
curl -H "Authorization: token YOUR_TOKEN" \
|
||||
https://git.michaelschiemer.de/api/v1/user/repos
|
||||
```
|
||||
|
||||
**API Documentation**: https://git.michaelschiemer.de/api/swagger
|
||||
|
||||
## Backup & Recovery
|
||||
|
||||
### Manual Backup
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Backup script (run on production server)
|
||||
BACKUP_DIR="/backups/gitea"
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
|
||||
# Create backup directory
|
||||
mkdir -p $BACKUP_DIR
|
||||
|
||||
# Backup Gitea data
|
||||
docker run --rm \
|
||||
-v gitea-data:/data \
|
||||
-v $BACKUP_DIR:/backup \
|
||||
alpine tar czf /backup/gitea-data-$DATE.tar.gz -C /data .
|
||||
|
||||
# Backup MySQL database (MYSQL_ROOT_PASSWORD must be exported in the shell, e.g. taken from the stack's .env)
|
||||
docker exec gitea-mysql mysqldump \
|
||||
-u root -p$MYSQL_ROOT_PASSWORD \
|
||||
--all-databases \
|
||||
--single-transaction \
|
||||
--quick \
|
||||
--lock-tables=false \
|
||||
> $BACKUP_DIR/gitea-mysql-$DATE.sql
|
||||
|
||||
# Backup Redis data
|
||||
docker run --rm \
|
||||
-v gitea-redis-data:/data \
|
||||
-v $BACKUP_DIR:/backup \
|
||||
alpine tar czf /backup/gitea-redis-$DATE.tar.gz -C /data .
|
||||
|
||||
echo "Backup completed: $BACKUP_DIR/*-$DATE.*"
|
||||
```
|
||||
|
||||
### Restore from Backup
|
||||
|
||||
```bash
|
||||
# Stop services
|
||||
docker compose down
|
||||
|
||||
# Restore Gitea data
|
||||
docker run --rm \
|
||||
-v gitea-data:/data \
|
||||
-v /backups/gitea:/backup \
|
||||
alpine tar xzf /backup/gitea-data-YYYYMMDD_HHMMSS.tar.gz -C /data
|
||||
|
||||
# Restore MySQL
|
||||
cat /backups/gitea/gitea-mysql-YYYYMMDD_HHMMSS.sql | \
|
||||
docker exec -i gitea-mysql mysql -u root -p$MYSQL_ROOT_PASSWORD
|
||||
|
||||
# Restore Redis
|
||||
docker run --rm \
|
||||
-v gitea-redis-data:/data \
|
||||
-v /backups/gitea:/backup \
|
||||
alpine tar xzf /backup/gitea-redis-YYYYMMDD_HHMMSS.tar.gz -C /data
|
||||
|
||||
# Start services
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### Automated Backups
|
||||
|
||||
Add to crontab on production server:
|
||||
|
||||
```bash
|
||||
# Daily backup at 2 AM
|
||||
0 2 * * * /path/to/backup-gitea.sh
|
||||
|
||||
# Keep only last 7 days
|
||||
0 3 * * * find /backups/gitea -type f -mtime +7 -delete
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Health Checks
|
||||
|
||||
```bash
|
||||
# Check service health
|
||||
docker compose ps
|
||||
|
||||
# Gitea health endpoint
|
||||
curl -f https://git.michaelschiemer.de/api/healthz
|
||||
|
||||
# MySQL health
|
||||
docker exec gitea-mysql mysqladmin ping -h localhost -u root -p$MYSQL_ROOT_PASSWORD
|
||||
|
||||
# Redis health
|
||||
docker exec gitea-redis redis-cli -a $REDIS_PASSWORD ping
|
||||
```
|
||||
|
||||
### Logs
|
||||
|
||||
```bash
|
||||
# All services
|
||||
docker compose logs -f
|
||||
|
||||
# Gitea only
|
||||
docker compose logs -f gitea
|
||||
|
||||
# MySQL only
|
||||
docker compose logs -f mysql
|
||||
|
||||
# Redis only
|
||||
docker compose logs -f redis
|
||||
|
||||
# MySQL slow queries
|
||||
docker exec gitea-mysql tail -f /var/log/mysql/slow-queries.log
|
||||
```
|
||||
|
||||
### Resource Usage
|
||||
|
||||
```bash
|
||||
# Container stats
|
||||
docker stats gitea gitea-mysql gitea-redis
|
||||
|
||||
# Disk usage
|
||||
docker system df -v | grep gitea
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Gitea Not Starting
|
||||
|
||||
```bash
|
||||
# Check logs
|
||||
docker compose logs gitea
|
||||
|
||||
# Common issues:
|
||||
# 1. MySQL not ready - wait 30-60 seconds
|
||||
# 2. Database connection failed - check MYSQL_PASSWORD in .env
|
||||
# 3. Redis connection failed - check REDIS_PASSWORD
|
||||
```
|
||||
|
||||
### SSH Not Working
|
||||
|
||||
```bash
|
||||
# Verify port 2222 is open
|
||||
sudo ufw status | grep 2222
|
||||
|
||||
# Open if needed
|
||||
sudo ufw allow 2222/tcp
|
||||
|
||||
# Test SSH connection
|
||||
ssh -T -p 2222 git@git.michaelschiemer.de
|
||||
|
||||
# Check Gitea SSH settings
|
||||
# Admin Panel → Configuration → Server and Other Services → SSH Server Domain
|
||||
```
|
||||
|
||||
### Database Connection Issues
|
||||
|
||||
```bash
|
||||
# Verify MySQL is running and healthy
|
||||
docker compose ps mysql
|
||||
|
||||
# Test database connection
|
||||
docker exec gitea-mysql mysql -u gitea -p$MYSQL_PASSWORD -e "SELECT 1;"
|
||||
|
||||
# Check MySQL logs
|
||||
docker compose logs mysql | grep -i error
|
||||
```
|
||||
|
||||
### Redis Connection Issues
|
||||
|
||||
```bash
|
||||
# Verify Redis is running
|
||||
docker compose ps redis
|
||||
|
||||
# Test Redis connection
|
||||
docker exec gitea-redis redis-cli -a $REDIS_PASSWORD ping
|
||||
|
||||
# Check Redis logs
|
||||
docker compose logs redis
|
||||
```
|
||||
|
||||
### Performance Issues
|
||||
|
||||
```bash
|
||||
# Check MySQL slow queries
|
||||
docker exec gitea-mysql tail -100 /var/log/mysql/slow-queries.log
|
||||
|
||||
# Analyze MySQL performance
|
||||
docker exec gitea-mysql mysql -u root -p$MYSQL_ROOT_PASSWORD \
|
||||
-e "SHOW PROCESSLIST;"
|
||||
|
||||
# Check Redis memory usage
|
||||
docker exec gitea-redis redis-cli -a $REDIS_PASSWORD INFO memory
|
||||
```
|
||||
|
||||
### Reset Admin Password
|
||||
|
||||
```bash
|
||||
# Connect to Gitea container
|
||||
docker exec -it -u git gitea bash
|
||||
|
||||
# Change admin password
|
||||
gitea admin user change-password --username admin --password new-password
|
||||
```
|
||||
|
||||
## Security
|
||||
|
||||
### Security Best Practices
|
||||
|
||||
1. **Disable User Registration**: Set `DISABLE_REGISTRATION=true`
|
||||
2. **Strong Passwords**: Use generated passwords for all services
|
||||
3. **Regular Updates**: Keep Gitea, MySQL, and Redis updated
|
||||
4. **SSH Keys**: Prefer SSH keys over HTTPS for Git operations
|
||||
5. **2FA**: Enable two-factor authentication for admin accounts
|
||||
6. **API Token Security**: Rotate tokens regularly
|
||||
7. **Firewall**: Only expose ports 80, 443, and 2222
|
||||
|
||||
### Update Stack
|
||||
|
||||
```bash
|
||||
# Pull latest images
|
||||
docker compose pull
|
||||
|
||||
# Recreate containers
|
||||
docker compose up -d
|
||||
|
||||
# Verify
|
||||
docker compose ps
|
||||
```
|
||||
|
||||
### Security Headers
|
||||
|
||||
Security headers are applied via Traefik's `default-chain@file` middleware:
|
||||
- HSTS
|
||||
- Content-Type Nosniff
|
||||
- XSS Protection
|
||||
- Frame Deny
|
||||
- CSP
|
||||
|
||||
## Integration with Other Stacks
|
||||
|
||||
### Docker Registry (Stack 3)
|
||||
|
||||
Gitea Actions can push built images to the private Docker Registry:
|
||||
|
||||
```yaml
|
||||
# .gitea/workflows/deploy.yml
|
||||
- name: Push to Registry
|
||||
run: |
|
||||
echo "${{ secrets.REGISTRY_PASS }}" | docker login registry.michaelschiemer.de -u ${{ secrets.REGISTRY_USER }} --password-stdin
|
||||
docker push registry.michaelschiemer.de/myapp:latest
|
||||
```
|
||||
|
||||
### Application Stack (Stack 4)
|
||||
|
||||
Deploy applications via Gitea Actions + Ansible:
|
||||
|
||||
```yaml
|
||||
- name: Deploy to Production
|
||||
run: |
|
||||
ansible-playbook -i inventory/production deploy.yml
|
||||
```
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### MySQL Optimization
|
||||
|
||||
Adjust `mysql/conf.d/gitea.cnf`:
|
||||
- `innodb_buffer_pool_size`: Increase for more RAM
|
||||
- `max_connections`: Increase for more concurrent users
|
||||
- `slow_query_log`: Monitor slow queries
|
||||
|
||||
### Redis Optimization
|
||||
|
||||
```bash
|
||||
# Add to docker-compose.yml redis command:
|
||||
# --maxmemory 512mb --maxmemory-policy allkeys-lru
|
||||
```
|
||||
|
||||
### Gitea Configuration
|
||||
|
||||
Edit via Admin Panel → Configuration or `app.ini`:
|
||||
- Enable caching for static assets
|
||||
- Adjust session timeout
|
||||
- Configure queue workers for Actions
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **Gitea Documentation**: https://docs.gitea.io/
|
||||
- **Gitea Actions**: https://docs.gitea.com/usage/actions/overview
|
||||
- **API Documentation**: https://git.michaelschiemer.de/api/swagger
|
||||
- **MySQL Tuning**: https://dev.mysql.com/doc/refman/8.0/en/optimization.html
|
||||
@@ -7,7 +7,6 @@ services:
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- postgres
|
||||
- redis
|
||||
networks:
|
||||
- traefik-public
|
||||
- gitea-internal
|
||||
@@ -15,18 +14,16 @@ services:
|
||||
- TZ=Europe/Berlin
|
||||
- USER_UID=1000
|
||||
- USER_GID=1000
|
||||
- POSTGRES_PASSWORD=gitea_password
|
||||
- GITEA__database__DB_TYPE=postgres
|
||||
- GITEA__database__HOST=postgres:5432
|
||||
- GITEA__database__NAME=${POSTGRES_DB:-gitea}
|
||||
- GITEA__database__USER=${POSTGRES_USER:-gitea}
|
||||
- GITEA__database__PASSWD=${POSTGRES_PASSWORD:-gitea_password}
|
||||
- GITEA__cache__ENABLED=true
|
||||
- GITEA__cache__ADAPTER=redis
|
||||
- GITEA__cache__HOST=redis://:${REDIS_PASSWORD:-gitea_redis}@redis:6379/0
|
||||
- GITEA__session__PROVIDER=redis
|
||||
- GITEA__session__PROVIDER_CONFIG=redis://:${REDIS_PASSWORD:-gitea_redis}@redis:6379/1
|
||||
- GITEA__queue__TYPE=redis
|
||||
- GITEA__queue__CONN_STR=redis://:${REDIS_PASSWORD:-gitea_redis}@redis:6379/2
|
||||
- GITEA__cache__ENABLED=false
|
||||
- GITEA__cache__ADAPTER=memory
|
||||
- GITEA__session__PROVIDER=file
|
||||
- GITEA__queue__TYPE=channel
|
||||
- GITEA__server__DOMAIN=${GITEA_DOMAIN:-git.michaelschiemer.de}
|
||||
- GITEA__server__ROOT_URL=https://${GITEA_DOMAIN:-git.michaelschiemer.de}/
|
||||
- GITEA__server__SSH_DOMAIN=${GITEA_DOMAIN:-git.michaelschiemer.de}
|
||||
@@ -43,10 +40,12 @@ services:
|
||||
- "traefik.enable=true"
|
||||
|
||||
# HTTP Router
|
||||
- "traefik.http.routers.gitea.rule=Host(`${GITEA_DOMAIN:-git.michaelschiemer.de}`)"
|
||||
- "traefik.http.routers.gitea.rule=Host(`git.michaelschiemer.de`)"
|
||||
- "traefik.http.routers.gitea.entrypoints=websecure"
|
||||
- "traefik.http.routers.gitea.tls=true"
|
||||
- "traefik.http.routers.gitea.tls.certresolver=letsencrypt"
|
||||
# Explicit priority so this router is matched before the catch-all router (without an explicit priority Traefik ranks routers by rule length)
|
||||
- "traefik.http.routers.gitea.priority=100"
|
||||
|
||||
# Service
|
||||
- "traefik.http.services.gitea.loadbalancer.server.port=3000"
|
||||
@@ -68,9 +67,9 @@ services:
|
||||
- gitea-internal
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
- POSTGRES_DB=${POSTGRES_DB:-gitea}
|
||||
- POSTGRES_USER=${POSTGRES_USER:-gitea}
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-gitea_password}
|
||||
- POSTGRES_DB=gitea
|
||||
- POSTGRES_USER=gitea
|
||||
- POSTGRES_PASSWORD=gitea_password
|
||||
volumes:
|
||||
- postgres-data:/var/lib/postgresql/data
|
||||
healthcheck:
|
||||
@@ -80,31 +79,32 @@ services:
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: gitea-redis
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- gitea-internal
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
volumes:
|
||||
- redis-data:/data
|
||||
command: redis-server --appendonly yes --requirepass ${REDIS_PASSWORD:-gitea_redis}
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
# redis (disabled for now; Gitea configured to not use redis)
|
||||
# redis:
|
||||
# image: redis:7
|
||||
# container_name: gitea-redis
|
||||
# restart: unless-stopped
|
||||
# networks:
|
||||
# - gitea-internal
|
||||
# environment:
|
||||
# - TZ=Europe/Berlin
|
||||
# volumes:
|
||||
# - redis-data:/data
|
||||
# command: redis-server --appendonly yes
|
||||
# healthcheck:
|
||||
# test: ["CMD", "redis-cli", "ping"]
|
||||
# interval: 30s
|
||||
# timeout: 10s
|
||||
# retries: 3
|
||||
# start_period: 10s
|
||||
|
||||
volumes:
|
||||
gitea-data:
|
||||
name: gitea-data
|
||||
postgres-data:
|
||||
name: gitea-postgres-data
|
||||
redis-data:
|
||||
name: gitea-redis-data
|
||||
# redis-data:
|
||||
# name: gitea-redis-data
|
||||
|
||||
networks:
|
||||
traefik-public:
|
||||
|
||||
33
deployment/stacks/gitea/mysql/conf.d/gitea.cnf
Normal file
33
deployment/stacks/gitea/mysql/conf.d/gitea.cnf
Normal file
@@ -0,0 +1,33 @@
|
||||
[mysqld]
|
||||
# Gitea-optimized MySQL configuration
|
||||
|
||||
# Character set
|
||||
character-set-server = utf8mb4
|
||||
collation-server = utf8mb4_unicode_ci
|
||||
|
||||
# InnoDB settings
|
||||
innodb_buffer_pool_size = 256M
|
||||
innodb_log_file_size = 64M
|
||||
innodb_flush_log_at_trx_commit = 2
|
||||
innodb_flush_method = O_DIRECT
|
||||
|
||||
# Connection settings
|
||||
max_connections = 200
|
||||
max_allowed_packet = 64M
|
||||
|
||||
# Query cache: removed in MySQL 8.0, so no query_cache_* settings are configured
|
||||
# Performance schema
|
||||
performance_schema = ON
|
||||
|
||||
# Logging
|
||||
slow_query_log = 1
|
||||
slow_query_log_file = /var/log/mysql/slow-queries.log
|
||||
long_query_time = 2
|
||||
|
||||
# Binary logging for backups
|
||||
log_bin = /var/log/mysql/mysql-bin.log
|
||||
binlog_expire_logs_seconds = 604800 # 7 days
|
||||
max_binlog_size = 100M
|
||||
|
||||
[client]
|
||||
default-character-set = utf8mb4
|
||||
21
deployment/stacks/monitoring/.env.example
Normal file
21
deployment/stacks/monitoring/.env.example
Normal file
@@ -0,0 +1,21 @@
|
||||
# Monitoring Stack Environment Configuration
|
||||
# Copy to .env and configure with your actual values
|
||||
|
||||
# Domain Configuration
|
||||
DOMAIN=michaelschiemer.de
|
||||
|
||||
# Grafana Configuration
|
||||
GRAFANA_ADMIN_USER=admin
|
||||
GRAFANA_ADMIN_PASSWORD=changeme_secure_password
|
||||
|
||||
# Grafana Plugins (comma-separated)
|
||||
# Common useful plugins:
|
||||
# - grafana-clock-panel
|
||||
# - grafana-piechart-panel
|
||||
# - grafana-worldmap-panel
|
||||
GRAFANA_PLUGINS=
|
||||
|
||||
# Prometheus BasicAuth
|
||||
# Generate with: htpasswd -nb admin password
|
||||
# Format: username:hashed_password
|
||||
PROMETHEUS_AUTH=admin:$$apr1$$xyz...
|
||||
751
deployment/stacks/monitoring/README.md
Normal file
751
deployment/stacks/monitoring/README.md
Normal file
@@ -0,0 +1,751 @@
|
||||
# Stack 6: Monitoring (Portainer + Grafana + Prometheus)
|
||||
|
||||
Comprehensive monitoring stack for infrastructure and application observability.
|
||||
|
||||
## Overview
|
||||
|
||||
This stack provides complete monitoring and visualization capabilities for the entire infrastructure:
|
||||
- **Prometheus**: Time-series metrics collection and alerting
|
||||
- **Grafana**: Metrics visualization with pre-configured dashboards
|
||||
- **Portainer**: Container management UI
|
||||
- **Node Exporter**: Host system metrics (CPU, memory, disk, network)
|
||||
- **cAdvisor**: Container resource usage metrics
|
||||
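- **Alertmanager**: Alert routing and management (optional; no Alertmanager service is defined in this stack's `docker-compose.yml`, so Prometheus only evaluates the alert rules)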
|
||||
|
||||
## Features
|
||||
|
||||
### Prometheus
|
||||
- Multi-target scraping (node-exporter, cadvisor, traefik)
|
||||
- 15-second scrape interval for near real-time metrics
|
||||
- 30-day retention period (`--storage.tsdb.retention.time=30d` in docker-compose.yml)
|
||||
- Pre-configured alert rules for critical conditions
|
||||
- Built-in alerting engine
|
||||
- Service discovery via static configs
|
||||
- HTTPS support with BasicAuth protection
|
||||
|
||||
### Grafana
|
||||
- Pre-configured Prometheus datasource
|
||||
- Three comprehensive dashboards:
|
||||
- **Docker Containers**: Container CPU, memory, network I/O, restarts
|
||||
- **Host System**: System CPU, memory, disk, network, uptime
|
||||
- **Traefik**: Request rates, response times, status codes, error rates
|
||||
- Auto-provisioning (no manual configuration needed)
|
||||
- HTTPS access via Traefik
|
||||
- 30-second auto-refresh
|
||||
- Dark theme for reduced eye strain
|
||||
|
||||
### Portainer
|
||||
- Web-based Docker management UI
|
||||
- Container start/stop/restart/logs
|
||||
- Stack management and deployment
|
||||
- Volume and network management
|
||||
- Resource usage visualization
|
||||
- HTTPS access via Traefik
|
||||
|
||||
### Node Exporter
|
||||
- Host system metrics:
|
||||
- CPU usage by core and mode
|
||||
- Memory usage and available memory
|
||||
- Disk usage by filesystem
|
||||
- Network I/O by interface
|
||||
- System load averages
|
||||
- System uptime
|
||||
|
||||
### cAdvisor
|
||||
- Container metrics:
|
||||
- CPU usage per container
|
||||
- Memory usage per container
|
||||
- Network I/O per container
|
||||
- Disk I/O per container
|
||||
- Container restart counts
|
||||
- Container health status
|
||||
|
||||
## Services
|
||||
|
||||
| Service | Domain | Port | Purpose |
|
||||
|---------|--------|------|---------|
|
||||
| Grafana | grafana.michaelschiemer.de | 3000 | Metrics visualization |
|
||||
| Prometheus | prometheus.michaelschiemer.de | 9090 | Metrics collection |
|
||||
| Portainer | portainer.michaelschiemer.de | 9000/9443 | Container management |
|
||||
| Node Exporter | - | 9100 | Host metrics (internal) |
|
||||
| cAdvisor | - | 8080 | Container metrics (internal) |
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Traefik stack deployed and running (Stack 1)
|
||||
- Docker networks: `traefik-public`, `app-internal` (both are declared `external` in docker-compose.yml and must exist before deployment)
|
||||
- Docker Swarm initialized (if using swarm mode)
|
||||
- Domain DNS configured (grafana/prometheus/portainer subdomains)
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
monitoring/
|
||||
├── docker-compose.yml # Main stack definition
|
||||
├── .env.example # Environment template
|
||||
├── prometheus/
|
||||
│ ├── prometheus.yml # Prometheus configuration
|
||||
│ └── alerts.yml # Alert rules
|
||||
├── grafana/
|
||||
│ ├── provisioning/
|
||||
│ │ ├── datasources/
|
||||
│ │ │ └── prometheus.yml # Auto-configured datasource
|
||||
│ │ └── dashboards/
|
||||
│ │ └── dashboard.yml # Dashboard provisioning
|
||||
│ └── dashboards/
|
||||
│ ├── docker-containers.json # Container metrics dashboard
|
||||
│ ├── host-system.json # Host metrics dashboard
|
||||
│ └── traefik.json # Traefik metrics dashboard
|
||||
└── README.md # This file
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### 1. Create Environment File
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
### 2. Configure Environment Variables
|
||||
|
||||
Edit `.env` and set the following variables:
|
||||
|
||||
```bash
|
||||
# Domain Configuration
|
||||
DOMAIN=michaelschiemer.de
|
||||
|
||||
# Grafana Configuration
|
||||
GRAFANA_ADMIN_USER=admin
|
||||
GRAFANA_ADMIN_PASSWORD=<generate-strong-password>
|
||||
|
||||
# Prometheus BasicAuth (consumed by docker-compose.yml as ${PROMETHEUS_AUTH})
|
||||
# Format: username:hashed_password (generate the hash in step 3)
|
||||
PROMETHEUS_AUTH=admin:<htpasswd-hash>
|
||||
|
||||
# Portainer Configuration
|
||||
PORTAINER_ADMIN_PASSWORD=<generate-strong-password>
|
||||
|
||||
# Network Configuration
|
||||
TRAEFIK_NETWORK=traefik-public
|
||||
MONITORING_NETWORK=monitoring
|
||||
```
|
||||
|
||||
### 3. Generate Strong Passwords
|
||||
|
||||
```bash
|
||||
# Generate random passwords
|
||||
openssl rand -base64 32
|
||||
|
||||
# For Prometheus BasicAuth (bcrypt hash)
|
||||
docker run --rm httpd:alpine htpasswd -nbB admin "your-password" | cut -d ":" -f 2
|
||||
```
|
||||
|
||||
### 4. Update Traefik BasicAuth (Optional)
|
||||
|
||||
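If using Prometheus BasicAuth, set `PROMETHEUS_AUTH` in `.env`; docker-compose.yml injects it into the Traefik label. To hard-code the bcrypt hash in docker-compose.yml instead, escape every `$` as `$$`: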
|
||||
|
||||
```yaml
|
||||
- "traefik.http.middlewares.prometheus-auth.basicauth.users=admin:$$2y$$05$$..."
|
||||
```
|
||||
|
||||
## Deployment
|
||||
|
||||
### Deploy Stack
|
||||
|
||||
```bash
|
||||
cd /home/michael/dev/michaelschiemer/deployment/stacks/monitoring
|
||||
|
||||
# Deploy with Docker Compose
|
||||
docker compose up -d
|
||||
|
||||
# Or with Docker Stack (Swarm mode)
|
||||
docker stack deploy -c docker-compose.yml monitoring
|
||||
```
|
||||
|
||||
### Verify Deployment
|
||||
|
||||
```bash
|
||||
# Check running containers
|
||||
docker compose ps
|
||||
|
||||
# Check service logs
|
||||
docker compose logs -f grafana
|
||||
docker compose logs -f prometheus
|
||||
|
||||
# Check Prometheus targets
|
||||
curl -u admin:password https://prometheus.michaelschiemer.de/api/v1/targets
|
||||
```
|
||||
|
||||
### Initial Access
|
||||
|
||||
1. **Grafana**: https://grafana.michaelschiemer.de
|
||||
- Login: `admin` / `<GRAFANA_ADMIN_PASSWORD>`
|
||||
- Dashboards are pre-loaded and ready to use
|
||||
|
||||
2. **Prometheus**: https://prometheus.michaelschiemer.de
|
||||
- BasicAuth: `admin` / `<PROMETHEUS_PASSWORD>`
|
||||
- Check targets at `/targets`
|
||||
- View alerts at `/alerts`
|
||||
|
||||
3. **Portainer**: https://portainer.michaelschiemer.de
|
||||
- First login: Set admin password
|
||||
- Connect to local Docker environment
|
||||
|
||||
## Usage
|
||||
|
||||
### Grafana Dashboards
|
||||
|
||||
#### Docker Containers Dashboard
|
||||
Access: https://grafana.michaelschiemer.de/d/docker-containers
|
||||
|
||||
**Metrics Displayed**:
|
||||
- Container CPU Usage % (per container, timeseries)
|
||||
- Container Memory Usage (bytes per container, timeseries)
|
||||
- Containers Running (current count, stat)
|
||||
- Container Restarts in 5m (rate with thresholds, stat)
|
||||
- Container Network I/O (RX/TX per container, timeseries)
|
||||
|
||||
**Use Cases**:
|
||||
- Identify containers with high resource usage
|
||||
- Monitor container stability (restart rates)
|
||||
- Track network bandwidth consumption
|
||||
- Verify all expected containers are running
|
||||
|
||||
#### Host System Dashboard
|
||||
Access: https://grafana.michaelschiemer.de/d/host-system
|
||||
|
||||
**Metrics Displayed**:
|
||||
- CPU Usage % (historical and current)
|
||||
- Memory Usage % (historical and current)
|
||||
- Disk Usage % (root filesystem, historical and current)
|
||||
- Network I/O (RX/TX by interface)
|
||||
- System Uptime (seconds since boot)
|
||||
|
||||
**Thresholds**:
|
||||
- Green: < 80% usage
|
||||
- Yellow: 80-90% usage
|
||||
- Red: > 90% usage
|
||||
|
||||
**Use Cases**:
|
||||
- Monitor server health and resource utilization
|
||||
- Identify resource bottlenecks
|
||||
- Plan capacity upgrades
|
||||
- Track system stability (uptime)
|
||||
|
||||
#### Traefik Dashboard
|
||||
Access: https://grafana.michaelschiemer.de/d/traefik
|
||||
|
||||
**Metrics Displayed**:
|
||||
- Request Rate by Service (req/s, timeseries)
|
||||
- Response Time p95/p99 (milliseconds, timeseries)
|
||||
- HTTP Status Codes (2xx/4xx/5xx stacked, color-coded)
|
||||
- Service Status (Up/Down per service)
|
||||
- Requests per Minute (total)
|
||||
- 4xx Error Rate (percentage)
|
||||
- 5xx Error Rate (percentage)
|
||||
- Active Services (count)
|
||||
|
||||
**Thresholds**:
|
||||
- 4xx errors: Green < 5%, Yellow < 10%, Red ≥ 10%
|
||||
- 5xx errors: Green < 1%, Yellow < 5%, Red ≥ 5%
|
||||
|
||||
**Use Cases**:
|
||||
- Monitor HTTP traffic patterns
|
||||
- Identify performance issues (high latency)
|
||||
- Track error rates and types
|
||||
- Verify service availability
|
||||
|
||||
### Prometheus Queries
|
||||
|
||||
#### Common PromQL Examples
|
||||
|
||||
**CPU Usage**:
|
||||
```promql
|
||||
# Overall CPU usage
|
||||
100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||
|
||||
# Per-core CPU usage
|
||||
100 - (avg by (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||
```
|
||||
|
||||
**Memory Usage**:
|
||||
```promql
|
||||
# Memory usage percentage
|
||||
100 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100)
|
||||
|
||||
# Memory available in GB
|
||||
node_memory_MemAvailable_bytes / 1024 / 1024 / 1024
|
||||
```
|
||||
|
||||
**Disk Usage**:
|
||||
```promql
|
||||
# Disk usage percentage
|
||||
100 - ((node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100)
|
||||
|
||||
# Disk I/O rate
|
||||
rate(node_disk_io_time_seconds_total[5m])
|
||||
```
|
||||
|
||||
**Container Metrics**:
|
||||
```promql
|
||||
# Container CPU usage
|
||||
sum(rate(container_cpu_usage_seconds_total{name!~".*exporter.*"}[5m])) by (name) * 100
|
||||
|
||||
# Container memory usage
|
||||
sum(container_memory_usage_bytes{name!~".*exporter.*"}) by (name)
|
||||
|
||||
# Container network I/O
|
||||
rate(container_network_receive_bytes_total[5m])
|
||||
rate(container_network_transmit_bytes_total[5m])
|
||||
```
|
||||
|
||||
**Traefik Metrics**:
|
||||
```promql
|
||||
# Request rate by service
|
||||
sum(rate(traefik_service_requests_total[5m])) by (service)
|
||||
|
||||
# Response time percentiles
|
||||
histogram_quantile(0.95, sum(rate(traefik_service_request_duration_seconds_bucket[5m])) by (service, le))
|
||||
|
||||
# Error rate
|
||||
sum(rate(traefik_service_requests_total{code=~"5.."}[5m])) / sum(rate(traefik_service_requests_total[5m])) * 100
|
||||
```
|
||||
|
||||
### Alert Management
|
||||
|
||||
#### Configured Alerts
|
||||
|
||||
Alerts are defined in `prometheus/alerts.yml`:
|
||||
|
||||
1. **HostHighCPU**: CPU usage > 80% for 5 minutes
|
||||
2. **HostHighMemory**: Memory usage > 80% for 5 minutes
|
||||
3. **HostDiskSpaceLow**: Disk usage > 80%
|
||||
4. **ContainerHighCPU**: Container CPU > 80% for 5 minutes
|
||||
5. **ContainerHighMemory**: Container memory > 80% for 5 minutes
|
||||
6. **ServiceDown**: Service unavailable
|
||||
7. **HighErrorRate**: Error rate > 5% for 5 minutes
|
||||
|
||||
#### View Active Alerts
|
||||
|
||||
```bash
|
||||
# Via Prometheus UI
|
||||
https://prometheus.michaelschiemer.de/alerts
|
||||
|
||||
# Via API
|
||||
curl -u admin:password https://prometheus.michaelschiemer.de/api/v1/alerts
|
||||
|
||||
# Check alert rules
|
||||
curl -u admin:password https://prometheus.michaelschiemer.de/api/v1/rules
|
||||
```
|
||||
|
||||
#### Silence Alerts
|
||||
|
||||
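Silences are an Alertmanager feature; Prometheus itself has no UI or API for them. With an Alertmanager instance deployed, silence alerts during maintenance:
|
||||
|
||||
```bash
|
||||
# Silence via Alertmanager's amtool (example; adjust the Alertmanager URL)
|
||||
amtool silence add alertname=HostHighCPU \
|
||||
--alertmanager.url=http://alertmanager:9093 \
|
||||
--duration=1h --comment="planned maintenance"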
|
||||
```
|
||||
|
||||
### Portainer Usage
|
||||
|
||||
#### Container Management
|
||||
|
||||
1. Navigate to https://portainer.michaelschiemer.de
|
||||
2. Select "Local" environment
|
||||
3. Go to "Containers" section
|
||||
4. Available actions:
|
||||
- Start/Stop/Restart containers
|
||||
- View logs (live stream)
|
||||
- Inspect container details
|
||||
- Execute commands in containers
|
||||
- View resource statistics
|
||||
|
||||
#### Stack Management
|
||||
|
||||
1. Go to "Stacks" section
|
||||
2. View deployed stacks
|
||||
3. Actions available:
|
||||
- View stack definition
|
||||
- Update stack (edit compose file)
|
||||
- Stop/Start entire stack
|
||||
- Remove stack
|
||||
|
||||
#### Volume Management
|
||||
|
||||
1. Go to "Volumes" section
|
||||
2. View volume details and size
|
||||
3. Browse volume contents
|
||||
4. Backup/restore volumes
|
||||
|
||||
## Integration with Other Stacks
|
||||
|
||||
### Stack 1: Traefik
|
||||
- Provides HTTPS reverse proxy for Grafana, Prometheus, Portainer
|
||||
- Automatic SSL certificate management
|
||||
- BasicAuth middleware for Prometheus
|
||||
|
||||
### Stack 2: Gitea
|
||||
- Monitor Gitea container resources
|
||||
- Track HTTP requests to Gitea via Traefik dashboard
|
||||
- Alert on Gitea service downtime
|
||||
|
||||
### Stack 3: Docker Registry
|
||||
- Monitor registry container resources
|
||||
- Track registry HTTP requests
|
||||
- Alert on registry unavailability
|
||||
|
||||
### Stack 4: Application
|
||||
- Monitor PHP-FPM, Nginx, Redis, Worker containers
|
||||
- Track application response times
|
||||
- Monitor queue worker health
|
||||
|
||||
### Stack 5: PostgreSQL
|
||||
- Monitor database container resources
|
||||
- Track PostgreSQL metrics (if postgres_exporter added)
|
||||
- Alert on database unavailability
|
||||
|
||||
## Monitoring Best Practices
|
||||
|
||||
### 1. Regular Dashboard Review
|
||||
- Check dashboards daily for anomalies
|
||||
- Review error rates and response times
|
||||
- Monitor resource utilization trends
|
||||
|
||||
### 2. Alert Configuration
|
||||
- Tune alert thresholds based on baseline metrics
|
||||
- Avoid alert fatigue (too many non-critical alerts)
|
||||
- Document alert response procedures
|
||||
|
||||
### 3. Capacity Planning
|
||||
- Review resource usage trends weekly
|
||||
- Plan capacity upgrades before hitting limits
|
||||
- Monitor growth rates for proactive scaling
|
||||
|
||||
### 4. Performance Optimization
|
||||
- Identify containers with high resource usage
|
||||
- Optimize slow endpoints (high p95/p99 latency)
|
||||
- Balance load across services
|
||||
|
||||
### 5. Security Monitoring
|
||||
- Monitor failed authentication attempts
|
||||
- Track unusual traffic patterns
|
||||
- Review service availability trends
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Grafana Issues
|
||||
|
||||
#### Dashboard Not Loading
|
||||
```bash
|
||||
# Check Grafana logs
|
||||
docker compose logs grafana
|
||||
|
||||
# Verify datasource connection
|
||||
docker compose exec grafana wget -qO- http://localhost:3000/api/health
|
||||
|
||||
# Restart Grafana
|
||||
docker compose restart grafana
|
||||
```
|
||||
|
||||
#### Missing Metrics
|
||||
```bash
|
||||
# Check Prometheus datasource
|
||||
docker compose exec grafana wget -qO- http://prometheus:9090/api/v1/targets
|
||||
|
||||
# Verify Prometheus is scraping
|
||||
docker compose logs prometheus | grep "Scrape"
|
||||
|
||||
# Check network connectivity
|
||||
docker compose exec grafana ping prometheus
|
||||
```
|
||||
|
||||
### Prometheus Issues
|
||||
|
||||
#### Targets Down
|
||||
```bash
|
||||
# Check target status
|
||||
curl -u admin:password https://prometheus.michaelschiemer.de/api/v1/targets
|
||||
|
||||
# Verify target services are running
|
||||
docker compose ps
|
||||
|
||||
# Check Prometheus configuration
|
||||
docker compose exec prometheus cat /etc/prometheus/prometheus.yml
|
||||
|
||||
# Reload configuration
|
||||
curl -X POST -u admin:password https://prometheus.michaelschiemer.de/-/reload
|
||||
```
|
||||
|
||||
#### High Memory Usage
|
||||
```bash
|
||||
# Check Prometheus memory
|
||||
docker stats prometheus
|
||||
|
||||
# Reduce retention period in docker-compose.yml:
|
||||
# --storage.tsdb.retention.time=7d
|
||||
|
||||
# Reduce scrape interval in prometheus.yml:
|
||||
# scrape_interval: 30s
|
||||
```
|
||||
|
||||
### Node Exporter Issues
|
||||
|
||||
#### No Host Metrics
|
||||
```bash
|
||||
# Check node-exporter is running
|
||||
docker compose ps node-exporter
|
||||
|
||||
# Test metrics endpoint
|
||||
docker compose exec node-exporter wget -qO- http://localhost:9100/metrics
|
||||
|
||||
# Check Prometheus scraping
|
||||
docker compose logs prometheus | grep node-exporter
|
||||
```
|
||||
|
||||
### cAdvisor Issues
|
||||
|
||||
#### No Container Metrics
|
||||
```bash
|
||||
# Check cAdvisor is running
|
||||
docker compose ps cadvisor
|
||||
|
||||
# Test metrics endpoint
|
||||
docker compose exec cadvisor wget -qO- http://localhost:8080/metrics
|
||||
|
||||
# Verify Docker socket mount
|
||||
docker compose exec cadvisor ls -la /var/run/docker.sock
|
||||
```
|
||||
|
||||
### Portainer Issues
|
||||
|
||||
#### Cannot Access UI
|
||||
```bash
|
||||
# Check Portainer is running
|
||||
docker compose ps portainer
|
||||
|
||||
# Check Traefik routing
|
||||
docker compose -f ../traefik/docker-compose.yml logs
|
||||
|
||||
# Verify network connectivity
|
||||
docker network ls | grep monitoring
|
||||
```
|
||||
|
||||
#### Cannot Connect to Docker
|
||||
```bash
|
||||
# Verify Docker socket permissions
|
||||
ls -la /var/run/docker.sock
|
||||
|
||||
# Check Portainer logs
|
||||
docker compose logs portainer
|
||||
|
||||
# Restart Portainer
|
||||
docker compose restart portainer
|
||||
```
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Prometheus Optimization
|
||||
|
||||
#### Reduce Memory Usage
|
||||
```yaml
|
||||
# In docker-compose.yml, adjust retention:
|
||||
command:
|
||||
- '--storage.tsdb.retention.time=7d' # Reduce from 30d
|
||||
- '--storage.tsdb.retention.size=5GB' # Add size limit
|
||||
```
|
||||
|
||||
#### Optimize Scrape Intervals
|
||||
```yaml
|
||||
# In prometheus/prometheus.yml:
|
||||
global:
|
||||
scrape_interval: 30s # Increase from 15s for less load
|
||||
evaluation_interval: 30s
|
||||
```
|
||||
|
||||
#### Reduce Cardinality
|
||||
```yaml
|
||||
# In prometheus/prometheus.yml, add metric_relabel_configs:
|
||||
metric_relabel_configs:
|
||||
- source_labels: [__name__]
|
||||
regex: 'unused_metric_.*'
|
||||
action: drop
|
||||
```
|
||||
|
||||
### Grafana Optimization
|
||||
|
||||
#### Reduce Query Load
|
||||
```json
|
||||
// In dashboard JSON, adjust refresh rate:
|
||||
"refresh": "1m" // Increase from 30s
|
||||
```
|
||||
|
||||
#### Optimize Panel Queries
|
||||
- Use recording rules for expensive queries
|
||||
- Reduce time range for heavy queries
|
||||
- Use appropriate resolution (step parameter)
|
||||
|
||||
### Storage Optimization
|
||||
|
||||
#### Prometheus Data Volume
|
||||
```bash
|
||||
# Check current size
|
||||
docker run --rm -v prometheus-data:/data:ro alpine du -sh /data
|
||||
|
||||
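# Clean tombstones of deleted series (requires --web.enable-admin-api on Prometheus; the image ships wget, not curl)
|
||||
docker compose exec prometheus wget -qO- --post-data='' http://localhost:9090/api/v1/admin/tsdb/clean_tombstones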
|
||||
```
|
||||
|
||||
#### Grafana Data Volume
|
||||
```bash
|
||||
# Check current size
|
||||
docker run --rm -v grafana-data:/data:ro alpine du -sh /data
|
||||
|
||||
# Reset the Grafana admin password (NOTE: this does not clean sessions or reclaim disk space)
|
||||
docker compose exec grafana grafana-cli admin reset-admin-password <new-password>
|
||||
```
|
||||
|
||||
## Security Considerations
|
||||
|
||||
### 1. Password Security
|
||||
- Use strong, randomly generated passwords
|
||||
- Store passwords securely (password manager)
|
||||
- Rotate passwords regularly
|
||||
- Use bcrypt for Prometheus BasicAuth
|
||||
|
||||
### 2. Network Security
|
||||
- Monitoring network is internal-only (except exporters)
|
||||
- Traefik handles SSL/TLS termination
|
||||
- BasicAuth protects Prometheus UI
|
||||
- Grafana requires login for dashboard access
|
||||
|
||||
### 3. Access Control
|
||||
- Limit Grafana admin access
|
||||
- Use Grafana organizations for multi-tenancy
|
||||
- Configure Prometheus with read-only access where possible
|
||||
- Restrict Portainer access to trusted users
|
||||
|
||||
### 4. Data Security
|
||||
- Prometheus stores metrics in plain text
|
||||
- Grafana encrypts passwords in database
|
||||
- Backup volumes contain sensitive data
|
||||
- Secure backups with encryption
|
||||
|
||||
### 5. Container Security
|
||||
- Use official Docker images
|
||||
- Keep images updated (security patches)
|
||||
- Run containers as non-root where possible
|
||||
- Limit container capabilities
|
||||
|
||||
## Backup and Recovery
|
||||
|
||||
### Backup Prometheus Data
|
||||
```bash
|
||||
# Stop Prometheus
|
||||
docker compose stop prometheus
|
||||
|
||||
# Backup data volume
|
||||
docker run --rm -v prometheus-data:/data:ro -v "$(pwd)":/backup alpine tar czf /backup/prometheus-backup-$(date +%Y%m%d).tar.gz -C /data .
|
||||
|
||||
# Restart Prometheus
|
||||
docker compose start prometheus
|
||||
```
|
||||
|
||||
### Backup Grafana Data
|
||||
```bash
|
||||
# Backup Grafana database and dashboards
|
||||
docker compose exec -T grafana tar czf - /var/lib/grafana > grafana-backup-$(date +%Y%m%d).tar.gz
|
||||
```
|
||||
|
||||
### Restore from Backup
|
||||
```bash
|
||||
# Stop services
|
||||
docker compose down
|
||||
|
||||
# Restore Prometheus data
|
||||
docker run --rm -v prometheus-data:/data -v "$(pwd)":/backup alpine tar xzf /backup/prometheus-backup-YYYYMMDD.tar.gz -C /data
|
||||
|
||||
# Restore Grafana data
|
||||
docker compose up -d grafana
|
||||
docker compose exec -T grafana tar xzf - -C / < grafana-backup-YYYYMMDD.tar.gz
|
||||
|
||||
# Start all services
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
## Maintenance
|
||||
|
||||
### Regular Tasks
|
||||
|
||||
#### Daily
|
||||
- Review dashboards for anomalies
|
||||
- Check active alerts
|
||||
- Verify all services are running
|
||||
|
||||
#### Weekly
|
||||
- Review resource usage trends
|
||||
- Check disk space usage
|
||||
- Update passwords if needed
|
||||
|
||||
#### Monthly
|
||||
- Review and update alert rules
|
||||
- Optimize slow queries
|
||||
- Clean up old data if needed
|
||||
- Update Docker images
|
||||
|
||||
### Update Procedure
|
||||
|
||||
```bash
|
||||
# Pull latest images
|
||||
docker compose pull
|
||||
|
||||
# Recreate containers with new images
|
||||
docker compose up -d
|
||||
|
||||
# Verify services are healthy
|
||||
docker compose ps
|
||||
docker compose logs -f
|
||||
```
|
||||
|
||||
## Support
|
||||
|
||||
### Documentation
|
||||
- Prometheus: https://prometheus.io/docs/
|
||||
- Grafana: https://grafana.com/docs/
|
||||
- Portainer: https://docs.portainer.io/
|
||||
|
||||
### Logs
|
||||
```bash
|
||||
# View all logs
|
||||
docker compose logs
|
||||
|
||||
# Follow specific service logs
|
||||
docker compose logs -f grafana
|
||||
docker compose logs -f prometheus
|
||||
|
||||
# View last 100 lines
|
||||
docker compose logs --tail=100
|
||||
```
|
||||
|
||||
### Health Checks
|
||||
```bash
|
||||
# Check service health
|
||||
docker compose ps
|
||||
|
||||
# Test endpoints
|
||||
docker compose exec prometheus wget -qO- http://localhost:9090/-/healthy # Prometheus
|
||||
docker compose exec grafana wget -qO- http://localhost:3000/api/health # Grafana
|
||||
|
||||
# Check metrics
|
||||
docker compose exec node-exporter wget -qO- http://localhost:9100/metrics # Node Exporter
|
||||
docker compose exec cadvisor wget -qO- http://localhost:8080/metrics # cAdvisor
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
**Stack Version**: 1.0
|
||||
**Last Updated**: 2025-01-30
|
||||
**Maintained By**: DevOps Team
|
||||
147
deployment/stacks/monitoring/docker-compose.yml
Normal file
147
deployment/stacks/monitoring/docker-compose.yml
Normal file
@@ -0,0 +1,147 @@
|
||||
services:
|
||||
portainer:
|
||||
image: portainer/portainer-ce:latest
|
||||
container_name: portainer
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- traefik-public
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- portainer-data:/data
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.portainer.rule=Host(`portainer.${DOMAIN}`)"
|
||||
- "traefik.http.routers.portainer.entrypoints=websecure"
|
||||
- "traefik.http.routers.portainer.tls=true"
|
||||
- "traefik.http.routers.portainer.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.portainer.loadbalancer.server.port=9000"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9000/api/system/status"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
restart: unless-stopped
|
||||
user: "65534:65534"
|
||||
networks:
|
||||
- traefik-public
|
||||
- app-internal
|
||||
volumes:
|
||||
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
|
||||
- prometheus-data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
|
||||
- '--web.console.templates=/usr/share/prometheus/consoles'
|
||||
- '--web.enable-lifecycle'
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.prometheus.rule=Host(`prometheus.${DOMAIN}`)"
|
||||
- "traefik.http.routers.prometheus.entrypoints=websecure"
|
||||
- "traefik.http.routers.prometheus.tls=true"
|
||||
- "traefik.http.routers.prometheus.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.routers.prometheus.middlewares=prometheus-auth"
|
||||
- "traefik.http.middlewares.prometheus-auth.basicauth.users=${PROMETHEUS_AUTH}"
|
||||
- "traefik.http.services.prometheus.loadbalancer.server.port=9090"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
container_name: grafana
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- traefik-public
|
||||
- app-internal
|
||||
environment:
|
||||
- GF_SERVER_ROOT_URL=https://grafana.${DOMAIN}
|
||||
- GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER}
|
||||
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
- GF_INSTALL_PLUGINS=${GRAFANA_PLUGINS}
|
||||
- GF_LOG_LEVEL=info
|
||||
- GF_ANALYTICS_REPORTING_ENABLED=false
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning:ro
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN}`)"
|
||||
- "traefik.http.routers.grafana.entrypoints=websecure"
|
||||
- "traefik.http.routers.grafana.tls=true"
|
||||
- "traefik.http.routers.grafana.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
|
||||
depends_on:
|
||||
prometheus:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
container_name: node-exporter
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- app-internal
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9100/metrics"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:latest
|
||||
container_name: cadvisor
|
||||
restart: unless-stopped
|
||||
privileged: true
|
||||
networks:
|
||||
- app-internal
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:ro
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
devices:
|
||||
- /dev/kmsg
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/healthz"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
volumes:
|
||||
portainer-data:
|
||||
name: portainer-data
|
||||
prometheus-data:
|
||||
name: prometheus-data
|
||||
grafana-data:
|
||||
name: grafana-data
|
||||
|
||||
networks:
|
||||
traefik-public:
|
||||
external: true
|
||||
app-internal:
|
||||
external: true
|
||||
@@ -0,0 +1,397 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{name!=\"\",name!~\".*exporter.*\"}[5m])) by (name) * 100",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{name}}"
|
||||
}
|
||||
],
|
||||
"title": "Container CPU Usage %",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(container_memory_usage_bytes{name!=\"\",name!~\".*exporter.*\"}) by (name)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{name}}"
|
||||
}
|
||||
],
|
||||
"title": "Container Memory Usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "red",
|
||||
"index": 1,
|
||||
"text": "Down"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"index": 0,
|
||||
"text": "Up"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(container_last_seen{name!=\"\",name!~\".*exporter.*\"}) > 0",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Containers Running",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 3
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 8
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(container_restart_count[5m]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Container Restarts (5m)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{name!=\"\",name!~\".*exporter.*\"}[5m])) by (name)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{name}} RX"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{name!=\"\",name!~\".*exporter.*\"}[5m])) by (name)",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{name}} TX"
|
||||
}
|
||||
],
|
||||
"title": "Container Network I/O",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 36,
|
||||
"style": "dark",
|
||||
"tags": ["docker", "containers"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Docker Containers",
|
||||
"uid": "docker-containers",
|
||||
"version": 1
|
||||
}
|
||||
591
deployment/stacks/monitoring/grafana/dashboards/host-system.json
Normal file
591
deployment/stacks/monitoring/grafana/dashboards/host-system.json
Normal file
@@ -0,0 +1,591 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "line"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull", "mean", "max"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"title": "CPU Usage %",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "line"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull", "mean", "max"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"title": "Memory Usage %",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "line"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull", "mean", "max"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"rootfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"rootfs\"}) * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"title": "Disk Usage %",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull", "mean", "max"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_bytes_total[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}} - {{device}} RX"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_network_transmit_bytes_total[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{instance}} - {{device}} TX"
|
||||
}
|
||||
],
|
||||
"title": "Network I/O",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Current CPU Usage",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 16
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Current Memory Usage",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"rootfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"rootfs\"}) * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Current Disk Usage",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 16
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - node_boot_time_seconds",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "System Uptime",
|
||||
"type": "stat"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 36,
|
||||
"style": "dark",
|
||||
"tags": ["host", "system"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Host System",
|
||||
"uid": "host-system",
|
||||
"version": 1
|
||||
}
|
||||
613
deployment/stacks/monitoring/grafana/dashboards/traefik.json
Normal file
613
deployment/stacks/monitoring/grafana/dashboards/traefik.json
Normal file
@@ -0,0 +1,613 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull", "mean", "max"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(traefik_service_requests_total[5m])) by (service)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
],
|
||||
"title": "Request Rate by Service",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull", "mean", "max"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(traefik_service_request_duration_seconds_bucket[5m])) by (service, le)) * 1000",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{service}} p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(traefik_service_request_duration_seconds_bucket[5m])) by (service, le)) * 1000",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{service}} p99"
|
||||
}
|
||||
],
|
||||
"title": "Response Time (p95/p99)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "normal"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byRegexp",
|
||||
"options": ".*2xx.*"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byRegexp",
|
||||
"options": ".*4xx.*"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byRegexp",
|
||||
"options": ".*5xx.*"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["lastNotNull", "sum"],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(traefik_service_requests_total{code=~\"2..\"}[5m])) by (service)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{service}} 2xx"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(traefik_service_requests_total{code=~\"4..\"}[5m])) by (service)",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{service}} 4xx"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(traefik_service_requests_total{code=~\"5..\"}[5m])) by (service)",
|
||||
"refId": "C",
|
||||
"legendFormat": "{{service}} 5xx"
|
||||
}
|
||||
],
|
||||
"title": "HTTP Status Codes",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "red",
|
||||
"index": 1,
|
||||
"text": "Down"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"index": 0,
|
||||
"text": "Up"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "traefik_service_server_up",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
],
|
||||
"title": "Service Status",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(traefik_service_requests_total[5m])) * 60",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Requests per Minute",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 5
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 16
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(rate(traefik_service_requests_total{code=~\"4..\"}[5m])) / sum(rate(traefik_service_requests_total[5m]))) * 100",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "4xx Error Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(rate(traefik_service_requests_total{code=~\"5..\"}[5m])) / sum(rate(traefik_service_requests_total[5m]))) * 100",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "5xx Error Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 16
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(traefik_service_server_up == 1)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Active Services",
|
||||
"type": "stat"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 36,
|
||||
"style": "dark",
|
||||
"tags": ["traefik", "proxy"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Traefik",
|
||||
"uid": "traefik",
|
||||
"version": 1
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
# Grafana Dashboard Provisioning
|
||||
# https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Default'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
@@ -0,0 +1,17 @@
|
||||
# Grafana Datasource Provisioning
|
||||
# https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
jsonData:
|
||||
timeInterval: 15s
|
||||
queryTimeout: 60s
|
||||
httpMethod: POST
|
||||
version: 1
|
||||
245
deployment/stacks/monitoring/prometheus/alerts.yml
Normal file
245
deployment/stacks/monitoring/prometheus/alerts.yml
Normal file
@@ -0,0 +1,245 @@
|
||||
# Prometheus Alerting Rules
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
|
||||
|
||||
groups:
|
||||
- name: infrastructure_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# Host System Alerts
|
||||
- alert: HostHighCpuLoad
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
category: infrastructure
|
||||
annotations:
|
||||
summary: "High CPU load on {{ $labels.instance }}"
|
||||
description: "CPU load is above 80% (current value: {{ $value }}%)"
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
category: infrastructure
|
||||
annotations:
|
||||
summary: "Host out of memory on {{ $labels.instance }}"
|
||||
description: "Available memory is below 10% (current value: {{ $value }}%)"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: (node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"} * 100) < 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
category: infrastructure
|
||||
annotations:
|
||||
summary: "Host out of disk space on {{ $labels.instance }}"
|
||||
description: "Disk space is below 10% (current value: {{ $value }}%)"
|
||||
|
||||
- alert: HostDiskSpaceWarning
|
||||
expr: (node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"} * 100) < 20
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
category: infrastructure
|
||||
annotations:
|
||||
summary: "Disk space warning on {{ $labels.instance }}"
|
||||
description: "Disk space is below 20% (current value: {{ $value }}%)"
|
||||
|
||||
- alert: HostHighDiskReadLatency
|
||||
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
category: infrastructure
|
||||
annotations:
|
||||
summary: "High disk read latency on {{ $labels.instance }}"
|
||||
description: "Disk read latency is high (current value: {{ $value }}s)"
|
||||
|
||||
# Container Alerts
|
||||
- alert: ContainerKilled
|
||||
expr: time() - container_last_seen{name!~".*exporter.*"} > 60
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
category: container
|
||||
annotations:
|
||||
summary: "Container killed: {{ $labels.name }}"
|
||||
description: "Container {{ $labels.name }} has disappeared"
|
||||
|
||||
- alert: ContainerHighCpuUsage
|
||||
expr: (sum(rate(container_cpu_usage_seconds_total{name!~".*exporter.*"}[5m])) by (name) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
category: container
|
||||
annotations:
|
||||
summary: "High CPU usage in container {{ $labels.name }}"
|
||||
description: "Container CPU usage is above 80% (current value: {{ $value }}%)"
|
||||
|
||||
- alert: ContainerHighMemoryUsage
|
||||
expr: (sum(container_memory_usage_bytes{name!~".*exporter.*"}) by (name) / (sum(container_spec_memory_limit_bytes{name!~".*exporter.*"}) by (name) > 0) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
category: container
|
||||
annotations:
|
||||
summary: "High memory usage in container {{ $labels.name }}"
|
||||
description: "Container memory usage is above 80% (current value: {{ $value }}%)"
|
||||
|
||||
- alert: ContainerVolumeUsage
|
||||
expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
category: container
|
||||
annotations:
|
||||
summary: "Container volume usage on {{ $labels.instance }}"
|
||||
description: "Container filesystem inode usage is above 80% (current value: {{ $value }}%)"
|
||||
|
||||
- alert: ContainerRestartCount
|
||||
expr: rate(container_restart_count[5m]) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
category: container
|
||||
annotations:
|
||||
summary: "Container restarting: {{ $labels.name }}"
|
||||
description: "Container {{ $labels.name }} is restarting frequently"
|
||||
|
||||
# Prometheus Self-Monitoring
|
||||
- alert: PrometheusTargetDown
|
||||
expr: up == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
category: prometheus
|
||||
annotations:
|
||||
summary: "Prometheus target down: {{ $labels.job }}"
|
||||
description: "Target {{ $labels.job }} on {{ $labels.instance }} is down"
|
||||
|
||||
- alert: PrometheusConfigReloadFailure
|
||||
expr: prometheus_config_last_reload_successful == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
category: prometheus
|
||||
annotations:
|
||||
summary: "Prometheus configuration reload failure"
|
||||
description: "Prometheus configuration reload has failed"
|
||||
|
||||
- alert: PrometheusTooManyRestarts
|
||||
expr: changes(process_start_time_seconds{job=~"prometheus"}[15m]) > 2
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
category: prometheus
|
||||
annotations:
|
||||
summary: "Prometheus restarting frequently"
|
||||
description: "Prometheus has restarted more than twice in the last 15 minutes"
|
||||
|
||||
- alert: PrometheusTargetScrapingSlow
|
||||
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
category: prometheus
|
||||
annotations:
|
||||
summary: "Prometheus target scraping slow"
|
||||
description: "Prometheus is scraping targets slowly (current value: {{ $value }}s)"
|
||||
|
||||
# Traefik Alerts
|
||||
- alert: TraefikServiceDown
|
||||
expr: sum(traefik_service_server_up) by (service) == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
category: traefik
|
||||
annotations:
|
||||
summary: "Traefik service down: {{ $labels.service }}"
|
||||
description: "Traefik service {{ $labels.service }} is down"
|
||||
|
||||
- alert: TraefikHighHttp4xxErrorRate
|
||||
expr: sum(rate(traefik_service_requests_total{code=~"4.."}[5m])) by (service) / sum(rate(traefik_service_requests_total[5m])) by (service) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
category: traefik
|
||||
annotations:
|
||||
summary: "High HTTP 4xx error rate for {{ $labels.service }}"
|
||||
description: "HTTP 4xx error rate is above 5% (current value: {{ $value }}%)"
|
||||
|
||||
- alert: TraefikHighHttp5xxErrorRate
|
||||
expr: sum(rate(traefik_service_requests_total{code=~"5.."}[5m])) by (service) / sum(rate(traefik_service_requests_total[5m])) by (service) * 100 > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
category: traefik
|
||||
annotations:
|
||||
summary: "High HTTP 5xx error rate for {{ $labels.service }}"
|
||||
description: "HTTP 5xx error rate is above 1% (current value: {{ $value }}%)"
|
||||
|
||||
- name: database_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# PostgreSQL Alerts (uncomment when postgres-exporter is deployed)
|
||||
# - alert: PostgresqlDown
|
||||
# expr: pg_up == 0
|
||||
# for: 1m
|
||||
# labels:
|
||||
# severity: critical
|
||||
# category: database
|
||||
# annotations:
|
||||
# summary: "PostgreSQL down on {{ $labels.instance }}"
|
||||
# description: "PostgreSQL instance is down"
|
||||
|
||||
# - alert: PostgresqlTooManyConnections
|
||||
# expr: sum by (instance) (pg_stat_activity_count) > pg_settings_max_connections * 0.8
|
||||
# for: 5m
|
||||
# labels:
|
||||
# severity: warning
|
||||
# category: database
|
||||
# annotations:
|
||||
# summary: "Too many PostgreSQL connections on {{ $labels.instance }}"
|
||||
# description: "PostgreSQL connections are above 80% of max_connections"
|
||||
|
||||
# - alert: PostgresqlDeadLocks
|
||||
# expr: rate(pg_stat_database_deadlocks[1m]) > 0
|
||||
# for: 1m
|
||||
# labels:
|
||||
# severity: warning
|
||||
# category: database
|
||||
# annotations:
|
||||
# summary: "PostgreSQL deadlocks on {{ $labels.instance }}"
|
||||
# description: "PostgreSQL has deadlocks"
|
||||
|
||||
# Redis Alerts (uncomment when redis-exporter is deployed)
|
||||
# - alert: RedisDown
|
||||
# expr: redis_up == 0
|
||||
# for: 1m
|
||||
# labels:
|
||||
# severity: critical
|
||||
# category: cache
|
||||
# annotations:
|
||||
# summary: "Redis down on {{ $labels.instance }}"
|
||||
# description: "Redis instance is down"
|
||||
|
||||
# - alert: RedisOutOfMemory
|
||||
# expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90
|
||||
# for: 5m
|
||||
# labels:
|
||||
# severity: critical
|
||||
# category: cache
|
||||
# annotations:
|
||||
# summary: "Redis out of memory on {{ $labels.instance }}"
|
||||
# description: "Redis memory usage is above 90%"
|
||||
|
||||
# - alert: RedisTooManyConnections
|
||||
# expr: redis_connected_clients > 100
|
||||
# for: 5m
|
||||
# labels:
|
||||
# severity: warning
|
||||
# category: cache
|
||||
# annotations:
|
||||
# summary: "Too many Redis connections on {{ $labels.instance }}"
|
||||
# description: "Redis has too many client connections (current value: {{ $value }})"
|
||||
82
deployment/stacks/monitoring/prometheus/prometheus.yml
Normal file
82
deployment/stacks/monitoring/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,82 @@
|
||||
# Prometheus Configuration
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
cluster: 'production'
|
||||
environment: 'michaelschiemer'
|
||||
|
||||
# Alertmanager configuration (optional)
|
||||
# alerting:
|
||||
# alertmanagers:
|
||||
# - static_configs:
|
||||
# - targets:
|
||||
# - alertmanager:9093
|
||||
|
||||
# Load alerting rules
|
||||
rule_files:
|
||||
- '/etc/prometheus/alerts.yml'
|
||||
|
||||
# Scrape configurations
|
||||
scrape_configs:
|
||||
# Prometheus self-monitoring
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
labels:
|
||||
service: 'prometheus'
|
||||
|
||||
# Node Exporter - Host system metrics
|
||||
- job_name: 'node-exporter'
|
||||
static_configs:
|
||||
- targets: ['node-exporter:9100']
|
||||
labels:
|
||||
service: 'node-exporter'
|
||||
instance: 'production-server'
|
||||
|
||||
# cAdvisor - Container metrics
|
||||
- job_name: 'cadvisor'
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
labels:
|
||||
service: 'cadvisor'
|
||||
|
||||
# Traefik metrics
|
||||
- job_name: 'traefik'
|
||||
static_configs:
|
||||
- targets: ['traefik:8080']
|
||||
labels:
|
||||
service: 'traefik'
|
||||
|
||||
# PostgreSQL Exporter (if deployed)
|
||||
# Uncomment if you add postgres-exporter to postgresql stack
|
||||
# - job_name: 'postgres'
|
||||
# static_configs:
|
||||
# - targets: ['postgres-exporter:9187']
|
||||
# labels:
|
||||
# service: 'postgresql'
|
||||
|
||||
# Redis Exporter (if deployed)
|
||||
# Uncomment if you add redis-exporter to application stack
|
||||
# - job_name: 'redis'
|
||||
# static_configs:
|
||||
# - targets: ['redis-exporter:9121']
|
||||
# labels:
|
||||
# service: 'redis'
|
||||
|
||||
# Application metrics endpoint (if available)
|
||||
# Uncomment and configure if your PHP app exposes Prometheus metrics
|
||||
# - job_name: 'application'
|
||||
# static_configs:
|
||||
# - targets: ['app:9000']
|
||||
# labels:
|
||||
# service: 'application'
|
||||
|
||||
# Nginx metrics (if nginx-prometheus-exporter deployed)
|
||||
# - job_name: 'nginx'
|
||||
# static_configs:
|
||||
# - targets: ['nginx-exporter:9113']
|
||||
# labels:
|
||||
# service: 'nginx'
|
||||
22
deployment/stacks/postgresql/.env.example
Normal file
22
deployment/stacks/postgresql/.env.example
Normal file
@@ -0,0 +1,22 @@
|
||||
# PostgreSQL Stack Configuration
|
||||
# Copy this file to .env and adjust values
|
||||
|
||||
# Timezone
|
||||
TZ=Europe/Berlin
|
||||
|
||||
# PostgreSQL Configuration
|
||||
POSTGRES_DB=michaelschiemer
|
||||
POSTGRES_USER=postgres
|
||||
POSTGRES_PASSWORD=<generate-with-openssl-rand-base64-32>
|
||||
|
||||
# Backup Configuration
|
||||
BACKUP_RETENTION_DAYS=7
|
||||
BACKUP_SCHEDULE=0 2 * * *
|
||||
|
||||
# Connection Settings (for application integration)
|
||||
# Use these in Stack 4 (Application) .env:
|
||||
# DB_HOST=postgres
|
||||
# DB_PORT=5432
|
||||
# DB_NAME=michaelschiemer
|
||||
# DB_USER=postgres
|
||||
# DB_PASS=<same-as-POSTGRES_PASSWORD>
|
||||
681
deployment/stacks/postgresql/README.md
Normal file
681
deployment/stacks/postgresql/README.md
Normal file
@@ -0,0 +1,681 @@
|
||||
# PostgreSQL Stack - Production Database with Automated Backups
|
||||
|
||||
## Overview
|
||||
|
||||
Production-ready PostgreSQL 16 database with automated backup system and performance optimization.
|
||||
|
||||
**Features**:
|
||||
- PostgreSQL 16 Alpine (lightweight, secure)
|
||||
- Automated daily backups with configurable retention
|
||||
- Performance-optimized configuration (2GB memory allocation)
|
||||
- Health checks and automatic recovery
|
||||
- Persistent storage with named volumes
|
||||
- Isolated app-internal network
|
||||
- Resource limits for stability
|
||||
|
||||
## Services
|
||||
|
||||
- **postgres** - PostgreSQL 16 database server
|
||||
- **postgres-backup** - Automated backup service with cron scheduling
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Traefik Stack Running**
|
||||
```bash
|
||||
cd ../traefik
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
2. **App-Internal Network Created**
|
||||
```bash
|
||||
docker network create app-internal
|
||||
```
|
||||
(Only needed if Stack 4 - Application has not been deployed yet; that stack creates the network automatically.)
|
||||
|
||||
## Configuration
|
||||
|
||||
### 1. Create Environment File
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
### 2. Generate Secure Password
|
||||
|
||||
```bash
|
||||
openssl rand -base64 32
|
||||
```
|
||||
|
||||
Update `.env`:
|
||||
```env
|
||||
POSTGRES_PASSWORD=<generated-password>
|
||||
```
|
||||
|
||||
### 3. Review Configuration
|
||||
|
||||
**Database Settings** (`.env`):
|
||||
- `POSTGRES_DB` - Database name (default: michaelschiemer)
|
||||
- `POSTGRES_USER` - Database user (default: postgres)
|
||||
- `POSTGRES_PASSWORD` - Database password (REQUIRED)
|
||||
|
||||
**Backup Settings** (`.env`):
|
||||
- `BACKUP_RETENTION_DAYS` - Keep backups for N days (default: 7)
|
||||
- `BACKUP_SCHEDULE` - Cron expression (default: `0 2 * * *` = 2 AM daily)
|
||||
|
||||
**Performance Tuning** (`conf.d/postgresql.conf`):
|
||||
- Optimized for 2GB memory allocation
|
||||
- Connection limit of 100 (`max_connections`); no connection pooler is included
|
||||
- Write-ahead logging for reliability
|
||||
- Query logging for slow queries (>1s)
|
||||
- Parallel query execution enabled
|
||||
|
||||
## Deployment
|
||||
|
||||
### Initial Setup
|
||||
|
||||
```bash
|
||||
# Create environment file
|
||||
cp .env.example .env
|
||||
|
||||
# Generate and set password
|
||||
openssl rand -base64 32
|
||||
# Update POSTGRES_PASSWORD in .env
|
||||
|
||||
# Ensure app-internal network exists
|
||||
docker network inspect app-internal || docker network create app-internal
|
||||
|
||||
# Start services
|
||||
docker compose up -d
|
||||
|
||||
# Check logs
|
||||
docker compose logs -f
|
||||
|
||||
# Verify health
|
||||
docker compose ps
|
||||
```
|
||||
|
||||
### Verify Deployment
|
||||
|
||||
```bash
|
||||
# Check PostgreSQL is running
|
||||
docker exec postgres pg_isready -U postgres -d michaelschiemer
|
||||
|
||||
# Expected: /var/run/postgresql:5432 - accepting connections
|
||||
|
||||
# Check backup service
|
||||
docker compose logs postgres-backup
|
||||
|
||||
# Expected: Initial backup completed successfully
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Database Access
|
||||
|
||||
#### From Host Machine
|
||||
|
||||
```bash
|
||||
# Connect to database
|
||||
docker exec -it postgres psql -U postgres -d michaelschiemer
|
||||
|
||||
# Run SQL query
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "SELECT version();"
|
||||
```
|
||||
|
||||
#### From Application Container
|
||||
|
||||
```bash
|
||||
# Connection string format
|
||||
postgresql://postgres:password@postgres:5432/michaelschiemer
|
||||
|
||||
# Example with environment variables (Stack 4 - Application)
|
||||
DB_HOST=postgres
|
||||
DB_PORT=5432
|
||||
DB_NAME=michaelschiemer
|
||||
DB_USER=postgres
|
||||
DB_PASS=<same-as-POSTGRES_PASSWORD>
|
||||
```
|
||||
|
||||
### Backup Management
|
||||
|
||||
#### Manual Backup
|
||||
|
||||
```bash
|
||||
# Trigger manual backup
|
||||
docker exec postgres-backup /scripts/backup.sh
|
||||
|
||||
# List backups
|
||||
ls -lh backups/
|
||||
|
||||
# Example output:
|
||||
# postgres_michaelschiemer_20250130_020000.sql.gz
|
||||
# postgres_michaelschiemer_20250131_020000.sql.gz
|
||||
```
|
||||
|
||||
#### Restore from Backup
|
||||
|
||||
```bash
|
||||
# List available backups
|
||||
docker exec postgres-backup ls -lh /backups
|
||||
|
||||
# Restore specific backup
|
||||
docker exec -it postgres-backup /scripts/restore.sh /backups/postgres_michaelschiemer_20250130_020000.sql.gz
|
||||
|
||||
# ⚠️ WARNING: This will DROP and RECREATE the database!
|
||||
# Confirm after 10 second countdown
|
||||
```
|
||||
|
||||
#### Download Backup
|
||||
|
||||
```bash
|
||||
# Copy backup to host
|
||||
docker cp postgres-backup:/backups/postgres_michaelschiemer_20250130_020000.sql.gz ./local-backup.sql.gz
|
||||
|
||||
# Extract and inspect
|
||||
gunzip -c local-backup.sql.gz | less
|
||||
```
|
||||
|
||||
### Database Maintenance
|
||||
|
||||
#### Vacuum and Analyze
|
||||
|
||||
```bash
|
||||
# Full vacuum (rewrites every table under an exclusive lock - run only in a maintenance window when bloat is significant)
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "VACUUM FULL ANALYZE;"
|
||||
|
||||
# Quick vacuum (automatic, but can run manually)
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "VACUUM ANALYZE;"
|
||||
```
|
||||
|
||||
#### Check Database Size
|
||||
|
||||
```bash
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "
|
||||
SELECT
|
||||
pg_size_pretty(pg_database_size('michaelschiemer')) as db_size,
|
||||
pg_size_pretty(pg_total_relation_size('users')) as users_table_size;
|
||||
"
|
||||
```
|
||||
|
||||
#### Connection Statistics
|
||||
|
||||
```bash
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "
|
||||
SELECT
|
||||
datname,
|
||||
numbackends as connections,
|
||||
xact_commit as commits,
|
||||
xact_rollback as rollbacks
|
||||
FROM pg_stat_database
|
||||
WHERE datname = 'michaelschiemer';
|
||||
"
|
||||
```
|
||||
|
||||
### Performance Monitoring
|
||||
|
||||
#### Active Queries
|
||||
|
||||
```bash
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "
|
||||
SELECT
|
||||
pid,
|
||||
usename,
|
||||
application_name,
|
||||
state,
|
||||
query_start,
|
||||
query
|
||||
FROM pg_stat_activity
|
||||
WHERE state != 'idle'
|
||||
ORDER BY query_start;
|
||||
"
|
||||
```
|
||||
|
||||
#### Slow Queries
|
||||
|
||||
```bash
|
||||
# Check PostgreSQL logs for slow queries (>1s)
|
||||
docker exec postgres sh -c 'tail -f /var/lib/postgresql/data/pgdata/log/postgresql-*.log'
|
||||
```
|
||||
|
||||
#### Index Usage
|
||||
|
||||
```bash
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "
|
||||
SELECT
|
||||
schemaname,
|
||||
relname AS tablename,
|
||||
indexrelname AS indexname,
|
||||
idx_scan as index_scans,
|
||||
idx_tup_read as tuples_read
|
||||
FROM pg_stat_user_indexes
|
||||
ORDER BY idx_scan DESC;
|
||||
"
|
||||
```
|
||||
|
||||
## Integration with Other Stacks
|
||||
|
||||
### Stack 4: Application
|
||||
|
||||
Update `deployment/stacks/application/.env`:
|
||||
|
||||
```env
|
||||
# Database Configuration
|
||||
DB_HOST=postgres
|
||||
DB_PORT=5432
|
||||
DB_NAME=michaelschiemer
|
||||
DB_USER=postgres
|
||||
DB_PASS=<same-as-postgres-stack-password>
|
||||
```
|
||||
|
||||
**Connection Test** from Application:
|
||||
```bash
|
||||
# From app container
|
||||
docker exec app php -r "
|
||||
\$dsn = 'pgsql:host=postgres;port=5432;dbname=michaelschiemer';
|
||||
\$pdo = new PDO(\$dsn, 'postgres', getenv('DB_PASS'));
|
||||
echo 'Connection successful: ' . \$pdo->query('SELECT version()')->fetchColumn();
|
||||
"
|
||||
```
|
||||
|
||||
### Stack 2: Gitea (Optional PostgreSQL Backend)
|
||||
|
||||
If migrating Gitea from MySQL to PostgreSQL:
|
||||
|
||||
```env
|
||||
# In deployment/stacks/gitea/.env
|
||||
DB_TYPE=postgres
|
||||
DB_HOST=postgres
|
||||
DB_NAME=gitea
|
||||
DB_USER=postgres
|
||||
DB_PASS=<same-password>
|
||||
```
|
||||
|
||||
**Note**: Requires creating separate `gitea` database:
|
||||
```bash
|
||||
docker exec postgres psql -U postgres -c "CREATE DATABASE gitea;"
|
||||
```
|
||||
|
||||
## Backup & Recovery
|
||||
|
||||
### Automated Backup Strategy
|
||||
|
||||
**Schedule**: Daily at 2:00 AM (configurable via `BACKUP_SCHEDULE`)
|
||||
|
||||
**Retention**: 7 days (configurable via `BACKUP_RETENTION_DAYS`)
|
||||
|
||||
**Location**: `./backups/` directory on host
|
||||
|
||||
**Format**: Compressed SQL dumps (`postgres_<dbname>_<timestamp>.sql.gz`)
|
||||
|
||||
### Manual Backup Workflow
|
||||
|
||||
```bash
|
||||
# 1. Create manual backup
|
||||
docker exec postgres-backup /scripts/backup.sh
|
||||
|
||||
# 2. Verify backup
|
||||
ls -lh backups/
|
||||
|
||||
# 3. Test backup integrity (optional)
|
||||
gunzip -t backups/postgres_michaelschiemer_20250130_020000.sql.gz
|
||||
```
|
||||
|
||||
### Disaster Recovery
|
||||
|
||||
#### Scenario: Complete Database Loss
|
||||
|
||||
```bash
|
||||
# 1. Stop application to prevent writes
|
||||
cd ../application
|
||||
docker compose stop
|
||||
|
||||
# 2. Remove corrupted database
|
||||
cd ../postgresql
|
||||
docker compose down
|
||||
docker volume rm postgres-data
|
||||
|
||||
# 3. Recreate database
|
||||
docker compose up -d
|
||||
|
||||
# 4. Wait for PostgreSQL to initialize
|
||||
docker compose logs -f postgres
|
||||
|
||||
# 5. Restore from latest backup
|
||||
docker exec -it postgres-backup /scripts/restore.sh /backups/postgres_michaelschiemer_<latest>.sql.gz
|
||||
|
||||
# 6. Verify restoration
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "\dt"
|
||||
|
||||
# 7. Restart application
|
||||
cd ../application
|
||||
docker compose start
|
||||
```
|
||||
|
||||
#### Scenario: Point-in-Time Recovery
|
||||
|
||||
```bash
|
||||
# 1. List available backups
|
||||
docker exec postgres-backup ls -lh /backups
|
||||
|
||||
# 2. Choose backup timestamp
|
||||
# postgres_michaelschiemer_20250130_143000.sql.gz
|
||||
|
||||
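# 3. Restore that snapshot (this restores the dump as of its timestamp; true point-in-time recovery would require WAL archiving)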
|
||||
docker exec -it postgres-backup /scripts/restore.sh /backups/postgres_michaelschiemer_20250130_143000.sql.gz
|
||||
```
|
||||
|
||||
### Off-site Backup
|
||||
|
||||
**Recommended**: Copy backups to external storage
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# backup-offsite.sh - Run daily after local backup
|
||||
|
||||
BACKUP_DIR="./backups"
|
||||
REMOTE_HOST="backup-server.example.com"
|
||||
REMOTE_PATH="/backups/michaelschiemer/postgresql"
|
||||
|
||||
# Sync backups to remote server
|
||||
rsync -avz --delete \
|
||||
"${BACKUP_DIR}/" \
|
||||
"${REMOTE_HOST}:${REMOTE_PATH}/"
|
||||
|
||||
echo "✅ Off-site backup completed"
|
||||
```
|
||||
|
||||
**Alternative: S3 Upload**
|
||||
```bash
|
||||
# Using AWS CLI
|
||||
aws s3 sync ./backups/ s3://my-backup-bucket/postgresql/ --delete
|
||||
```
|
||||
|
||||
## Security
|
||||
|
||||
### Connection Security
|
||||
|
||||
**Network Isolation**:
|
||||
- PostgreSQL only accessible via `app-internal` network
|
||||
- No external ports exposed
|
||||
- Service-to-service communication only
|
||||
|
||||
**Authentication**:
|
||||
- Strong password required (generated with `openssl rand -base64 32`)
|
||||
- No default passwords
|
||||
- Password stored in environment variables only
|
||||
|
||||
### Backup Security
|
||||
|
||||
**Encryption** (recommended for production):
|
||||
```bash
|
||||
# Encrypt backup before off-site storage
|
||||
gpg --symmetric --cipher-algo AES256 backups/postgres_michaelschiemer_*.sql.gz
|
||||
|
||||
# Decrypt when needed
|
||||
gpg --decrypt backups/postgres_michaelschiemer_*.sql.gz.gpg | gunzip | psql
|
||||
```
|
||||
|
||||
**Access Control**:
|
||||
- Backup directory mounted as read-only in other containers
|
||||
- Only the backup service has write access
|
||||
- Host filesystem permissions: `chmod 700 backups/`
|
||||
|
||||
### Update Security
|
||||
|
||||
```bash
|
||||
# Update PostgreSQL image
|
||||
docker compose pull
|
||||
|
||||
# Recreate containers with new image
|
||||
docker compose up -d
|
||||
|
||||
# Verify version
|
||||
docker exec postgres psql -U postgres -c "SELECT version();"
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Health Checks
|
||||
|
||||
```bash
|
||||
# Check service health
|
||||
docker compose ps
|
||||
|
||||
# Expected: Both services "healthy"
|
||||
|
||||
# Manual health check
|
||||
docker exec postgres pg_isready -U postgres -d michaelschiemer
|
||||
|
||||
# Check backup service
|
||||
docker compose logs postgres-backup | grep "✅ Backup completed"
|
||||
```
|
||||
|
||||
### Resource Usage
|
||||
|
||||
```bash
|
||||
# Database container stats
|
||||
docker stats postgres --no-stream
|
||||
|
||||
# Expected:
|
||||
# - Memory: ~200-800MB (under 2GB limit)
|
||||
# - CPU: <50% sustained
|
||||
|
||||
# Disk usage
|
||||
docker exec postgres du -sh /var/lib/postgresql/data
|
||||
```
|
||||
|
||||
### Logs
|
||||
|
||||
```bash
|
||||
# PostgreSQL logs
|
||||
docker compose logs postgres
|
||||
|
||||
# Backup logs
|
||||
docker compose logs postgres-backup
|
||||
|
||||
# Real-time monitoring
|
||||
docker compose logs -f
|
||||
|
||||
# PostgreSQL server logs (inside container)
|
||||
docker exec postgres sh -c 'tail -f /var/lib/postgresql/data/pgdata/log/postgresql-*.log'
|
||||
```
|
||||
|
||||
### Alerts
|
||||
|
||||
**Recommended Monitoring**:
|
||||
- Backup success/failure notifications
|
||||
- Disk space warnings (>80% full)
|
||||
- Connection count monitoring
|
||||
- Slow query alerts
|
||||
- Replication lag (if using replication)
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Database Won't Start
|
||||
|
||||
```bash
|
||||
# Check logs
|
||||
docker compose logs postgres
|
||||
|
||||
# Common issues:
|
||||
# 1. Invalid configuration
|
||||
docker exec postgres postgres --check
|
||||
|
||||
# 2. Corrupted data directory
|
||||
docker compose down
|
||||
docker volume rm postgres-data
|
||||
docker compose up -d
|
||||
|
||||
# 3. Permission issues
|
||||
docker exec postgres ls -la /var/lib/postgresql/data
|
||||
```
|
||||
|
||||
### Backup Failures
|
||||
|
||||
```bash
|
||||
# Check backup service logs
|
||||
docker compose logs postgres-backup
|
||||
|
||||
# Common issues:
|
||||
# 1. Disk full
|
||||
df -h
|
||||
|
||||
# 2. Connection to PostgreSQL failed
|
||||
docker exec postgres-backup pg_isready -h postgres -U postgres
|
||||
|
||||
# 3. Manual backup test
|
||||
docker exec postgres-backup /scripts/backup.sh
|
||||
```
|
||||
|
||||
### Connection Refused from Application
|
||||
|
||||
```bash
|
||||
# 1. Check PostgreSQL is running
|
||||
docker compose ps postgres
|
||||
|
||||
# 2. Verify network
|
||||
docker network inspect app-internal | grep postgres
|
||||
|
||||
# 3. Test connection
|
||||
docker exec app nc -zv postgres 5432
|
||||
|
||||
# 4. Check credentials
|
||||
docker exec app printenv | grep DB_
|
||||
```
|
||||
|
||||
### Slow Queries
|
||||
|
||||
```bash
|
||||
# Enable extended logging
|
||||
docker exec postgres psql -U postgres -c "ALTER SYSTEM SET log_min_duration_statement = 500;"
|
||||
docker compose restart postgres
|
||||
|
||||
# Check for missing indexes
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "
|
||||
SELECT
|
||||
schemaname,
|
||||
tablename,
|
||||
attname,
|
||||
n_distinct,
|
||||
correlation
|
||||
FROM pg_stats
|
||||
WHERE schemaname = 'public'
|
||||
ORDER BY correlation;
|
||||
"
|
||||
```
|
||||
|
||||
### Out of Disk Space
|
||||
|
||||
```bash
|
||||
# Check disk usage
|
||||
df -h
|
||||
|
||||
# Check database size
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "
|
||||
SELECT pg_size_pretty(pg_database_size('michaelschiemer'));
|
||||
"
|
||||
|
||||
# Vacuum to reclaim space
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "VACUUM FULL;"
|
||||
|
||||
# Clean old backups manually
|
||||
find ./backups -name "*.sql.gz" -mtime +30 -delete
|
||||
```
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Current Configuration (2GB Memory)
|
||||
|
||||
**`conf.d/postgresql.conf`** optimized for:
|
||||
- **Memory**: 2GB allocated (512MB shared buffers, 1.5GB effective cache)
|
||||
- **Connections**: 100 max connections
|
||||
- **Workers**: 4 parallel workers
|
||||
- **Checkpoint**: 2GB max WAL size
|
||||
|
||||
### Scaling Up (4GB+ Memory)
|
||||
|
||||
```ini
|
||||
# conf.d/postgresql.conf
|
||||
shared_buffers = 1GB # 25% of RAM
|
||||
effective_cache_size = 3GB # 75% of RAM
|
||||
maintenance_work_mem = 256MB
|
||||
work_mem = 10MB
|
||||
max_connections = 200
|
||||
max_parallel_workers = 8
|
||||
```
|
||||
|
||||
### Query Optimization
|
||||
|
||||
```bash
|
||||
# Analyze query performance
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "
|
||||
EXPLAIN ANALYZE
|
||||
SELECT * FROM users WHERE email = 'test@example.com';
|
||||
"
|
||||
|
||||
# Create index for frequently queried columns
|
||||
docker exec postgres psql -U postgres -d michaelschiemer -c "
|
||||
CREATE INDEX idx_users_email ON users(email);
|
||||
"
|
||||
```
|
||||
|
||||
### Connection Pooling
|
||||
|
||||
**Recommended**: Use PgBouncer for connection pooling in high-traffic scenarios
|
||||
|
||||
```yaml
|
||||
# Add to docker-compose.yml
|
||||
pgbouncer:
|
||||
image: pgbouncer/pgbouncer:latest
|
||||
environment:
|
||||
- DATABASES_HOST=postgres
|
||||
- DATABASES_PORT=5432
|
||||
- DATABASES_DBNAME=michaelschiemer
|
||||
- PGBOUNCER_POOL_MODE=transaction
|
||||
- PGBOUNCER_MAX_CLIENT_CONN=1000
|
||||
- PGBOUNCER_DEFAULT_POOL_SIZE=25
|
||||
```
|
||||
|
||||
## Upgrading PostgreSQL
|
||||
|
||||
### Minor Version Upgrade (e.g., 16.0 → 16.1)
|
||||
|
||||
```bash
|
||||
# Pull latest 16.x image
|
||||
docker compose pull
|
||||
|
||||
# Recreate container
|
||||
docker compose up -d
|
||||
|
||||
# Verify version
|
||||
docker exec postgres psql -U postgres -c "SELECT version();"
|
||||
```
|
||||
|
||||
### Major Version Upgrade (e.g., 16 → 17)
|
||||
|
||||
```bash
|
||||
# 1. Create full backup
|
||||
docker exec postgres-backup /scripts/backup.sh
|
||||
|
||||
# 2. Stop services
|
||||
docker compose down
|
||||
|
||||
# 3. Update docker-compose.yml
|
||||
# Change: postgres:16-alpine → postgres:17-alpine
|
||||
|
||||
# 4. Remove old data volume (destructive - first confirm the backup from step 1 exists and passes `gunzip -t`)
|
||||
docker volume rm postgres-data
|
||||
|
||||
# 5. Start new version
|
||||
docker compose up -d
|
||||
|
||||
# 6. Restore data
|
||||
docker exec -it postgres-backup /scripts/restore.sh /backups/postgres_michaelschiemer_<latest>.sql.gz
|
||||
```
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **PostgreSQL Documentation**: https://www.postgresql.org/docs/16/
|
||||
- **Performance Tuning**: https://wiki.postgresql.org/wiki/Performance_Optimization
|
||||
- **Backup Best Practices**: https://www.postgresql.org/docs/16/backup.html
|
||||
- **Security Hardening**: https://www.postgresql.org/docs/16/security.html
|
||||
70
deployment/stacks/postgresql/conf.d/postgresql.conf
Normal file
70
deployment/stacks/postgresql/conf.d/postgresql.conf
Normal file
@@ -0,0 +1,70 @@
|
||||
# PostgreSQL Configuration for Production
|
||||
# Optimized for 2GB memory allocation
|
||||
|
||||
# Connection Settings
|
||||
listen_addresses = '*'
|
||||
max_connections = 100
|
||||
superuser_reserved_connections = 3
|
||||
|
||||
# Memory Settings (for 2GB allocation)
|
||||
shared_buffers = 512MB
|
||||
effective_cache_size = 1536MB
|
||||
maintenance_work_mem = 128MB
|
||||
work_mem = 5MB
|
||||
|
||||
# Checkpoint Settings
|
||||
checkpoint_completion_target = 0.9
|
||||
wal_buffers = 16MB
|
||||
default_statistics_target = 100
|
||||
|
||||
# Query Planner
|
||||
random_page_cost = 1.1
|
||||
effective_io_concurrency = 200
|
||||
|
||||
# Write-Ahead Logging
|
||||
wal_level = replica
|
||||
max_wal_size = 2GB
|
||||
min_wal_size = 1GB
|
||||
|
||||
# Logging
|
||||
log_destination = 'stderr'
|
||||
logging_collector = on
|
||||
log_directory = 'log'
|
||||
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
|
||||
log_rotation_age = 1d
|
||||
log_rotation_size = 100MB
|
||||
log_line_prefix = '%m [%p] %u@%d '
|
||||
log_timezone = 'Europe/Berlin'
|
||||
|
||||
# What to Log
|
||||
log_min_duration_statement = 1000
|
||||
log_checkpoints = on
|
||||
log_connections = on
|
||||
log_disconnections = on
|
||||
log_lock_waits = on
|
||||
log_statement = 'ddl'
|
||||
|
||||
# Autovacuum
|
||||
autovacuum = on
|
||||
autovacuum_max_workers = 3
|
||||
autovacuum_naptime = 1min
|
||||
|
||||
# Client Connection Defaults
|
||||
datestyle = 'iso, mdy'
|
||||
timezone = 'Europe/Berlin'
|
||||
lc_messages = 'en_US.utf8'
|
||||
lc_monetary = 'en_US.utf8'
|
||||
lc_numeric = 'en_US.utf8'
|
||||
lc_time = 'en_US.utf8'
|
||||
default_text_search_config = 'pg_catalog.english'
|
||||
|
||||
# Lock Management
|
||||
deadlock_timeout = 1s
|
||||
|
||||
# Statement Timeout (prevent long-running queries)
|
||||
statement_timeout = 30000 # 30 seconds
|
||||
|
||||
# Parallel Query Execution
|
||||
max_parallel_workers_per_gather = 2
|
||||
max_parallel_workers = 4
|
||||
max_worker_processes = 4
|
||||
71
deployment/stacks/postgresql/docker-compose.yml
Normal file
71
deployment/stacks/postgresql/docker-compose.yml
Normal file
@@ -0,0 +1,71 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
# PostgreSQL Database
|
||||
postgres:
|
||||
image: postgres:16-alpine
|
||||
container_name: postgres
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- app-internal
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
- POSTGRES_DB=${POSTGRES_DB:-michaelschiemer}
|
||||
- POSTGRES_USER=${POSTGRES_USER:-postgres}
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
|
||||
- PGDATA=/var/lib/postgresql/data/pgdata
|
||||
volumes:
|
||||
- postgres-data:/var/lib/postgresql/data
|
||||
- ./conf.d:/etc/postgresql/conf.d:ro
|
||||
- ./backups:/backups
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
command: >
|
||||
postgres
|
||||
-c config_file=/etc/postgresql/conf.d/postgresql.conf
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-postgres} -d ${POSTGRES_DB:-michaelschiemer}"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
shm_size: 256mb
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 2G
|
||||
reservations:
|
||||
memory: 512M
|
||||
|
||||
# Automated Backup Service
|
||||
postgres-backup:
|
||||
image: postgres:16-alpine
|
||||
container_name: postgres-backup
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- app-internal
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
- POSTGRES_HOST=postgres
|
||||
- POSTGRES_DB=${POSTGRES_DB:-michaelschiemer}
|
||||
- POSTGRES_USER=${POSTGRES_USER:-postgres}
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
|
||||
- BACKUP_RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7}
|
||||
- BACKUP_SCHEDULE=${BACKUP_SCHEDULE:-0 2 * * *}
|
||||
volumes:
|
||||
- ./backups:/backups
|
||||
- ./scripts:/scripts:ro
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
entrypoint: ["/scripts/backup-entrypoint.sh"]
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
|
||||
volumes:
|
||||
postgres-data:
|
||||
name: postgres-data
|
||||
|
||||
networks:
|
||||
app-internal:
|
||||
external: true
|
||||
23
deployment/stacks/postgresql/scripts/backup-entrypoint.sh
Executable file
23
deployment/stacks/postgresql/scripts/backup-entrypoint.sh
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/bin/sh
# Entrypoint of the backup container.
#
# Installs a cron daemon, registers /scripts/backup.sh under the schedule in
# BACKUP_SCHEDULE, runs one backup immediately and then hands the process
# over to crond.
#
# Environment:
#   BACKUP_SCHEDULE        cron expression for the backup job (default: 0 2 * * *)
#   BACKUP_RETENTION_DAYS  only echoed here; backup.sh applies it (default: 7)
set -e

# Same defaults as docker-compose.yml, so the script also produces a valid
# crontab line when it is started without those variables.
BACKUP_SCHEDULE="${BACKUP_SCHEDULE:-0 2 * * *}"
BACKUP_RETENTION_DAYS="${BACKUP_RETENTION_DAYS:-7}"

echo "🔄 PostgreSQL Backup Service Starting..."
echo "📅 Backup Schedule: ${BACKUP_SCHEDULE}"
echo "🗑️ Retention: ${BACKUP_RETENTION_DAYS} days"

# Install dcron (provides crond) for scheduled backups
apk add --no-cache dcron

# Create cron job; job output is appended to /var/log/backup.log
echo "${BACKUP_SCHEDULE} /scripts/backup.sh >> /var/log/backup.log 2>&1" > /etc/crontabs/root

# Ensure backup directory exists
mkdir -p /backups

# Run initial backup (set -e aborts the entrypoint if it fails)
echo "🚀 Running initial backup..."
/scripts/backup.sh

# Start cron in foreground. exec replaces this shell with crond so the daemon
# receives the container's stop signal directly instead of the shell
# swallowing it.
echo "✅ Backup service ready - cron daemon starting"
exec crond -f -l 2
|
||||
55
deployment/stacks/postgresql/scripts/backup.sh
Executable file
55
deployment/stacks/postgresql/scripts/backup.sh
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/bin/sh
# Creates a gzip-compressed plain-SQL dump of ${POSTGRES_DB} in /backups and
# deletes dumps older than the retention period.
#
# Environment read by this script:
#   POSTGRES_HOST, POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB
#   BACKUP_RETENTION_DAYS  (optional, default 7)
#
# Exit status: 0 on success, non-zero if the dump or its verification fails.
set -e

# Configuration
BACKUP_DIR="/backups"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/postgres_${POSTGRES_DB}_${TIMESTAMP}.sql.gz"
# The dump is written here first and renamed only once it is complete. The
# name deliberately does not match postgres_*.sql.gz, so an interrupted dump
# is never listed, restored or counted as a backup.
PARTIAL_FILE="${BACKUP_FILE}.partial"
RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7}

# Remove an unfinished dump on every exit path (no-op after the rename).
trap 'rm -f "${PARTIAL_FILE}"' EXIT

echo "======================================"
echo "PostgreSQL Backup - $(date)"
echo "======================================"

# Wait for PostgreSQL to be ready
until PGPASSWORD="${POSTGRES_PASSWORD}" pg_isready -h "${POSTGRES_HOST}" -U "${POSTGRES_USER}" -d "${POSTGRES_DB}"; do
    echo "⏳ Waiting for PostgreSQL..."
    sleep 5
done

echo "✅ PostgreSQL is ready"

# Create backup.
# pg_dump compresses and writes the file itself (--compress / --file) instead
# of being piped into gzip: in a POSIX sh pipeline only gzip's exit status is
# visible, so a failing pg_dump would go unnoticed and leave a truncated file.
echo "📦 Creating backup: ${BACKUP_FILE}"
if ! PGPASSWORD="${POSTGRES_PASSWORD}" pg_dump \
    -h "${POSTGRES_HOST}" \
    -U "${POSTGRES_USER}" \
    -d "${POSTGRES_DB}" \
    --clean \
    --if-exists \
    --create \
    --no-owner \
    --no-privileges \
    --compress=6 \
    --file="${PARTIAL_FILE}"; then
    echo "❌ Backup failed!"
    exit 1
fi

# Verify backup: must be non-empty and a readable gzip stream before it is
# published under its final name.
if [ -s "${PARTIAL_FILE}" ] && gunzip -t "${PARTIAL_FILE}"; then
    mv "${PARTIAL_FILE}" "${BACKUP_FILE}"
    BACKUP_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1)
    echo "✅ Backup created successfully: ${BACKUP_SIZE}"
else
    echo "❌ Backup failed!"
    exit 1
fi

# Cleanup old backups (only reached after a verified new backup exists)
echo "🗑️ Cleaning up backups older than ${RETENTION_DAYS} days..."
find "${BACKUP_DIR}" -name "postgres_*.sql.gz" -type f -mtime "+${RETENTION_DAYS}" -delete

# List current backups
echo ""
echo "📊 Current backups:"
ls -lh "${BACKUP_DIR}"/postgres_*.sql.gz 2>/dev/null || echo "No backups found"

echo ""
echo "✅ Backup completed successfully"
echo "======================================"
|
||||
55
deployment/stacks/postgresql/scripts/restore.sh
Executable file
55
deployment/stacks/postgresql/scripts/restore.sh
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/bin/sh
# Restores a gzip-compressed plain-SQL dump produced by backup.sh.
#
# Usage: ./restore.sh <backup-file>
#
# The dump is replayed through psql while connected to the "postgres"
# maintenance database. Dumps written by backup.sh are created with
# --clean --if-exists --create, i.e. they drop and recreate the target
# database themselves.
#
# Environment read by this script:
#   POSTGRES_HOST, POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB
#
# Exit status: 0 on success, 1 on bad usage, unreadable archive or any SQL error.
set -e

# Configuration
BACKUP_DIR="/backups"

if [ -z "$1" ]; then
    echo "Usage: ./restore.sh <backup-file>"
    echo ""
    echo "Available backups:"
    ls -lh "${BACKUP_DIR}"/postgres_*.sql.gz 2>/dev/null || echo "No backups found"
    exit 1
fi

BACKUP_FILE="$1"

if [ ! -f "${BACKUP_FILE}" ]; then
    echo "❌ Backup file not found: ${BACKUP_FILE}"
    exit 1
fi

# Check the archive before anything destructive happens: a truncated or
# corrupt file must not get as far as dropping the database.
if ! gunzip -t "${BACKUP_FILE}"; then
    echo "❌ Backup file is not a valid gzip archive: ${BACKUP_FILE}"
    exit 1
fi

echo "======================================"
echo "PostgreSQL Restore - $(date)"
echo "======================================"
echo "📦 Backup file: ${BACKUP_FILE}"
echo ""

# Wait for PostgreSQL to be ready
until PGPASSWORD="${POSTGRES_PASSWORD}" pg_isready -h "${POSTGRES_HOST}" -U "${POSTGRES_USER}" -d postgres; do
    echo "⏳ Waiting for PostgreSQL..."
    sleep 5
done

echo "✅ PostgreSQL is ready"
echo ""

# Warning
echo "⚠️ WARNING: This will DROP and RECREATE the database!"
echo "⚠️ Database: ${POSTGRES_DB}"
echo ""
echo "Press Ctrl+C to cancel, or wait 10 seconds to continue..."
sleep 10

echo ""
echo "🔄 Restoring database..."

# Restore backup.
# ON_ERROR_STOP makes psql abort with a non-zero status on the first SQL
# error; without it psql carries on after errors and exits 0, so a broken
# restore would be reported as successful.
if ! gunzip -c "${BACKUP_FILE}" | PGPASSWORD="${POSTGRES_PASSWORD}" psql \
    -v ON_ERROR_STOP=1 \
    -h "${POSTGRES_HOST}" \
    -U "${POSTGRES_USER}" \
    -d postgres; then
    echo ""
    echo "❌ Restore failed!"
    exit 1
fi

echo ""
echo "✅ Database restored successfully"
echo "======================================"
|
||||
14
deployment/stacks/registry/.env.example
Normal file
14
deployment/stacks/registry/.env.example
Normal file
@@ -0,0 +1,14 @@
|
||||
# Docker Registry Configuration
|
||||
# Copy this file to .env and adjust values
|
||||
|
||||
# Timezone
|
||||
TZ=Europe/Berlin
|
||||
|
||||
# Registry Domain
|
||||
REGISTRY_DOMAIN=registry.michaelschiemer.de
|
||||
|
||||
# Registry HTTP Secret (generate with: openssl rand -hex 32)
|
||||
REGISTRY_HTTP_SECRET=<generate-random-secret>
|
||||
|
||||
# BasicAuth Users (managed via auth/htpasswd file)
|
||||
# Generate with: htpasswd -Bn username
|
||||
26
deployment/stacks/registry/CREATE_AUTH.sh
Executable file
26
deployment/stacks/registry/CREATE_AUTH.sh
Executable file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
# Script to create or update the htpasswd file for Docker Registry
# Usage: ./CREATE_AUTH.sh <username> <password>
#
# Adds <username> with a bcrypt hash to auth/htpasswd next to this script.
# An existing entry for the same user is replaced; entries of other users are
# kept. The file is only swapped in after the new entry has been generated
# successfully, so a failing docker run never leaves an empty htpasswd behind.
set -euo pipefail

if [ -z "${1:-}" ] || [ -z "${2:-}" ]; then
    echo "Usage: $0 <username> <password>"
    echo "Example: $0 admin mypassword"
    exit 1
fi

USERNAME=$1
PASSWORD=$2
AUTH_DIR="$(dirname "$0")/auth"
HTPASSWD_FILE="$AUTH_DIR/htpasswd"

# Create auth directory if it doesn't exist
mkdir -p "$AUTH_DIR"

# Generate the bcrypt entry first (-B bcrypt, -b password from argv, -n print
# to stdout). Command substitution strips the trailing blank line that
# htpasswd -n prints.
if ! ENTRY="$(docker run --rm --entrypoint htpasswd httpd:2 -Bbn "$USERNAME" "$PASSWORD")"; then
    echo "❌ Failed to generate htpasswd entry (is Docker running?)" >&2
    exit 1
fi

# Guard against unexpected output ending up in the credentials file.
case "$ENTRY" in
    "$USERNAME":*) ;;
    *)
        echo "❌ Unexpected htpasswd output, file left unchanged" >&2
        exit 1
        ;;
esac

# Build the new file next to the old one and rename it into place.
TMP_FILE="$(mktemp "$AUTH_DIR/htpasswd.XXXXXX")"
trap 'rm -f "$TMP_FILE"' EXIT

# Carry over every existing entry except blank lines and the user being updated.
if [ -f "$HTPASSWD_FILE" ]; then
    while IFS= read -r line || [ -n "$line" ]; do
        case "$line" in
            "" | "$USERNAME":*) ;;
            *) printf '%s\n' "$line" ;;
        esac
    done < "$HTPASSWD_FILE" > "$TMP_FILE"
fi

printf '%s\n' "$ENTRY" >> "$TMP_FILE"

# Set proper permissions
chmod 644 "$TMP_FILE"
mv "$TMP_FILE" "$HTPASSWD_FILE"

echo "✅ htpasswd file created successfully!"
echo "Username: $USERNAME"
echo "Location: $AUTH_DIR/htpasswd"
|
||||
610
deployment/stacks/registry/README.md
Normal file
610
deployment/stacks/registry/README.md
Normal file
@@ -0,0 +1,610 @@
|
||||
# Docker Registry Stack - Private Container Registry
|
||||
|
||||
## Overview
|
||||
|
||||
Private Docker Registry with BasicAuth for secure container image storage.
|
||||
|
||||
**Features**:
|
||||
- Private Docker Registry v2.8
|
||||
- BasicAuth authentication
|
||||
- SSL via Traefik
|
||||
- Garbage collection (run manually or scheduled via cron, see "Garbage Collection")
|
||||
- Image deletion support
|
||||
- Persistent storage
|
||||
|
||||
## Services
|
||||
|
||||
- **registry.michaelschiemer.de** - Docker Registry (BasicAuth protected)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Traefik Stack Running**
|
||||
```bash
|
||||
cd ../traefik
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
2. **DNS Configuration**
|
||||
Point `registry.michaelschiemer.de` to your server IP (94.16.110.151)
|
||||
|
||||
3. **htpasswd Utility**
|
||||
```bash
|
||||
# Install if not available
|
||||
sudo apt-get install apache2-utils
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### 1. Create Environment File
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
### 2. Generate Registry HTTP Secret
|
||||
|
||||
```bash
|
||||
openssl rand -hex 32
|
||||
```
|
||||
|
||||
Update `.env`:
|
||||
```env
|
||||
REGISTRY_HTTP_SECRET=<generated-secret>
|
||||
```
|
||||
|
||||
### 3. Create Registry Users
|
||||
|
||||
```bash
|
||||
# Create htpasswd file with first user
|
||||
htpasswd -Bc auth/htpasswd admin
|
||||
|
||||
# Add additional users
|
||||
htpasswd -B auth/htpasswd developer
|
||||
|
||||
# Verify users
|
||||
cat auth/htpasswd
|
||||
```
|
||||
|
||||
**Important**: Use `-B` (bcrypt) for best security. `-c` creates new file (only for first user).
|
||||
|
||||
## Deployment
|
||||
|
||||
### Initial Setup
|
||||
|
||||
```bash
|
||||
# Ensure Traefik is running
|
||||
docker network inspect traefik-public
|
||||
|
||||
# Create auth directory and users
|
||||
mkdir -p auth
|
||||
htpasswd -Bc auth/htpasswd admin
|
||||
|
||||
# Start registry
|
||||
docker compose up -d
|
||||
|
||||
# Check logs
|
||||
docker compose logs -f
|
||||
|
||||
# Verify health
|
||||
docker compose ps
|
||||
```
|
||||
|
||||
### Verify Deployment
|
||||
|
||||
```bash
|
||||
# Test registry endpoint
|
||||
curl https://registry.michaelschiemer.de/v2/
|
||||
|
||||
# Expected: Authentication required (401)
|
||||
|
||||
# Test with credentials
|
||||
curl -u admin:yourpassword https://registry.michaelschiemer.de/v2/_catalog
|
||||
|
||||
# Expected: {"repositories":[]}
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Docker Login
|
||||
|
||||
```bash
|
||||
# Login to registry
|
||||
docker login registry.michaelschiemer.de
|
||||
|
||||
# Enter username and password when prompted
|
||||
```
|
||||
|
||||
### Push Images
|
||||
|
||||
```bash
|
||||
# Tag local image for registry
|
||||
docker tag myapp:latest registry.michaelschiemer.de/myapp:latest
|
||||
|
||||
# Push to registry
|
||||
docker push registry.michaelschiemer.de/myapp:latest
|
||||
```
|
||||
|
||||
### Pull Images
|
||||
|
||||
```bash
|
||||
# Pull from registry
|
||||
docker pull registry.michaelschiemer.de/myapp:latest
|
||||
```
|
||||
|
||||
### List Images
|
||||
|
||||
```bash
|
||||
# List all repositories
|
||||
curl -u admin:password https://registry.michaelschiemer.de/v2/_catalog
|
||||
|
||||
# List tags for repository
|
||||
curl -u admin:password https://registry.michaelschiemer.de/v2/myapp/tags/list
|
||||
```
|
||||
|
||||
### Delete Images
|
||||
|
||||
```bash
|
||||
# Get image digest
|
||||
curl -I -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \
|
||||
-u admin:password \
|
||||
https://registry.michaelschiemer.de/v2/myapp/manifests/latest
|
||||
|
||||
# Delete by digest
|
||||
curl -X DELETE -u admin:password \
|
||||
https://registry.michaelschiemer.de/v2/myapp/manifests/sha256:...
|
||||
|
||||
# Run garbage collection
|
||||
docker exec registry bin/registry garbage-collect /etc/docker/registry/config.yml
|
||||
```
|
||||
|
||||
## Integration with Other Stacks
|
||||
|
||||
### Gitea Actions (Stack 2)
|
||||
|
||||
Push built images from Gitea Actions:
|
||||
|
||||
```yaml
|
||||
# .gitea/workflows/build.yml
|
||||
name: Build and Push
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Login to Registry
|
||||
run: |
|
||||
echo "${{ secrets.REGISTRY_PASSWORD }}" | \
|
||||
docker login registry.michaelschiemer.de \
|
||||
-u "${{ secrets.REGISTRY_USER }}" \
|
||||
--password-stdin
|
||||
|
||||
- name: Build and Push
|
||||
run: |
|
||||
docker build -t registry.michaelschiemer.de/myapp:${{ github.sha }} .
|
||||
docker push registry.michaelschiemer.de/myapp:${{ github.sha }}
|
||||
```
|
||||
|
||||
### Application Stack (Stack 4)
|
||||
|
||||
Pull images in application deployment:
|
||||
|
||||
```yaml
|
||||
# In application docker-compose.yml
|
||||
services:
|
||||
app:
|
||||
image: registry.michaelschiemer.de/myapp:latest
|
||||
# ... rest of configuration
|
||||
```
|
||||
|
||||
**Note**: Ensure Docker daemon has registry credentials configured.
|
||||
|
||||
## User Management
|
||||
|
||||
### Add User
|
||||
|
||||
```bash
|
||||
# Add new user
|
||||
htpasswd -B auth/htpasswd newuser
|
||||
|
||||
# Restart registry to apply
|
||||
docker compose restart
|
||||
```
|
||||
|
||||
### Remove User
|
||||
|
||||
```bash
|
||||
# Edit htpasswd file and remove user line
|
||||
nano auth/htpasswd
|
||||
|
||||
# Restart registry
|
||||
docker compose restart
|
||||
```
|
||||
|
||||
### Change Password
|
||||
|
||||
```bash
|
||||
# Update password (removes old entry)
|
||||
htpasswd -B auth/htpasswd username
|
||||
|
||||
# Restart registry
|
||||
docker compose restart
|
||||
```
|
||||
|
||||
## Backup & Recovery
|
||||
|
||||
### Manual Backup
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# backup-registry.sh
|
||||
|
||||
BACKUP_DIR="/backups/registry"
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
|
||||
mkdir -p $BACKUP_DIR
|
||||
|
||||
# Backup registry data
|
||||
docker run --rm \
|
||||
-v registry-data:/data \
|
||||
-v $BACKUP_DIR:/backup \
|
||||
alpine tar czf /backup/registry-data-$DATE.tar.gz -C /data .
|
||||
|
||||
# Backup auth configuration
|
||||
tar czf $BACKUP_DIR/registry-auth-$DATE.tar.gz auth/
|
||||
|
||||
echo "Backup completed: $BACKUP_DIR/*-$DATE.tar.gz"
|
||||
```
|
||||
|
||||
### Restore from Backup
|
||||
|
||||
```bash
|
||||
# Stop registry
|
||||
docker compose down
|
||||
|
||||
# Restore registry data
|
||||
docker run --rm \
|
||||
-v registry-data:/data \
|
||||
-v /backups/registry:/backup \
|
||||
alpine tar xzf /backup/registry-data-YYYYMMDD_HHMMSS.tar.gz -C /data
|
||||
|
||||
# Restore auth
|
||||
tar xzf /backups/registry/registry-auth-YYYYMMDD_HHMMSS.tar.gz
|
||||
|
||||
# Start registry
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### Automated Backups
|
||||
|
||||
Add to crontab:
|
||||
|
||||
```bash
|
||||
# Daily backup at 3 AM
|
||||
0 3 * * * /path/to/backup-registry.sh
|
||||
|
||||
# Keep only last 14 days
|
||||
0 4 * * * find /backups/registry -type f -mtime +14 -delete
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Health Checks
|
||||
|
||||
```bash
|
||||
# Check registry health
|
||||
docker compose ps
|
||||
|
||||
# Registry health endpoint
|
||||
curl -f https://registry.michaelschiemer.de/v2/
|
||||
|
||||
# Check storage usage
|
||||
docker exec registry du -sh /var/lib/registry
|
||||
```
|
||||
|
||||
### Logs
|
||||
|
||||
```bash
|
||||
# View logs
|
||||
docker compose logs -f
|
||||
|
||||
# Check for errors
|
||||
docker compose logs registry | grep -i error
|
||||
|
||||
# Monitor access logs
|
||||
docker compose logs -f registry | grep "GET /v2"
|
||||
```
|
||||
|
||||
### Storage Statistics
|
||||
|
||||
```bash
|
||||
# Check volume size
|
||||
docker volume inspect registry-data
|
||||
|
||||
# Check disk usage
|
||||
docker system df -v | grep registry
|
||||
|
||||
# List images in registry
|
||||
curl -u admin:password https://registry.michaelschiemer.de/v2/_catalog | jq
|
||||
```
|
||||
|
||||
## Garbage Collection
|
||||
|
||||
### Manual Garbage Collection
|
||||
|
||||
```bash
|
||||
# Run garbage collection
|
||||
docker exec registry bin/registry garbage-collect \
|
||||
/etc/docker/registry/config.yml
|
||||
|
||||
# With dry-run
|
||||
docker exec registry bin/registry garbage-collect \
|
||||
--dry-run \
|
||||
/etc/docker/registry/config.yml
|
||||
```
|
||||
|
||||
### Scheduled Garbage Collection
|
||||
|
||||
Add to crontab (on production server):
|
||||
|
||||
```bash
|
||||
# Weekly garbage collection (Sunday 2 AM)
|
||||
0 2 * * 0 docker exec registry bin/registry garbage-collect /etc/docker/registry/config.yml
|
||||
```
|
||||
|
||||
**Note**: Automatic upload purging is enabled (168h old uploads cleaned every 24h).
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Authentication Failed
|
||||
|
||||
```bash
|
||||
# Check htpasswd file exists
|
||||
ls -la auth/htpasswd
|
||||
|
||||
# Verify htpasswd format (should be bcrypt)
|
||||
cat auth/htpasswd
|
||||
# Format: username:$2y$...
|
||||
|
||||
# Test authentication
|
||||
curl -u username:password https://registry.michaelschiemer.de/v2/
|
||||
|
||||
# Check registry logs
|
||||
docker compose logs registry | grep auth
|
||||
```
|
||||
|
||||
### Cannot Push Images
|
||||
|
||||
```bash
|
||||
# Verify Docker login
|
||||
cat ~/.docker/config.json | grep registry.michaelschiemer.de
|
||||
|
||||
# Re-login
|
||||
docker logout registry.michaelschiemer.de
|
||||
docker login registry.michaelschiemer.de
|
||||
|
||||
# Check storage space
|
||||
df -h /var/lib/docker
|
||||
|
||||
# Check registry logs
|
||||
docker compose logs -f registry
|
||||
```
|
||||
|
||||
### SSL Certificate Issues
|
||||
|
||||
```bash
|
||||
# Verify Traefik certificate
|
||||
docker exec traefik cat /acme.json | grep registry.michaelschiemer.de
|
||||
|
||||
# Force certificate renewal (via Traefik)
|
||||
# Remove acme.json and restart Traefik
|
||||
|
||||
# Test SSL
|
||||
openssl s_client -connect registry.michaelschiemer.de:443 -servername registry.michaelschiemer.de < /dev/null
|
||||
```
|
||||
|
||||
### Registry Not Accessible
|
||||
|
||||
```bash
|
||||
# Check service is running
|
||||
docker compose ps
|
||||
|
||||
# Check Traefik routing
|
||||
docker exec traefik cat /etc/traefik/traefik.yml
|
||||
|
||||
# Check network
|
||||
docker network inspect traefik-public | grep registry
|
||||
|
||||
# Test from server
|
||||
curl http://localhost:5000/v2/
|
||||
```
|
||||
|
||||
### Storage Issues
|
||||
|
||||
```bash
|
||||
# Check volume mount
|
||||
docker exec registry df -h /var/lib/registry
|
||||
|
||||
# Check for corrupted layers
|
||||
docker exec registry find /var/lib/registry -type f -name "data" | wc -l
|
||||
|
||||
# Run garbage collection
|
||||
docker exec registry bin/registry garbage-collect /etc/docker/registry/config.yml
|
||||
|
||||
# Check for orphaned data
|
||||
docker volume prune
|
||||
```
|
||||
|
||||
## Security
|
||||
|
||||
### Security Best Practices
|
||||
|
||||
1. **Strong Passwords**: Use bcrypt (htpasswd -B) with strong passwords
|
||||
2. **SSL Only**: Always use HTTPS (enforced via Traefik)
|
||||
3. **User Management**: Regularly review and rotate credentials
|
||||
4. **Access Logging**: Monitor access logs for suspicious activity
|
||||
5. **Firewall**: Only expose port 443 (handled by Traefik)
|
||||
6. **Backup Encryption**: Encrypt backups containing sensitive data
|
||||
7. **Minimal Permissions**: Limit registry access to necessary users
|
||||
|
||||
### Update Stack
|
||||
|
||||
```bash
|
||||
# Pull latest images
|
||||
docker compose pull
|
||||
|
||||
# Recreate containers
|
||||
docker compose up -d
|
||||
|
||||
# Verify
|
||||
docker compose ps
|
||||
```
|
||||
|
||||
### Security Headers
|
||||
|
||||
Security headers are applied via Traefik's `default-chain@file` middleware:
|
||||
- HSTS
|
||||
- Content-Type Nosniff
|
||||
- XSS Protection
|
||||
- Frame Deny
|
||||
|
||||
## Docker Daemon Configuration
|
||||
|
||||
### Configure Docker to Trust Registry
|
||||
|
||||
On machines that will push/pull from registry:
|
||||
|
||||
```bash
|
||||
# Edit daemon.json
|
||||
sudo nano /etc/docker/daemon.json
|
||||
```
|
||||
|
||||
Add:
|
||||
```json
|
||||
{
|
||||
"insecure-registries": [],
|
||||
"registry-mirrors": [],
|
||||
"log-driver": "json-file",
|
||||
"log-opts": {
|
||||
"max-size": "10m",
|
||||
"max-file": "3"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Note**: No need to add registry to `insecure-registries` since we use SSL.
|
||||
|
||||
```bash
|
||||
# Restart Docker
|
||||
sudo systemctl restart docker
|
||||
|
||||
# Verify
|
||||
docker info | grep Registry
|
||||
```
|
||||
|
||||
### Configure Credentials
|
||||
|
||||
```bash
|
||||
# Login once per machine
|
||||
docker login registry.michaelschiemer.de
|
||||
|
||||
# Credentials stored in ~/.docker/config.json
|
||||
```
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Registry Configuration
|
||||
|
||||
For high-traffic registries, edit `docker-compose.yml`:
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
# Increase concurrent operations
|
||||
- REGISTRY_STORAGE_MAXCONCURRENCY=50
|
||||
|
||||
# Cache settings
|
||||
- REGISTRY_STORAGE_CACHE_BLOBDESCRIPTOR=inmemory
|
||||
|
||||
# Rate limiting
|
||||
- REGISTRY_HTTP_RATELIMIT_REQUESTS_PER_SECOND=100
|
||||
```
|
||||
|
||||
### Storage Optimization
|
||||
|
||||
```bash
|
||||
# Enable compression for layers (reduces storage)
|
||||
# Already enabled in v2.8
|
||||
|
||||
# Monitor storage growth
|
||||
du -sh /var/lib/docker/volumes/registry-data/
|
||||
|
||||
# Schedule regular garbage collection
|
||||
# See "Scheduled Garbage Collection" section
|
||||
```
|
||||
|
||||
## Migration from Docker Hub
|
||||
|
||||
### Pull and Re-push Images
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# migrate-images.sh
|
||||
|
||||
IMAGES=(
|
||||
"nginx:latest"
|
||||
"node:18-alpine"
|
||||
"postgres:14"
|
||||
)
|
||||
|
||||
for image in "${IMAGES[@]}"; do
|
||||
echo "Migrating $image..."
|
||||
|
||||
# Pull from Docker Hub
|
||||
docker pull $image
|
||||
|
||||
# Tag for private registry
|
||||
docker tag $image registry.michaelschiemer.de/$image
|
||||
|
||||
# Push to private registry
|
||||
docker push registry.michaelschiemer.de/$image
|
||||
|
||||
echo "✅ Migrated $image"
|
||||
done
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
### Registry API v2
|
||||
|
||||
```bash
|
||||
# List catalog
|
||||
GET /v2/_catalog
|
||||
|
||||
# List tags
|
||||
GET /v2/<name>/tags/list
|
||||
|
||||
# Get manifest
|
||||
GET /v2/<name>/manifests/<reference>
|
||||
|
||||
# Delete manifest
|
||||
DELETE /v2/<name>/manifests/<digest>
|
||||
|
||||
# Check blob exists
|
||||
HEAD /v2/<name>/blobs/<digest>
|
||||
```
|
||||
|
||||
**Authentication**: All endpoints require BasicAuth.
|
||||
|
||||
**Documentation**: https://docs.docker.com/registry/spec/api/
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **Docker Registry Documentation**: https://docs.docker.com/registry/
|
||||
- **Registry Configuration**: https://docs.docker.com/registry/configuration/
|
||||
- **Storage Drivers**: https://docs.docker.com/registry/storage-drivers/
|
||||
- **Token Authentication**: https://docs.docker.com/registry/spec/auth/token/
|
||||
54
deployment/stacks/registry/docker-compose.yml
Normal file
54
deployment/stacks/registry/docker-compose.yml
Normal file
@@ -0,0 +1,54 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
registry:
|
||||
image: registry:2.8
|
||||
container_name: registry
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- traefik-public
|
||||
ports:
|
||||
- "127.0.0.1:5000:5000"
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
- REGISTRY_STORAGE_DELETE_ENABLED=true
|
||||
- REGISTRY_AUTH=htpasswd
|
||||
- REGISTRY_AUTH_HTPASSWD_REALM=Registry Realm
|
||||
- REGISTRY_AUTH_HTPASSWD_PATH=/auth/htpasswd
|
||||
- REGISTRY_HTTP_SECRET=${REGISTRY_HTTP_SECRET}
|
||||
# Storage configuration
|
||||
- REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY=/var/lib/registry
|
||||
# Garbage collection configuration (moved to config.yml if needed)
|
||||
volumes:
|
||||
- registry-data:/var/lib/registry
|
||||
- ./auth:/auth:ro
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
|
||||
# HTTP Router
|
||||
- "traefik.http.routers.registry.rule=Host(`${REGISTRY_DOMAIN:-registry.michaelschiemer.de}`)"
|
||||
- "traefik.http.routers.registry.entrypoints=websecure"
|
||||
- "traefik.http.routers.registry.tls=true"
|
||||
- "traefik.http.routers.registry.tls.certresolver=letsencrypt"
|
||||
|
||||
# Service
|
||||
- "traefik.http.services.registry.loadbalancer.server.port=5000"
|
||||
|
||||
# Middleware
|
||||
- "traefik.http.routers.registry.middlewares=default-chain@file"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:5000/v2/"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
|
||||
volumes:
|
||||
registry-data:
|
||||
name: registry-data
|
||||
|
||||
networks:
|
||||
traefik-public:
|
||||
external: true
|
||||
20
deployment/stacks/traefik/.env.example
Normal file
20
deployment/stacks/traefik/.env.example
Normal file
@@ -0,0 +1,20 @@
|
||||
# Traefik Configuration
|
||||
# Copy this file to .env and adjust values
|
||||
|
||||
# Timezone
|
||||
TZ=Europe/Berlin
|
||||
|
||||
# Let's Encrypt Email
|
||||
ACME_EMAIL=kontakt@michaelschiemer.de
|
||||
|
||||
# Domain
|
||||
DOMAIN=michaelschiemer.de
|
||||
|
||||
# Dashboard Authentication
|
||||
# Generate password hash with: htpasswd -nb admin your_password
|
||||
# Replace $ with $$ in docker-compose.yml
|
||||
TRAEFIK_DASHBOARD_USER=admin
|
||||
TRAEFIK_DASHBOARD_PASSWORD_HASH=
|
||||
|
||||
# Log Level (DEBUG, INFO, WARN, ERROR)
|
||||
LOG_LEVEL=INFO
|
||||
372
deployment/stacks/traefik/README.md
Normal file
372
deployment/stacks/traefik/README.md
Normal file
@@ -0,0 +1,372 @@
|
||||
# Traefik Stack - Reverse Proxy with SSL
|
||||
|
||||
## Overview
|
||||
|
||||
Traefik acts as the central reverse proxy for all services, handling:
|
||||
- Automatic SSL certificate generation via Let's Encrypt
|
||||
- HTTP to HTTPS redirection
|
||||
- Service discovery via Docker labels
|
||||
- Security headers and compression
|
||||
- Rate limiting and access control
|
||||
|
||||
## Services
|
||||
|
||||
- **traefik.michaelschiemer.de** - Traefik Dashboard (BasicAuth protected)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Docker Network**
|
||||
```bash
|
||||
docker network create traefik-public
|
||||
```
|
||||
|
||||
2. **ACME Storage File**
|
||||
```bash
|
||||
touch acme.json
|
||||
chmod 600 acme.json
|
||||
```
|
||||
|
||||
3. **DNS Configuration**
|
||||
Point these domains to your server IP (94.16.110.151):
|
||||
- `michaelschiemer.de`
|
||||
- `*.michaelschiemer.de` (wildcard)
|
||||
|
||||
## Configuration
|
||||
|
||||
### 1. Create Environment File
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
### 2. Generate Dashboard Password
|
||||
|
||||
```bash
|
||||
# Generate password hash
|
||||
htpasswd -nb admin your_secure_password
|
||||
|
||||
# Example output:
|
||||
# admin:$apr1$8kj9d7lj$r.x5jhLVPLuCDLvJ6x0Hd0
|
||||
|
||||
# Important: In docker-compose.yml, replace $ with $$
|
||||
# admin:$$apr1$$8kj9d7lj$$r.x5jhLVPLuCDLvJ6x0Hd0
|
||||
```
|
||||
|
||||
Update the `traefik.http.middlewares.traefik-auth.basicauth.users` label in `docker-compose.yml`.
|
||||
|
||||
### 3. Adjust Configuration (Optional)
|
||||
|
||||
Edit `traefik.yml` for:
|
||||
- Log levels
|
||||
- Certificate resolvers
|
||||
- Additional entry points
|
||||
- Metrics configuration
|
||||
|
||||
## Deployment
|
||||
|
||||
### Initial Setup
|
||||
|
||||
```bash
|
||||
# Create network
|
||||
docker network create traefik-public
|
||||
|
||||
# Create acme.json
|
||||
touch acme.json
|
||||
chmod 600 acme.json
|
||||
|
||||
# Create log directories
|
||||
mkdir -p logs
|
||||
|
||||
# Start Traefik
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### Verify Deployment
|
||||
|
||||
```bash
|
||||
# Check container status
|
||||
docker compose ps
|
||||
|
||||
# Check logs
|
||||
docker compose logs -f
|
||||
|
||||
# Test dashboard access
|
||||
curl -I https://traefik.michaelschiemer.de
|
||||
|
||||
# Check certificate
|
||||
openssl s_client -connect traefik.michaelschiemer.de:443 -servername traefik.michaelschiemer.de < /dev/null
|
||||
```
|
||||
|
||||
## Middleware Configuration
|
||||
|
||||
Traefik provides several reusable middlewares in `dynamic/middlewares.yml`:
|
||||
|
||||
### Security Headers
|
||||
```yaml
|
||||
labels:
|
||||
- "traefik.http.routers.myapp.middlewares=security-headers-global@file"
|
||||
```
|
||||
|
||||
### Rate Limiting
|
||||
```yaml
|
||||
labels:
|
||||
# Strict: 50 req/s
|
||||
- "traefik.http.routers.myapp.middlewares=rate-limit-strict@file"
|
||||
|
||||
# Moderate: 100 req/s
|
||||
- "traefik.http.routers.myapp.middlewares=rate-limit-moderate@file"
|
||||
|
||||
# Lenient: 200 req/s
|
||||
- "traefik.http.routers.myapp.middlewares=rate-limit-lenient@file"
|
||||
```
|
||||
|
||||
### Compression
|
||||
```yaml
|
||||
labels:
|
||||
- "traefik.http.routers.myapp.middlewares=gzip-compression@file"
|
||||
```
|
||||
|
||||
### Middleware Chains
|
||||
```yaml
|
||||
labels:
|
||||
# Default chain: Security + Compression
|
||||
- "traefik.http.routers.myapp.middlewares=default-chain@file"
|
||||
|
||||
# Admin chain: Security + Compression + Rate Limiting
|
||||
- "traefik.http.routers.myapp.middlewares=admin-chain@file"
|
||||
```
|
||||
|
||||
## Service Integration
|
||||
|
||||
### Example Service Configuration
|
||||
|
||||
Add these labels to any Docker service to expose it through Traefik:
|
||||
|
||||
```yaml
|
||||
services:
|
||||
myapp:
|
||||
image: myapp:latest
|
||||
networks:
|
||||
- traefik-public
|
||||
labels:
|
||||
# Enable Traefik
|
||||
- "traefik.enable=true"
|
||||
|
||||
# Router configuration
|
||||
- "traefik.http.routers.myapp.rule=Host(`app.michaelschiemer.de`)"
|
||||
- "traefik.http.routers.myapp.entrypoints=websecure"
|
||||
- "traefik.http.routers.myapp.tls=true"
|
||||
- "traefik.http.routers.myapp.tls.certresolver=letsencrypt"
|
||||
|
||||
# Service configuration
|
||||
- "traefik.http.services.myapp.loadbalancer.server.port=80"
|
||||
|
||||
# Middleware (optional)
|
||||
- "traefik.http.routers.myapp.middlewares=default-chain@file"
|
||||
|
||||
networks:
|
||||
traefik-public:
|
||||
external: true
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Dashboard Access
|
||||
|
||||
Access the Traefik dashboard at: https://traefik.michaelschiemer.de
|
||||
|
||||
Default credentials (change in production):
|
||||
- Username: `admin`
|
||||
- Password: (set via htpasswd)
|
||||
|
||||
### Logs
|
||||
|
||||
```bash
|
||||
# Access logs (HTTP requests)
|
||||
tail -f logs/access.log
|
||||
|
||||
# Traefik logs (errors, warnings)
|
||||
tail -f logs/traefik.log
|
||||
|
||||
# Container logs
|
||||
docker compose logs -f traefik
|
||||
```
|
||||
|
||||
### Prometheus Metrics
|
||||
|
||||
Traefik exposes Prometheus metrics for monitoring:
|
||||
|
||||
```yaml
|
||||
# Add to Prometheus scrape config
|
||||
- job_name: 'traefik'
|
||||
static_configs:
|
||||
- targets: ['traefik:8080']
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Certificate Issues
|
||||
|
||||
```bash
|
||||
# Check acme.json permissions
|
||||
ls -la acme.json
|
||||
# Should be: -rw------- (600)
|
||||
|
||||
# View certificate status
|
||||
docker compose logs traefik | grep -i "certificate"
|
||||
|
||||
# Force certificate renewal (WARNING: deletes ALL stored certificates; Let's Encrypt rate limits apply)
|
||||
rm acme.json
|
||||
touch acme.json
|
||||
chmod 600 acme.json
|
||||
docker compose restart
|
||||
```
|
||||
|
||||
### DNS Issues
|
||||
|
||||
```bash
|
||||
# Verify DNS resolution
|
||||
dig michaelschiemer.de
|
||||
dig git.michaelschiemer.de
|
||||
|
||||
# Check from external
|
||||
nslookup michaelschiemer.de 8.8.8.8
|
||||
```
|
||||
|
||||
### Service Not Accessible
|
||||
|
||||
```bash
|
||||
# Check Traefik can reach service
|
||||
docker network inspect traefik-public
|
||||
|
||||
# Verify service labels
|
||||
docker inspect <container_name> | grep -A 20 Labels
|
||||
|
||||
# Check Traefik logs for routing errors
|
||||
docker compose logs traefik | grep -i error
|
||||
```
|
||||
|
||||
### Port Conflicts
|
||||
|
||||
```bash
|
||||
# Check if ports 80/443 are free
|
||||
sudo netstat -tlnp | grep -E ':80|:443'
|
||||
|
||||
# Stop conflicting services
|
||||
sudo systemctl stop nginx # or apache2
|
||||
```
|
||||
|
||||
## Security Hardening
|
||||
|
||||
### 1. IP Whitelisting
|
||||
|
||||
Uncomment and configure in `dynamic/middlewares.yml`:
|
||||
|
||||
```yaml
|
||||
admin-whitelist:
|
||||
ipAllowList:
|
||||
sourceRange:
|
||||
- "your.vpn.ip.range/32"
|
||||
- "10.0.0.0/8"
|
||||
```
|
||||
|
||||
### 2. Strong Dashboard Password
|
||||
|
||||
```bash
|
||||
# Generate strong password
|
||||
openssl rand -base64 32
|
||||
|
||||
# Create hash
|
||||
htpasswd -nb admin "your_strong_password"
|
||||
```
|
||||
|
||||
### 3. Rate Limiting
|
||||
|
||||
Apply rate limiting to sensitive endpoints:
|
||||
|
||||
```yaml
|
||||
labels:
|
||||
- "traefik.http.routers.admin.middlewares=rate-limit-strict@file"
|
||||
```
|
||||
|
||||
### 4. DDoS Protection
|
||||
|
||||
```yaml
|
||||
# In traefik.yml - add entry point middleware
|
||||
entryPoints:
|
||||
websecure:
|
||||
address: ":443"
|
||||
http:
|
||||
middlewares:
|
||||
- rate-limit-moderate@file
|
||||
```
|
||||
|
||||
## Backup
|
||||
|
||||
### Important Files
|
||||
|
||||
- `acme.json` - SSL certificates
|
||||
- `traefik.yml` - Static configuration
|
||||
- `dynamic/` - Dynamic configuration
|
||||
|
||||
```bash
|
||||
# Backup certificates
|
||||
cp acme.json acme.json.backup.$(date +%Y%m%d)
|
||||
|
||||
# Backup configuration
|
||||
tar -czf traefik-config-backup.tar.gz traefik.yml dynamic/
|
||||
```
|
||||
|
||||
## Updates
|
||||
|
||||
```bash
|
||||
# Pull latest image
|
||||
docker compose pull
|
||||
|
||||
# Restart with new image
|
||||
docker compose up -d
|
||||
|
||||
# Verify
|
||||
docker compose ps
|
||||
docker compose logs -f
|
||||
```
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Connection Limits
|
||||
|
||||
In `traefik.yml`:
|
||||
|
||||
```yaml
|
||||
entryPoints:
|
||||
websecure:
|
||||
transport:
|
||||
respondingTimeouts:
|
||||
readTimeout: 60s
|
||||
writeTimeout: 60s
|
||||
lifeCycle:
|
||||
requestAcceptGraceTimeout: 0s
|
||||
graceTimeOut: 10s
|
||||
```
|
||||
|
||||
### Resource Limits
|
||||
|
||||
In `docker-compose.yml`:
|
||||
|
||||
```yaml
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 512M
|
||||
cpus: '0.5'
|
||||
reservations:
|
||||
memory: 256M
|
||||
cpus: '0.25'
|
||||
```
|
||||
|
||||
## Support
|
||||
|
||||
For issues with Traefik configuration:
|
||||
1. Check official Traefik documentation: https://doc.traefik.io/traefik/
|
||||
2. Review logs: `docker compose logs -f`
|
||||
3. Verify network connectivity: `docker network inspect traefik-public`
|
||||
80
deployment/stacks/traefik/docker-compose.yml
Normal file
80
deployment/stacks/traefik/docker-compose.yml
Normal file
@@ -0,0 +1,80 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
traefik:
|
||||
image: traefik:v3.0
|
||||
container_name: traefik
|
||||
restart: unless-stopped
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
networks:
|
||||
- traefik-public
|
||||
ports:
|
||||
- "80:80"
|
||||
- "443:443"
|
||||
environment:
|
||||
- TZ=Europe/Berlin
|
||||
volumes:
|
||||
# Docker socket for service discovery
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
# Static configuration
|
||||
- ./traefik.yml:/traefik.yml:ro
|
||||
# Dynamic configuration
|
||||
- ./dynamic:/dynamic:ro
|
||||
# SSL certificates
|
||||
- ./acme.json:/acme.json
|
||||
# Logs
|
||||
- ./logs:/logs
|
||||
labels:
|
||||
# Enable Traefik for itself
|
||||
- "traefik.enable=true"
|
||||
|
||||
# Dashboard
|
||||
- "traefik.http.routers.traefik-dashboard.rule=Host(`traefik.michaelschiemer.de`)"
|
||||
- "traefik.http.routers.traefik-dashboard.entrypoints=websecure"
|
||||
- "traefik.http.routers.traefik-dashboard.tls=true"
|
||||
- "traefik.http.routers.traefik-dashboard.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.routers.traefik-dashboard.service=api@internal"
|
||||
- "traefik.http.routers.traefik-dashboard.middlewares=traefik-auth"
|
||||
|
||||
# BasicAuth for dashboard (user: admin, password: generate with htpasswd)
|
||||
# htpasswd -nb admin your_password
|
||||
- "traefik.http.middlewares.traefik-auth.basicauth.users=admin:$$apr1$$8kj9d7lj$$r.x5jhLVPLuCDLvJ6x0Hd0"
|
||||
|
||||
# Allow ACME challenges without redirect (higher priority)
|
||||
- "traefik.http.routers.acme-challenge.rule=PathPrefix(`/.well-known/acme-challenge`)"
|
||||
- "traefik.http.routers.acme-challenge.entrypoints=web"
|
||||
- "traefik.http.routers.acme-challenge.priority=200"
|
||||
|
||||
# Global redirect to HTTPS (lower priority, matches everything else)
|
||||
- "traefik.http.routers.http-catchall.rule=HostRegexp(`.+`)"
|
||||
- "traefik.http.routers.http-catchall.entrypoints=web"
|
||||
- "traefik.http.routers.http-catchall.middlewares=redirect-to-https"
|
||||
- "traefik.http.routers.http-catchall.priority=1"
|
||||
- "traefik.http.middlewares.redirect-to-https.redirectscheme.scheme=https"
|
||||
- "traefik.http.middlewares.redirect-to-https.redirectscheme.permanent=true"
|
||||
|
||||
# Security headers middleware
|
||||
- "traefik.http.middlewares.security-headers.headers.frameDeny=true"
|
||||
- "traefik.http.middlewares.security-headers.headers.contentTypeNosniff=true"
|
||||
- "traefik.http.middlewares.security-headers.headers.browserXssFilter=true"
|
||||
- "traefik.http.middlewares.security-headers.headers.stsSeconds=31536000"
|
||||
- "traefik.http.middlewares.security-headers.headers.stsIncludeSubdomains=true"
|
||||
- "traefik.http.middlewares.security-headers.headers.stsPreload=true"
|
||||
|
||||
# Compression middleware
|
||||
- "traefik.http.middlewares.compression.compress=true"
|
||||
|
||||
# Rate limiting middleware (100 requests per second)
|
||||
- "traefik.http.middlewares.rate-limit.ratelimit.average=100"
|
||||
- "traefik.http.middlewares.rate-limit.ratelimit.burst=50"
|
||||
healthcheck:
|
||||
test: ["CMD", "traefik", "healthcheck", "--ping"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
|
||||
networks:
|
||||
traefik-public:
|
||||
external: true
|
||||
15
deployment/stacks/traefik/dynamic/gitea.yml
Normal file
15
deployment/stacks/traefik/dynamic/gitea.yml
Normal file
@@ -0,0 +1,15 @@
|
||||
http:
|
||||
routers:
|
||||
gitea:
|
||||
rule: Host(`git.michaelschiemer.de`)
|
||||
entrypoints:
|
||||
- websecure
|
||||
service: gitea
|
||||
tls:
|
||||
certResolver: letsencrypt
|
||||
priority: 100
|
||||
services:
|
||||
gitea:
|
||||
loadBalancer:
|
||||
servers:
|
||||
- url: http://gitea:3000
|
||||
68
deployment/stacks/traefik/dynamic/middlewares.yml
Normal file
68
deployment/stacks/traefik/dynamic/middlewares.yml
Normal file
@@ -0,0 +1,68 @@
|
||||
# Dynamic Middleware Configuration
|
||||
|
||||
http:
|
||||
middlewares:
|
||||
# Security headers for all services
|
||||
security-headers-global:
|
||||
headers:
|
||||
frameDeny: true
|
||||
contentTypeNosniff: true
|
||||
browserXssFilter: true
|
||||
stsSeconds: 31536000
|
||||
stsIncludeSubdomains: true
|
||||
stsPreload: true
|
||||
forceSTSHeader: true
|
||||
customFrameOptionsValue: "SAMEORIGIN"
|
||||
contentSecurityPolicy: "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';"
|
||||
referrerPolicy: "strict-origin-when-cross-origin"
|
||||
permissionsPolicy: "geolocation=(), microphone=(), camera=()"
|
||||
|
||||
# Compression for better performance
|
||||
gzip-compression:
|
||||
compress:
|
||||
excludedContentTypes:
|
||||
- text/event-stream
|
||||
|
||||
# Rate limiting - strict
|
||||
rate-limit-strict:
|
||||
rateLimit:
|
||||
average: 50
|
||||
burst: 25
|
||||
period: 1s
|
||||
|
||||
# Rate limiting - moderate
|
||||
rate-limit-moderate:
|
||||
rateLimit:
|
||||
average: 100
|
||||
burst: 50
|
||||
period: 1s
|
||||
|
||||
# Rate limiting - lenient
|
||||
rate-limit-lenient:
|
||||
rateLimit:
|
||||
average: 200
|
||||
burst: 100
|
||||
period: 1s
|
||||
|
||||
# IP whitelist for admin services (example)
|
||||
# Uncomment and adjust for production
|
||||
# admin-whitelist:
|
||||
# ipAllowList:
|
||||
# sourceRange:
|
||||
# - "127.0.0.1/32"
|
||||
# - "10.0.0.0/8"
|
||||
|
||||
# Chain multiple middlewares
|
||||
default-chain:
|
||||
chain:
|
||||
middlewares:
|
||||
- security-headers-global
|
||||
- gzip-compression
|
||||
|
||||
admin-chain:
|
||||
chain:
|
||||
middlewares:
|
||||
- security-headers-global
|
||||
- gzip-compression
|
||||
- rate-limit-strict
|
||||
# - admin-whitelist # Uncomment for IP whitelisting
|
||||
85
deployment/stacks/traefik/traefik.yml
Normal file
85
deployment/stacks/traefik/traefik.yml
Normal file
@@ -0,0 +1,85 @@
|
||||
# Static Configuration for Traefik
|
||||
|
||||
# Global Configuration
|
||||
global:
|
||||
checkNewVersion: true
|
||||
sendAnonymousUsage: false
|
||||
|
||||
# API and Dashboard
|
||||
api:
|
||||
dashboard: true
|
||||
insecure: false
|
||||
|
||||
# Entry Points
|
||||
entryPoints:
|
||||
web:
|
||||
address: ":80"
|
||||
# No global redirect - ACME challenges need HTTP access
|
||||
# Redirects are handled per-router via middleware
|
||||
|
||||
websecure:
|
||||
address: ":443"
|
||||
http:
|
||||
tls:
|
||||
certResolver: letsencrypt
|
||||
domains:
|
||||
- main: michaelschiemer.de
|
||||
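# Wildcard SANs can only be issued via the DNS-01 challenge. The resolver
# below uses httpChallenge, so enable dnsChallenge before uncommenting.
# sans:
# - "*.michaelschiemer.de"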
|
||||
middlewares:
|
||||
- security-headers@docker
|
||||
- compression@docker
|
||||
|
||||
# Certificate Resolvers
|
||||
certificatesResolvers:
|
||||
letsencrypt:
|
||||
acme:
|
||||
email: kontakt@michaelschiemer.de
|
||||
storage: /acme.json
|
||||
caServer: https://acme-v02.api.letsencrypt.org/directory
|
||||
# Use HTTP-01 challenge (requires port 80 accessible)
|
||||
httpChallenge:
|
||||
entryPoint: web
|
||||
# Uncomment for DNS challenge (requires DNS provider)
|
||||
# dnsChallenge:
|
||||
# provider: cloudflare
|
||||
# delayBeforeCheck: 30
|
||||
|
||||
# Providers
|
||||
providers:
|
||||
docker:
|
||||
endpoint: "unix:///var/run/docker.sock"
|
||||
exposedByDefault: false
|
||||
network: traefik-public
|
||||
watch: true
|
||||
|
||||
file:
|
||||
directory: /dynamic
|
||||
watch: true
|
||||
|
||||
# Logging
|
||||
log:
|
||||
level: INFO
|
||||
filePath: /logs/traefik.log
|
||||
format: json
|
||||
|
||||
# Access Logs
|
||||
accessLog:
|
||||
filePath: /logs/access.log
|
||||
format: json
|
||||
bufferingSize: 100
|
||||
filters:
|
||||
statusCodes:
|
||||
- "400-499"
|
||||
- "500-599"
|
||||
|
||||
# Metrics
|
||||
metrics:
|
||||
prometheus:
|
||||
addEntryPointsLabels: true
|
||||
addRoutersLabels: true
|
||||
addServicesLabels: true
|
||||
|
||||
# Ping
|
||||
ping:
|
||||
entryPoint: web
|
||||
Reference in New Issue
Block a user