feat(devops): CI/CD + Docker + Monitoring infrastructure

- Multi-stage Dockerfile (builder+runner, <500MB target) - docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx - .env.example with all required secrets (never hardcoded) - requirements.txt with all dependencies including prometheus-client, alembic - GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push - GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback - Alembic migration setup + initial PostgreSQL schema (001_initial_schema) - SQLite→PostgreSQL data migration script - Prometheus metrics module (HTTP, ML, DB, business metrics) - Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy) - Grafana dashboard (overview: req/s, p95, ML accuracy, error rate) - Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers) - Structured JSON logging module - Automated daily DB backup script (pg_dump + 30-day retention) Branch: feature/devops-cicd Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-25 17:32:02 +02:00
parent ed07c8a3d1
commit dce1e9b744
25 changed files with 2659 additions and 0 deletions
--- a/infra/grafana/dashboards/turf-saas-overview.json
+++ b/infra/grafana/dashboards/turf-saas-overview.json
@@ -0,0 +1,174 @@
+{
+  "title": "Turf SaaS — Overview",
+  "uid": "turf-saas-overview",
+  "schemaVersion": 38,
+  "version": 1,
+  "refresh": "30s",
+  "time": { "from": "now-6h", "to": "now" },
+  "tags": ["turf-saas"],
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Request Rate (req/s)",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "sum(rate(http_requests_total[5m]))",
+          "legendFormat": "req/s"
+        }
+      ],
+      "options": { "colorMode": "background", "graphMode": "area" }
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "Error Rate (5xx)",
+      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
+          "legendFormat": "error %"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 0.5 },
+              { "color": "red", "value": 1 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "p95 Latency",
+      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
+          "legendFormat": "p95"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 1 },
+              { "color": "red", "value": 2 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "ML Top-1 Accuracy",
+      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "ml_prediction_accuracy_ratio{accuracy_type=\"top1\"} * 100",
+          "legendFormat": "top-1 %"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "red", "value": null },
+              { "color": "yellow", "value": 25 },
+              { "color": "green", "value": 35 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "HTTP Requests by Service",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "sum(rate(http_requests_total[5m])) by (service)",
+          "legendFormat": "{{ service }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "reqps" }
+      }
+    },
+    {
+      "id": 6,
+      "type": "timeseries",
+      "title": "Request Duration p50/p95/p99",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
+          "legendFormat": "p50"
+        },
+        {
+          "datasource": "Prometheus",
+          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
+          "legendFormat": "p95"
+        },
+        {
+          "datasource": "Prometheus",
+          "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
+          "legendFormat": "p99"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "s" }
+      }
+    },
+    {
+      "id": 7,
+      "type": "timeseries",
+      "title": "ML Predictions per Hour",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "sum(increase(ml_predictions_total[1h])) by (model_type)",
+          "legendFormat": "{{ model_type }}"
+        }
+      ]
+    },
+    {
+      "id": 8,
+      "type": "timeseries",
+      "title": "DB Query Duration",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le, operation))",
+          "legendFormat": "{{ operation }} p95"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "s" }
+      }
+    }
+  ]
+}
--- a/infra/grafana/provisioning/dashboards/dashboards.yml
+++ b/infra/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,11 @@
+apiVersion: 1  # Grafana dashboard provisioning: registers one file-based dashboard provider
+
+providers:
+  - name: turf-saas-dashboards
+    type: file  # dashboards are loaded from JSON files on disk
+    disableDeletion: false  # NOTE(review): presumably lets a dashboard be removed once its file disappears — confirm against the Grafana provisioning docs
+    updateIntervalSeconds: 30  # the dashboard directory is rescanned for changes every 30 seconds
+    allowUiUpdates: true  # provisioned dashboards may be edited in the UI; NOTE(review): confirm such edits are not expected to survive a file change
+    options:
+      path: /var/lib/grafana/dashboards  # directory inside the Grafana container; assumes infra/grafana/dashboards is mounted here — TODO confirm in docker-compose.yml
+      foldersFromFilesStructure: true  # sub-directories under `path` become Grafana folders
--- a/infra/grafana/provisioning/datasources/prometheus.yml
+++ b/infra/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,13 @@
+apiVersion: 1  # Grafana datasource provisioning: declares the single Prometheus datasource
+
+datasources:
+  - name: Prometheus  # the overview dashboard's panels select their datasource by this name ("datasource": "Prometheus")
+    type: prometheus
+    uid: prometheus-main  # stable identifier for referencing this datasource by uid
+    access: proxy  # queries are issued by the Grafana backend rather than by the browser
+    url: http://prometheus:9090  # assumes the Prometheus container is reachable as `prometheus` on the shared network — TODO confirm in docker-compose.yml
+    isDefault: true
+    editable: false  # the provisioned datasource cannot be changed from the UI
+    jsonData:
+      httpMethod: POST
+      timeInterval: "15s"  # same value as global.scrape_interval in infra/prometheus/prometheus.yml
--- a/infra/nginx/conf.d/turf.conf
+++ b/infra/nginx/conf.d/turf.conf
@@ -0,0 +1,157 @@
+# ============================================================
+# Nginx Virtual Host — Turf SaaS
+# ============================================================
+
+# Upstream service pools — one backend host per pool; the ports match the scrape targets in infra/prometheus/prometheus.yml
+upstream combined_api {
+    server combined-api:8790;
+    keepalive 32;  # idle connections kept open per worker; only reused by locations that set proxy_http_version 1.1 and an empty Connection header
+}
+
+upstream dashboard_api {
+    server dashboard-api:8791;
+    keepalive 16;
+}
+
+upstream portal {
+    server portal:8792;
+    keepalive 16;
+}
+
+upstream grafana {
+    server grafana:3000;
+    keepalive 4;  # smallest pool of the four upstreams
+}
+
+# ----------------------------------------------------------
+# HTTP → HTTPS redirect (port 80 serves ACME challenge files only; every other request gets a 301)
+# ----------------------------------------------------------
+server {
+    listen 80;
+    server_name _;  # placeholder name; no other port-80 server is defined in this file, so this block answers for every Host
+
+    # Let's Encrypt ACME challenge — challenge files are served from the /var/www/certbot webroot
+    location /.well-known/acme-challenge/ {
+        root /var/www/certbot;
+    }
+
+    location / {
+        return 301 https://$host$request_uri;  # permanent redirect that keeps the requested host, path and query string
+    }
+}
+
+# ----------------------------------------------------------
+# HTTPS main server
+#
+# Terminates TLS and routes by path prefix:
+#   /                -> portal
+#   /api/            -> combined_api   (path forwarded unchanged)
+#   /dashboard-api/  -> dashboard_api  (prefix stripped)
+#   /grafana/        -> grafana        (path forwarded unchanged)
+#   /health          -> combined_api /health
+#
+# NOTE(review): ${DOMAIN} is not an nginx variable. This file presumably
+# has to be rendered (e.g. with envsubst) before nginx loads it — confirm
+# how the image/compose setup performs that substitution.
+# ----------------------------------------------------------
+server {
+    listen 443 ssl;
+    http2 on;
+    server_name ${DOMAIN};
+
+    # TLS configuration (Let's Encrypt certificate paths, TLS 1.2/1.3 only)
+    ssl_certificate /etc/letsencrypt/live/${DOMAIN}/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/${DOMAIN}/privkey.pem;
+    ssl_session_cache shared:SSL:10m;
+    ssl_session_timeout 10m;
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384;
+    ssl_prefer_server_ciphers on;
+    ssl_stapling on;
+    ssl_stapling_verify on;
+
+    # Security headers. They are declared once at server level; no location
+    # below declares its own add_header, so all of them inherit this set.
+    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
+    add_header X-Frame-Options DENY always;
+    add_header X-Content-Type-Options nosniff always;
+    add_header X-XSS-Protection "1; mode=block" always;
+    add_header Referrer-Policy strict-origin-when-cross-origin always;
+    add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data:;" always;
+
+    # Limits (conn_limit is the per-client-IP zone declared in nginx.conf;
+    # being set at server level it applies to every location, /health included)
+    client_max_body_size 10M;
+    limit_conn conn_limit 20;
+
+    # ----------------------------------------------------------
+    # Portal (root)
+    # ----------------------------------------------------------
+    location / {
+        limit_req zone=global burst=50 nodelay;
+        proxy_pass http://portal;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        # Empty Connection header + HTTP/1.1 lets the upstream keepalive pool be reused
+        proxy_set_header Connection "";
+        proxy_read_timeout 60s;
+    }
+
+    # ----------------------------------------------------------
+    # Combined API (proxy_pass has no URI part, so /api/... is forwarded as-is)
+    # ----------------------------------------------------------
+    location /api/ {
+        limit_req zone=api burst=20 nodelay;
+        proxy_pass http://combined_api;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header Connection "";
+        proxy_read_timeout 120s;
+    }
+
+    # ----------------------------------------------------------
+    # Dashboard API (trailing "/" on proxy_pass strips the /dashboard-api/ prefix)
+    # ----------------------------------------------------------
+    location /dashboard-api/ {
+        limit_req zone=api burst=20 nodelay;
+        proxy_pass http://dashboard_api/;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header Connection "";
+        proxy_read_timeout 120s;
+    }
+
+    # ----------------------------------------------------------
+    # Grafana — NOT restricted as shipped: the allow/deny rules below are
+    # commented out, so /grafana/ is reachable by any client until they
+    # are enabled. No limit_req is applied here either.
+    # NOTE(review): the /grafana/ prefix is forwarded unchanged, so Grafana
+    # presumably has to be configured to serve from that sub-path — confirm.
+    # ----------------------------------------------------------
+    location /grafana/ {
+        # Restrict to admin IPs in production
+        # allow 10.0.0.0/8;
+        # deny all;
+
+        proxy_pass http://grafana;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header Connection "";
+    }
+
+    # ----------------------------------------------------------
+    # Health check (no rate limiting)
+    # Sends the same proxy headers as the other locations: without them the
+    # backend would receive "Host: combined_api" and "Connection: close",
+    # bypassing the upstream keepalive pool.
+    # ----------------------------------------------------------
+    location /health {
+        proxy_pass http://combined_api/health;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header Connection "";
+        access_log off;
+    }
+
+    # Block common attack vectors: any path containing a dot-file segment
+    location ~ /\. {
+        deny all;
+        access_log off;
+        log_not_found off;
+    }
+
+    # ...and any path ending in a sensitive extension (case-insensitive)
+    location ~* \.(env|git|bak|sql|log)$ {
+        deny all;
+        access_log off;
+        log_not_found off;
+    }
+}
--- a/infra/nginx/nginx.conf
+++ b/infra/nginx/nginx.conf
@@ -0,0 +1,65 @@
+# ============================================================
+# Nginx — Main config (process settings, JSON access log, gzip, shared rate-limit zones, vhost include)
+# ============================================================
+
+user nginx;
+worker_processes auto;  # let nginx pick the worker count from the available CPUs
+error_log /var/log/nginx/error.log warn;
+pid /var/run/nginx.pid;
+
+events {
+    worker_connections 1024;  # per-worker connection cap (client and upstream connections both count)
+    use epoll;  # Linux event notification method
+    multi_accept on;  # a worker accepts all pending connections at once
+}
+
+http {
+    include /etc/nginx/mime.types;
+    default_type application/octet-stream;
+
+    # Logging — one JSON object per request; escape=json escapes variable values so each line stays valid JSON
+    log_format json_combined escape=json
+        '{"time":"$time_iso8601",'
+        '"remote_addr":"$remote_addr",'
+        '"method":"$request_method",'
+        '"uri":"$request_uri",'
+        '"status":$status,'
+        '"body_bytes":$body_bytes_sent,'
+        '"duration":$request_time,'
+        '"referrer":"$http_referer",'
+        '"user_agent":"$http_user_agent",'
+        '"x_forwarded_for":"$http_x_forwarded_for"}';
+
+    access_log /var/log/nginx/access.log json_combined;
+
+    # Performance
+    sendfile on;
+    tcp_nopush on;
+    tcp_nodelay on;
+    keepalive_timeout 65;  # seconds an idle client connection is kept open
+    types_hash_max_size 2048;
+    server_tokens off;  # do not advertise the nginx version
+
+    # Gzip — compress listed text-like types of at least 1024 bytes, including proxied responses
+    gzip on;
+    gzip_vary on;
+    gzip_min_length 1024;
+    gzip_proxied any;
+    gzip_comp_level 5;
+    gzip_types
+        text/plain
+        text/css
+        text/javascript
+        application/javascript
+        application/json
+        application/xml
+        image/svg+xml;
+
+    # Rate limiting zones — keyed by client IP; consumed by limit_req / limit_conn in conf.d/turf.conf
+    limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;  # NOTE(review): 30 requests/minute per IP for /api/ and /dashboard-api/ (burst=20 there) — confirm this is not meant to be r/s
+    limit_req_zone $binary_remote_addr zone=global:20m rate=100r/m;  # used by the portal location (burst=50 there)
+    limit_conn_zone $binary_remote_addr zone=conn_limit:10m;  # backs `limit_conn conn_limit 20` on the HTTPS server
+
+    # Include virtual hosts (conf.d/turf.conf and any other *.conf placed there)
+    include /etc/nginx/conf.d/*.conf;
+}
--- a/infra/postgres/init.sql
+++ b/infra/postgres/init.sql
@@ -0,0 +1,12 @@
+-- ============================================================
+-- PostgreSQL init script for Turf SaaS
+-- Runs on first container start (docker-entrypoint-initdb.d); enables extensions and grants the app role its privileges
+-- ============================================================
+
+-- Create extensions (IF NOT EXISTS keeps the statements safe to re-run)
+CREATE EXTENSION IF NOT EXISTS "uuid-ossp";  -- UUID generation functions
+CREATE EXTENSION IF NOT EXISTS "pg_trgm";  -- trigram similarity matching/indexing for text
+
+-- Grant privileges to the app user (database turf_saas and role turf are hardcoded; they equal the POSTGRES_DB / POSTGRES_USER defaults in infra/scripts/backup_db.sh — NOTE(review): keep in sync if those are overridden)
+GRANT ALL PRIVILEGES ON DATABASE turf_saas TO turf;
+GRANT ALL ON SCHEMA public TO turf;  -- schema-level rights on public, granted separately from the database-level rights above
--- a/infra/prometheus/alerts.yml
+++ b/infra/prometheus/alerts.yml
@@ -0,0 +1,109 @@
+# ============================================================
+# Prometheus Alert Rules — Turf SaaS
+# ============================================================
+
+groups:
+  # ----------------------------------------------------------
+  # HTTP / API Alerts (5xx ratio, p95 latency, scrape-target availability)
+  # ----------------------------------------------------------
+  - name: http_alerts
+    rules:
+      - alert: HighErrorRate  # per-service share of 5xx responses over a 5-minute rate window, threshold 1%
+        expr: |
+          sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (service)
+          /
+          sum(rate(http_requests_total[5m])) by (service)
+          > 0.01
+        for: 2m  # the condition must hold for 2 minutes before the alert fires
+        labels:
+          severity: critical
+        annotations:  # $value here is a 0–1 ratio, which is the scale humanizePercentage expects
+          summary: "High 5xx error rate on {{ $labels.service }}"
+          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
+
+      - alert: HighLatency  # per-service p95 of http_request_duration_seconds above 2 seconds
+        expr: |
+          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))
+          > 2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:  # $value is in seconds
+          summary: "High p95 latency on {{ $labels.service }}"
+          description: "p95 latency is {{ $value | humanizeDuration }} (threshold: 2s)"
+
+      - alert: ServiceDown  # `up` has no job selector, so this fires for any scrape job, not only the HTTP services
+        expr: up == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Service {{ $labels.job }} is down"
+          description: "{{ $labels.instance }} has been unreachable for >1 minute"
+
+  # ----------------------------------------------------------
+  # Database Alerts (PostgreSQL availability and size, host disk usage)
+  # ----------------------------------------------------------
+  - name: database_alerts
+    rules:
+      # pg_up reported as 0 for a full minute.
+      # NOTE(review): presumably exposed by the postgres-exporter scrape job — confirm.
+      - alert: PostgresDown
+        expr: pg_up == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "PostgreSQL is down"
+          description: "Cannot connect to PostgreSQL database"
+
+      # Database size converted from bytes to GiB and compared with 10.
+      # Despite the name this measures the size of each database, not free disk space.
+      - alert: PostgresDiskUsageHigh
+        expr: |
+          (pg_database_size_bytes / (1024 * 1024 * 1024)) > 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "PostgreSQL database size > 10GB"
+          description: "Database {{ $labels.datname }} is {{ $value | humanize }}GB"
+
+      # Used share of each filesystem, already scaled to 0–100 by the
+      # expression. The description therefore formats $value with printf:
+      # humanizePercentage expects a 0–1 ratio and would render 85 as "8500%".
+      - alert: DiskSpaceHigh
+        expr: |
+          (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100
+          > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk usage > 80% on {{ $labels.instance }}"
+          description: '{{ $labels.mountpoint }} is at {{ printf "%.1f" $value }}%'
+
+  # ----------------------------------------------------------
+  # ML Prediction Alerts (accuracy, feature drift, prediction volume)
+  # ----------------------------------------------------------
+  - name: ml_alerts
+    rules:
+      - alert: MLAccuracyDegraded  # top-1 accuracy ratio below 0.30, between the overview dashboard's yellow (25%) and green (35%) steps
+        expr: ml_prediction_accuracy_ratio{accuracy_type="top1"} < 0.30
+        for: 60m
+        labels:
+          severity: warning
+        annotations:  # the metric is a 0–1 ratio, the scale humanizePercentage expects
+          summary: "ML top-1 accuracy below 30%"
+          description: "Current accuracy: {{ $value | humanizePercentage }}"
+
+      - alert: MLPredictionDriftHigh  # one alert per series whose drift score exceeds 0.5; the description reads a feature_group label
+        expr: ml_prediction_drift_score > 0.5
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: "ML feature drift detected"
+          description: "Drift score for {{ $labels.feature_group }}: {{ $value }}"
+
+      - alert: NoPredictionsGenerated  # evaluated per ml_predictions_total series; yields nothing if the metric is absent
+        expr: increase(ml_predictions_total[1h]) == 0
+        for: 2h  # NOTE(review): a 1h zero-increase window held for 2h means roughly 3h without predictions, while the summary says 2 hours — confirm the intended window
+        labels:
+          severity: warning
+        annotations:
+          summary: "No ML predictions generated in the last 2 hours"
+          description: "Check if the scheduler is running and PMU data is being scraped"
--- a/infra/prometheus/prometheus.yml
+++ b/infra/prometheus/prometheus.yml
@@ -0,0 +1,68 @@
+# ============================================================
+# Prometheus Configuration — Turf SaaS (global timing, alert rule loading, scrape targets)
+# ============================================================
+
+global:
+  scrape_interval: 15s  # default for jobs that do not set their own interval
+  evaluation_interval: 15s  # how often the rules in alerts.yml are evaluated
+  external_labels:
+    project: turf-saas
+    env: production  # NOTE(review): hardcoded; confirm this file is not reused for staging
+
+# Alertmanager — wire up when available (until then alerts are evaluated but have nowhere to be sent)
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: []  # intentionally empty placeholder
+
+# Load alert rules
+rule_files:
+  - "alerts.yml"  # relative path; assumes alerts.yml sits next to this file inside the container — TODO confirm the mount
+
+# ============================================================
+# Scrape targets (the three app ports match the nginx upstreams in infra/nginx/conf.d/turf.conf)
+# ============================================================
+scrape_configs:
+  # Prometheus self-monitoring
+  - job_name: prometheus
+    static_configs:
+      - targets: [localhost:9090]
+
+  # Combined API
+  - job_name: combined-api
+    static_configs:
+      - targets: [combined-api:8790]
+    metrics_path: /metrics
+    scrape_interval: 15s  # same as the global default, stated explicitly
+
+  # Dashboard API
+  - job_name: dashboard-api
+    static_configs:
+      - targets: [dashboard-api:8791]
+    metrics_path: /metrics
+    scrape_interval: 15s
+
+  # Portal
+  - job_name: portal
+    static_configs:
+      - targets: [portal:8792]
+    metrics_path: /metrics
+    scrape_interval: 30s  # scraped half as often as the two APIs
+
+  # PostgreSQL exporter (if deployed) — while it is not deployed the target reports up == 0, which matches the ServiceDown rule in alerts.yml
+  - job_name: postgres
+    static_configs:
+      - targets: [postgres-exporter:9187]
+    scrape_interval: 30s
+
+  # Redis exporter (if deployed) — same caveat as the PostgreSQL exporter
+  - job_name: redis
+    static_configs:
+      - targets: [redis-exporter:9121]
+    scrape_interval: 30s
+
+  # Node exporter (host metrics) — source of the node_filesystem_* series used by the DiskSpaceHigh rule
+  - job_name: node
+    static_configs:
+      - targets: [host.docker.internal:9100]  # NOTE(review): on Linux hosts this name presumably needs an explicit host mapping for the Prometheus container — confirm in docker-compose.yml
+    scrape_interval: 30s
--- a/infra/scripts/backup_db.sh
+++ b/infra/scripts/backup_db.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# ============================================================
+# Automated PostgreSQL Backup Script
+# Run daily via cron: 0 2 * * * /opt/turf-saas/infra/scripts/backup_db.sh
+# ============================================================
+
+set -euo pipefail
+
+BACKUP_DIR="${BACKUP_DIR:-/opt/backups/turf-saas}"
+KEEP_DAYS="${KEEP_DAYS:-30}"
+DB_NAME="${POSTGRES_DB:-turf_saas}"
+DB_USER="${POSTGRES_USER:-turf}"
+DB_HOST="${POSTGRES_HOST:-postgres}"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+BACKUP_FILE="${BACKUP_DIR}/turf_saas_${TIMESTAMP}.sql.gz"
+
+echo "[$(date -Iseconds)] Starting backup: ${BACKUP_FILE}"
+
+# Ensure backup directory exists
+mkdir -p "${BACKUP_DIR}"
+
+# Perform backup
+PGPASSWORD="${POSTGRES_PASSWORD}" pg_dump \
+    -h "${DB_HOST}" \
+    -U "${DB_USER}" \
+    -d "${DB_NAME}" \
+    --no-owner \
+    --no-acl \
+    | gzip > "${BACKUP_FILE}"
+
+SIZE=$(du -sh "${BACKUP_FILE}" | cut -f1)
+echo "[$(date -Iseconds)] Backup complete: ${BACKUP_FILE} (${SIZE})"
+
+# Remove backups older than KEEP_DAYS
+find "${BACKUP_DIR}" -name "turf_saas_*.sql.gz" -mtime "+${KEEP_DAYS}" -delete
+echo "[$(date -Iseconds)] Old backups cleaned (kept last ${KEEP_DAYS} days)"
+
+# Optional: notify on completion
+if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${TELEGRAM_CHAT_ID:-}" ]; then
+    curl -s -X POST \
+        "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+        -d chat_id="${TELEGRAM_CHAT_ID}" \
+        -d text="✅ DB Backup OK: turf_saas ${TIMESTAMP} (${SIZE})" \
+        > /dev/null || true
+fi