feat(devops): CI/CD + Docker + Monitoring infrastructure

- Multi-stage Dockerfile (builder+runner, <500MB target) - docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx - .env.example with all required secrets (never hardcoded) - requirements.txt with all dependencies including prometheus-client, alembic - GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push - GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback - Alembic migration setup + initial PostgreSQL schema (001_initial_schema) - SQLite→PostgreSQL data migration script - Prometheus metrics module (HTTP, ML, DB, business metrics) - Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy) - Grafana dashboard (overview: req/s, p95, ML accuracy, error rate) - Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers) - Structured JSON logging module - Automated daily DB backup script (pg_dump + 30-day retention) Branch: feature/devops-cicd Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-25 17:32:02 +02:00
parent ed07c8a3d1
commit dce1e9b744
25 changed files with 2659 additions and 0 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,250 @@
+version: "3.9"
+
+# ============================================================
+# H3R7Tech Turf SaaS — Docker Compose
+# Services: app (x4) + postgres + redis + prometheus + grafana + nginx + certbot
+# ============================================================
+
+# Settings shared by the four application containers (combined-api,
+# dashboard-api, portal, scheduler); each pulls them in with
+# `<<: *app-common`. The merge is shallow: a service that declares one of
+# these keys itself replaces the whole value given here.
+x-app-common: &app-common
+  build:
+    context: .
+    dockerfile: Dockerfile
+    # Build only up to the `runner` stage of the Dockerfile.
+    target: runner
+  restart: unless-stopped
+  env_file:
+    - .env
+  # Hold the app back until its backing stores report healthy. Redis is
+  # declared in this file as the caching & session store and has its own
+  # healthcheck, so gate on it as well as PostgreSQL rather than letting
+  # the app race it on a cold start.
+  depends_on:
+    postgres:
+      condition: service_healthy
+    redis:
+      condition: service_healthy
+  networks:
+    - turf-net
+  volumes:
+    - ml-models:/app/data/models
+    - app-logs:/app/logs
+
+services:
+  # ----------------------------------------------------------
+  # PostgreSQL — primary database
+  # ----------------------------------------------------------
+  postgres:
+    image: postgres:16-alpine
+    restart: unless-stopped
+    environment:
+      POSTGRES_DB: ${POSTGRES_DB:-turf_saas}
+      POSTGRES_USER: ${POSTGRES_USER:-turf}
+      # Deliberately no default: `:?` makes `docker compose` abort with this
+      # message when the variable is unset or empty, instead of handing the
+      # container a blank password.
+      POSTGRES_PASSWORD: "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}"
+    volumes:
+      - postgres-data:/var/lib/postgresql/data
+      # Mounted read-only into the image's init-script directory.
+      - ./infra/postgres/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
+    healthcheck:
+      # Compose substitutes ${...} when it parses this file, so the probe
+      # uses the same user/database defaults as `environment` above.
+      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-turf} -d ${POSTGRES_DB:-turf_saas}"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s
+    networks:
+      - turf-net
+    ports:
+      # Published on the host loopback interface only.
+      - "127.0.0.1:5432:5432"
+
+  # ----------------------------------------------------------
+  # Redis — caching & session store
+  # ----------------------------------------------------------
+  redis:
+    image: redis:7-alpine
+    restart: unless-stopped
+    # Exec-form argument list: the password reaches redis-server as a single
+    # argument even if it contains spaces. `:?` makes `docker compose` abort
+    # with this message when REDIS_PASSWORD is unset or empty, rather than
+    # leaving `--requirepass` without a value.
+    command:
+      - "redis-server"
+      - "--appendonly"
+      - "yes"
+      - "--requirepass"
+      - "${REDIS_PASSWORD:?REDIS_PASSWORD must be set}"
+    volumes:
+      - redis-data:/data
+    healthcheck:
+      # Authenticated PING, using the same password the server was given.
+      test: ["CMD", "redis-cli", "--pass", "${REDIS_PASSWORD}", "ping"]
+      interval: 10s
+      timeout: 5s
+      retries: 3
+    networks:
+      - turf-net
+    ports:
+      # Published on the host loopback interface only.
+      - "127.0.0.1:6379:6379"
+
+  # ----------------------------------------------------------
+  # Combined API — main predictions + ideas API (port 8790)
+  # ----------------------------------------------------------
+  combined-api:
+    <<: *app-common
+    container_name: turf-combined-api
+    environment:
+      SERVICE_NAME: combined-api
+      PORT: "8790"
+    # Gunicorn serving `combined_api:app` on port 8790 with 2 workers and a
+    # 120 s worker timeout; `-` is given as the access and error log file.
+    # The folded scalar below is joined into one space-separated command.
+    command: >-
+      gunicorn
+      --bind 0.0.0.0:8790
+      --workers 2
+      --timeout 120
+      --access-logfile -
+      --error-logfile -
+      combined_api:app
+    ports:
+      # Published on the host loopback interface only.
+      - "127.0.0.1:8790:8790"
+    healthcheck:
+      # `curl -f` exits non-zero on an HTTP error status from /health.
+      test:
+        - "CMD"
+        - "curl"
+        - "-f"
+        - "http://localhost:8790/health"
+      start_period: 60s
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  # ----------------------------------------------------------
+  # Dashboard API — analytics & ML scoring (port 8791)
+  # ----------------------------------------------------------
+  dashboard-api:
+    <<: *app-common
+    container_name: turf-dashboard-api
+    environment:
+      SERVICE_NAME: dashboard-api
+      PORT: "8791"
+    # Gunicorn serving `dashboard_api:app` on port 8791 with 2 workers and a
+    # 120 s worker timeout; `-` is given as the access and error log file.
+    # The folded scalar below is joined into one space-separated command.
+    command: >-
+      gunicorn
+      --bind 0.0.0.0:8791
+      --workers 2
+      --timeout 120
+      --access-logfile -
+      --error-logfile -
+      dashboard_api:app
+    ports:
+      # Published on the host loopback interface only.
+      - "127.0.0.1:8791:8791"
+    healthcheck:
+      # `curl -f` exits non-zero on an HTTP error status from /health.
+      test:
+        - "CMD"
+        - "curl"
+        - "-f"
+        - "http://localhost:8791/health"
+      start_period: 60s
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  # ----------------------------------------------------------
+  # Portal Server — frontend portal (port 8792)
+  # ----------------------------------------------------------
+  portal:
+    <<: *app-common
+    container_name: turf-portal
+    environment:
+      SERVICE_NAME: portal
+      PORT: "8792"
+    # Gunicorn serving `portal_server:app` on port 8792 with 2 workers and a
+    # 60 s worker timeout; `-` is given as the access and error log file.
+    # The folded scalar below is joined into one space-separated command.
+    command: >-
+      gunicorn
+      --bind 0.0.0.0:8792
+      --workers 2
+      --timeout 60
+      --access-logfile -
+      --error-logfile -
+      portal_server:app
+    ports:
+      # Published on the host loopback interface only.
+      - "127.0.0.1:8792:8792"
+    healthcheck:
+      # Probes the site root; this service's check does not use /health.
+      test:
+        - "CMD"
+        - "curl"
+        - "-f"
+        - "http://localhost:8792/"
+      start_period: 30s
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  # ----------------------------------------------------------
+  # Scheduler — background jobs (no external port)
+  # ----------------------------------------------------------
+  scheduler:
+    <<: *app-common
+    container_name: turf-scheduler
+    # Runs the scheduler script directly with Python. No ports are published
+    # and no healthcheck is defined for this service.
+    environment:
+      SERVICE_NAME: scheduler
+    command: python turf_scheduler.py
+
+  # ----------------------------------------------------------
+  # Prometheus — metrics scraping
+  # ----------------------------------------------------------
+  prometheus:
+    image: prom/prometheus:v2.53.4
+    restart: unless-stopped
+    networks:
+      - turf-net
+    ports:
+      # Published on the host loopback interface only.
+      - "127.0.0.1:9090:9090"
+    volumes:
+      # Scrape configuration and alert rules are bind-mounted read-only;
+      # the time-series database lives in a named volume.
+      - ./infra/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./infra/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
+      - prometheus-data:/prometheus
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--storage.tsdb.path=/prometheus"
+      # Keep 30 days of samples.
+      - "--storage.tsdb.retention.time=30d"
+      - "--web.enable-lifecycle"
+    healthcheck:
+      # Single-attempt request against Prometheus' own health endpoint.
+      test:
+        - "CMD"
+        - "wget"
+        - "-q"
+        - "--tries=1"
+        - "--spider"
+        - "http://localhost:9090/-/healthy"
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  # ----------------------------------------------------------
+  # Grafana — dashboards
+  # ----------------------------------------------------------
+  grafana:
+    image: grafana/grafana:11.5.2
+    restart: unless-stopped
+    environment:
+      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
+      # Deliberately no default: `:?` makes `docker compose` abort with this
+      # message when the variable is unset or empty, so Grafana is never
+      # started without an explicitly chosen admin password.
+      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
+      GF_USERS_ALLOW_SIGN_UP: "false"
+      GF_SERVER_DOMAIN: ${DOMAIN:-localhost}
+      # Grafana is addressed under the /grafana/ sub-path of the site, so
+      # the root URL and the sub-path flag must stay in agreement.
+      GF_SERVER_ROOT_URL: https://${DOMAIN:-localhost}/grafana/
+      GF_SERVER_SERVE_FROM_SUB_PATH: "true"
+    volumes:
+      - grafana-data:/var/lib/grafana
+      # Provisioning files and dashboards are bind-mounted read-only.
+      - ./infra/grafana/provisioning:/etc/grafana/provisioning:ro
+      - ./infra/grafana/dashboards:/var/lib/grafana/dashboards:ro
+    ports:
+      # Published on the host loopback interface only.
+      - "127.0.0.1:3000:3000"
+    networks:
+      - turf-net
+    # Short-form dependency: controls start order only, it does not wait
+    # for the Prometheus healthcheck to pass.
+    depends_on:
+      - prometheus
+
+  # ----------------------------------------------------------
+  # Nginx — reverse proxy + TLS termination
+  # ----------------------------------------------------------
+  nginx:
+    image: nginx:1.27-alpine
+    restart: unless-stopped
+    networks:
+      - turf-net
+    # Short-form dependencies: these only order container start-up; nginx
+    # does not wait for the upstream healthchecks to pass.
+    depends_on:
+      - combined-api
+      - dashboard-api
+      - portal
+    # The only ports in this file published on all host interfaces.
+    ports:
+      - "80:80"
+      - "443:443"
+    volumes:
+      - ./infra/nginx/nginx.conf:/etc/nginx/nginx.conf:ro
+      - ./infra/nginx/conf.d:/etc/nginx/conf.d:ro
+      # Shared with the certbot service, which mounts both read-write;
+      # nginx only needs to read the challenge files and certificates.
+      - certbot-www:/var/www/certbot:ro
+      - certbot-certs:/etc/letsencrypt:ro
+    healthcheck:
+      # `nginx -t` validates the configuration files; it does not send a
+      # request to the running server.
+      test:
+        - "CMD"
+        - "nginx"
+        - "-t"
+      interval: 60s
+      timeout: 10s
+      retries: 3
+
+  # ----------------------------------------------------------
+  # Certbot — Let's Encrypt TLS certificate issuance (one-shot certonly run)
+  # ----------------------------------------------------------
+  certbot:
+    # Pinned like every other image in this file so a fresh pull cannot
+    # silently move to a new release; bump the tag deliberately.
+    image: certbot/certbot:v2.11.0
+    # One-shot job: the container runs `certonly` once and is not restarted.
+    restart: "no"
+    volumes:
+      # Mounted read-write here; nginx mounts the same volumes read-only.
+      - certbot-www:/var/www/certbot
+      - certbot-certs:/etc/letsencrypt
+    # Folded into a single command line. `--non-interactive` makes certbot
+    # fail instead of prompting, and `--keep-until-expiring` keeps an
+    # existing certificate that is not yet due for renewal, so repeated
+    # `docker compose up` runs do not request a new one every time.
+    command: >-
+      certonly --webroot --webroot-path=/var/www/certbot
+      --non-interactive --keep-until-expiring
+      --email ${ADMIN_EMAIL} --agree-tos --no-eff-email
+      -d ${DOMAIN}
+    networks:
+      - turf-net
+
+# ============================================================
+# Named volumes — persistent storage
+# ============================================================
+volumes:
+  # Mounted at /app/logs in the application containers.
+  app-logs:
+    driver: local
+  # Let's Encrypt state: written by certbot, read by nginx.
+  certbot-certs:
+    driver: local
+  # Webroot directory passed to certbot; read by nginx.
+  certbot-www:
+    driver: local
+  # Grafana's /var/lib/grafana.
+  grafana-data:
+    driver: local
+  # Mounted at /app/data/models in the application containers.
+  ml-models:
+    driver: local
+  # PostgreSQL data directory.
+  postgres-data:
+    driver: local
+  # Prometheus TSDB storage path.
+  prometheus-data:
+    driver: local
+  # Redis /data directory (the server runs with append-only persistence on).
+  redis-data:
+    driver: local
+
+# ============================================================
+# Network
+# ============================================================
+networks:
+  # Single bridge network that every service in this file attaches to.
+  turf-net:
+    driver: bridge