feat(devops): CI/CD + Docker + Monitoring infrastructure
- Multi-stage Dockerfile (builder+runner, <500MB target)
- docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx
- .env.example with all required secrets (never hardcoded)
- requirements.txt with all dependencies including prometheus-client, alembic
- GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push
- GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback
- Alembic migration setup + initial PostgreSQL schema (001_initial_schema)
- SQLite→PostgreSQL data migration script
- Prometheus metrics module (HTTP, ML, DB, business metrics)
- Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy)
- Grafana dashboard (overview: req/s, p95, ML accuracy, error rate)
- Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers)
- Structured JSON logging module
- Automated daily DB backup script (pg_dump + 30-day retention)

Branch: feature/devops-cicd
Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
250
docker-compose.yml
Normal file
250
docker-compose.yml
Normal file
@@ -0,0 +1,250 @@
|
||||
# ============================================================
# H3R7Tech Turf SaaS — Docker Compose
# Services: app (x4) + postgres + redis + prometheus + grafana + nginx
#           + certbot
#
# The top-level `version` key is intentionally omitted: the Compose
# Specification treats it as obsolete, and Docker Compose v2 prints a
# warning on every invocation when it is present.
# ============================================================
|
||||
|
||||
# Shared settings for the application containers. Services pull this in
# with the `<<: *app-common` merge key. The merge is shallow: a key that a
# service declares itself replaces the whole value defined here.
x-app-common: &app-common
  restart: unless-stopped
  build:
    dockerfile: Dockerfile
    context: .
    # Stop the multi-stage build at the `runner` stage.
    target: runner
  env_file:
    - .env
  networks:
    - turf-net
  # Application containers wait until the postgres healthcheck passes.
  depends_on:
    postgres:
      condition: service_healthy
  volumes:
    - ml-models:/app/data/models
    - app-logs:/app/logs
|
||||
|
||||
services:
|
||||
# ----------------------------------------------------------
# PostgreSQL — primary database
# ----------------------------------------------------------
  postgres:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${POSTGRES_DB:-turf_saas}
      POSTGRES_USER: ${POSTGRES_USER:-turf}
      # `:?` makes the variable mandatory: Compose aborts with this message
      # instead of silently handing the container an empty password.
      POSTGRES_PASSWORD: "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set in .env}"
    volumes:
      - postgres-data:/var/lib/postgresql/data
      # Mounted read-only into the image's init-script directory.
      - ./infra/postgres/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
    healthcheck:
      # User and database are interpolated by Compose with the same
      # defaults as the `environment` section above.
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-turf} -d ${POSTGRES_DB:-turf_saas}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    networks:
      - turf-net
    # Published on the loopback interface only.
    ports:
      - "127.0.0.1:5432:5432"
|
||||
|
||||
# ----------------------------------------------------------
# Redis — caching & session store
# ----------------------------------------------------------
  redis:
    image: redis:7-alpine
    restart: unless-stopped
    # List form keeps the password a single argument even when it contains
    # spaces or shell metacharacters. `:?` aborts Compose when the secret
    # is missing rather than starting redis-server with a dangling
    # `--requirepass` flag.
    command:
      - "redis-server"
      - "--appendonly"
      - "yes"
      - "--requirepass"
      - "${REDIS_PASSWORD:?REDIS_PASSWORD must be set in .env}"
    environment:
      # redis-cli reads its password from REDISCLI_AUTH, so the healthcheck
      # does not have to pass the secret on its command line.
      REDISCLI_AUTH: "${REDIS_PASSWORD:?REDIS_PASSWORD must be set in .env}"
    volumes:
      - redis-data:/data
    healthcheck:
      # redis-cli exits 0 even when the server answers with an error reply
      # (for example LOADING or NOAUTH), so require an actual PONG.
      test: ["CMD-SHELL", "redis-cli ping | grep -q PONG"]
      interval: 10s
      timeout: 5s
      retries: 3
    networks:
      - turf-net
    # Published on the loopback interface only.
    ports:
      - "127.0.0.1:6379:6379"
|
||||
|
||||
# ----------------------------------------------------------
# Combined API — main predictions + ideas API (port 8790)
# ----------------------------------------------------------
  combined-api:
    <<: *app-common
    container_name: turf-combined-api
    environment:
      SERVICE_NAME: combined-api
      PORT: "8790"
    # Published on the loopback interface only.
    ports:
      - "127.0.0.1:8790:8790"
    # Folded scalar: the lines below are joined with single spaces into one
    # command string. Access and error logs are sent to `-`.
    command: >-
      gunicorn
      --bind 0.0.0.0:8790
      --workers 2
      --timeout 120
      --access-logfile -
      --error-logfile -
      combined_api:app
    healthcheck:
      # NOTE(review): assumes curl is present in the runner image — confirm.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8790/health"
      start_period: 60s
      interval: 30s
      timeout: 10s
      retries: 3
|
||||
|
||||
# ----------------------------------------------------------
# Dashboard API — analytics & ML scoring (port 8791)
# ----------------------------------------------------------
  dashboard-api:
    <<: *app-common
    container_name: turf-dashboard-api
    environment:
      SERVICE_NAME: dashboard-api
      PORT: "8791"
    # Published on the loopback interface only.
    ports:
      - "127.0.0.1:8791:8791"
    # Folded scalar: the lines below are joined with single spaces into one
    # command string. Access and error logs are sent to `-`.
    command: >-
      gunicorn
      --bind 0.0.0.0:8791
      --workers 2
      --timeout 120
      --access-logfile -
      --error-logfile -
      dashboard_api:app
    healthcheck:
      # NOTE(review): assumes curl is present in the runner image — confirm.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8791/health"
      start_period: 60s
      interval: 30s
      timeout: 10s
      retries: 3
|
||||
|
||||
# ----------------------------------------------------------
# Portal Server — frontend portal (port 8792)
# ----------------------------------------------------------
  portal:
    <<: *app-common
    container_name: turf-portal
    environment:
      SERVICE_NAME: portal
      PORT: "8792"
    # Published on the loopback interface only.
    ports:
      - "127.0.0.1:8792:8792"
    # Folded scalar: the lines below are joined with single spaces into one
    # command string. Note the 60 s worker timeout here versus 120 s on the
    # two API services.
    command: >-
      gunicorn
      --bind 0.0.0.0:8792
      --workers 2
      --timeout 60
      --access-logfile -
      --error-logfile -
      portal_server:app
    healthcheck:
      # Probes the site root; this service's check does not use a /health
      # path. NOTE(review): assumes curl is present in the runner image.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8792/"
      start_period: 30s
      interval: 30s
      timeout: 10s
      retries: 3
|
||||
|
||||
# ----------------------------------------------------------
# Scheduler — background jobs (no external port)
# ----------------------------------------------------------
  scheduler:
    <<: *app-common
    container_name: turf-scheduler
    environment:
      SERVICE_NAME: scheduler
    # No `ports` and no `healthcheck` are declared for this service.
    command:
      - "python"
      - "turf_scheduler.py"
|
||||
|
||||
# ----------------------------------------------------------
# Prometheus — metrics scraping
# ----------------------------------------------------------
  prometheus:
    image: prom/prometheus:v2.53.4
    restart: unless-stopped
    networks:
      - turf-net
    # Published on the loopback interface only.
    ports:
      - "127.0.0.1:9090:9090"
    volumes:
      # Scrape configuration and alert rules are mounted read-only; the
      # time-series database lives on a named volume.
      - ./infra/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./infra/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus-data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      # Keep 30 days of samples.
      - "--storage.tsdb.retention.time=30d"
      # NOTE(review): this flag turns on the HTTP lifecycle endpoints;
      # confirm nothing proxies port 9090 to untrusted clients.
      - "--web.enable-lifecycle"
    healthcheck:
      test:
        - "CMD"
        - "wget"
        - "-q"
        - "--tries=1"
        - "--spider"
        - "http://localhost:9090/-/healthy"
      interval: 30s
      timeout: 10s
      retries: 3
|
||||
|
||||
# ----------------------------------------------------------
# Grafana — dashboards
# ----------------------------------------------------------
  grafana:
    image: grafana/grafana:11.5.2
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
      # `:?` makes the secret mandatory: Compose aborts with this message
      # instead of passing an empty admin password to the container.
      # NOTE(review): an empty value is believed to leave Grafana on its
      # built-in default admin password — confirm against Grafana docs.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set in .env}"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_DOMAIN: ${DOMAIN:-localhost}
      # Grafana is configured to be served under the /grafana/ sub-path.
      GF_SERVER_ROOT_URL: https://${DOMAIN:-localhost}/grafana/
      GF_SERVER_SERVE_FROM_SUB_PATH: "true"
    volumes:
      - grafana-data:/var/lib/grafana
      # Provisioning files and dashboards are mounted read-only.
      - ./infra/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./infra/grafana/dashboards:/var/lib/grafana/dashboards:ro
    # Published on the loopback interface only.
    ports:
      - "127.0.0.1:3000:3000"
    networks:
      - turf-net
    depends_on:
      - prometheus
|
||||
|
||||
# ----------------------------------------------------------
# Nginx — reverse proxy + TLS termination
# ----------------------------------------------------------
  nginx:
    image: nginx:1.27-alpine
    restart: unless-stopped
    networks:
      - turf-net
    # The only service published on all host interfaces.
    ports:
      - "80:80"
      - "443:443"
    # Short-form depends_on: no health condition is attached to these
    # dependencies.
    depends_on:
      - combined-api
      - dashboard-api
      - portal
    volumes:
      - ./infra/nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./infra/nginx/conf.d:/etc/nginx/conf.d:ro
      # Both certbot volumes are mounted read-only here; the certbot
      # service mounts the same volumes writable.
      - certbot-www:/var/www/certbot:ro
      - certbot-certs:/etc/letsencrypt:ro
    healthcheck:
      # NOTE(review): `nginx -t` validates the configuration files; it does
      # not probe the listening server. Confirm this is the intended check.
      test:
        - "CMD"
        - "nginx"
        - "-t"
      interval: 60s
      timeout: 10s
      retries: 3
|
||||
|
||||
# ----------------------------------------------------------
# Certbot — Let's Encrypt TLS certificate renewal
# ----------------------------------------------------------
  certbot:
    # Pinned to an explicit release, like every other image in this file,
    # so that rebuilding the stack is reproducible. Bump deliberately.
    image: certbot/certbot:v2.11.0
    # One-shot job: never restarted automatically.
    restart: "no"
    volumes:
      # Same volumes nginx mounts read-only; writable here.
      - certbot-www:/var/www/certbot
      - certbot-certs:/etc/letsencrypt
    # ADMIN_EMAIL and DOMAIN have no defaults and must be supplied by the
    # environment or .env. They are intentionally not marked mandatory
    # with `:?`, because that would abort every Compose command (including
    # local runs that rely on the DOMAIN default used by grafana).
    command: >-
      certonly
      --webroot
      --webroot-path=/var/www/certbot
      --email ${ADMIN_EMAIL}
      --agree-tos
      --no-eff-email
      -d ${DOMAIN}
    networks:
      - turf-net
|
||||
|
||||
# ============================================================
# Named volumes — persistent storage
# ============================================================
volumes:
  # PostgreSQL data directory (/var/lib/postgresql/data).
  postgres-data:
    driver: local
  # Redis /data directory.
  redis-data:
    driver: local
  # Mounted at /app/data/models in the application containers.
  ml-models:
    driver: local
  # Mounted at /app/logs in the application containers.
  app-logs:
    driver: local
  # Prometheus TSDB (/prometheus).
  prometheus-data:
    driver: local
  # Grafana state (/var/lib/grafana).
  grafana-data:
    driver: local
  # Webroot shared by certbot (read-write) and nginx (read-only).
  certbot-www:
    driver: local
  # /etc/letsencrypt, shared by certbot (read-write) and nginx (read-only).
  certbot-certs:
    driver: local
|
||||
|
||||
# ============================================================
# Network
# ============================================================
networks:
  # Single bridge network that every service in this file joins.
  turf-net:
    driver: bridge
|
||||
Reference in New Issue
Block a user