feat(devops): CI/CD + Docker + Monitoring infrastructure

- Multi-stage Dockerfile (builder+runner, <500MB target)
- docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx
- .env.example with all required secrets (never hardcoded)
- requirements.txt with all dependencies including prometheus-client, alembic
- GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push
- GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback
- Alembic migration setup + initial PostgreSQL schema (001_initial_schema)
- SQLite→PostgreSQL data migration script
- Prometheus metrics module (HTTP, ML, DB, business metrics)
- Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy)
- Grafana dashboard (overview: req/s, p95, ML accuracy, error rate)
- Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers)
- Structured JSON logging module
- Automated daily DB backup script (pg_dump + 30-day retention)

Branch: feature/devops-cicd

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
DevOps Engineer
2026-04-25 17:32:02 +02:00
parent ed07c8a3d1
commit dce1e9b744
25 changed files with 2659 additions and 0 deletions

View File

@@ -0,0 +1,174 @@
{
"title": "Turf SaaS — Overview",
"uid": "turf-saas-overview",
"schemaVersion": 38,
"version": 1,
"refresh": "30s",
"time": { "from": "now-6h", "to": "now" },
"tags": ["turf-saas"],
"panels": [
{
"id": 1,
"type": "stat",
"title": "Request Rate (req/s)",
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"targets": [
{
"datasource": "Prometheus",
"expr": "sum(rate(http_requests_total[5m]))",
"legendFormat": "req/s"
}
],
"options": { "colorMode": "background", "graphMode": "area" }
},
{
  "id": 2,
  "type": "stat",
  "title": "Error Rate (5xx)",
  "description": "Share of all HTTP requests answered with a 5xx status over the last 5 minutes. Reads 0 when no 5xx response has been recorded yet.",
  "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
  "targets": [
    {
      "datasource": "Prometheus",
      "expr": "(sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) or vector(0)) / sum(rate(http_requests_total[5m])) * 100",
      "legendFormat": "error %"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "percent",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 0.5 },
          { "color": "red", "value": 1 }
        ]
      }
    }
  }
},
{
"id": 3,
"type": "stat",
"title": "p95 Latency",
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"targets": [
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p95"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 2 }
]
}
}
}
},
{
"id": 4,
"type": "stat",
"title": "ML Top-1 Accuracy",
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"targets": [
{
"datasource": "Prometheus",
"expr": "ml_prediction_accuracy_ratio{accuracy_type=\"top1\"} * 100",
"legendFormat": "top-1 %"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 25 },
{ "color": "green", "value": 35 }
]
}
}
}
},
{
"id": 5,
"type": "timeseries",
"title": "HTTP Requests by Service",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"targets": [
{
"datasource": "Prometheus",
"expr": "sum(rate(http_requests_total[5m])) by (service)",
"legendFormat": "{{ service }}"
}
],
"fieldConfig": {
"defaults": { "unit": "reqps" }
}
},
{
"id": 6,
"type": "timeseries",
"title": "Request Duration p50/p95/p99",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"targets": [
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p50"
},
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p95"
},
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p99"
}
],
"fieldConfig": {
"defaults": { "unit": "s" }
}
},
{
"id": 7,
"type": "timeseries",
"title": "ML Predictions per Hour",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"targets": [
{
"datasource": "Prometheus",
"expr": "sum(increase(ml_predictions_total[1h])) by (model_type)",
"legendFormat": "{{ model_type }}"
}
]
},
{
"id": 8,
"type": "timeseries",
"title": "DB Query Duration",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"targets": [
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le, operation))",
"legendFormat": "{{ operation }} p95"
}
],
"fieldConfig": {
"defaults": { "unit": "s" }
}
}
]
}

View File

@@ -0,0 +1,11 @@
# Grafana dashboard provisioning: load dashboard JSON files from disk.
apiVersion: 1

providers:
  - name: turf-saas-dashboards
    type: file
    # false: a dashboard whose file disappears from disk is removed from Grafana too.
    disableDeletion: false
    # Re-scan the dashboard directory for changes every 30 seconds.
    updateIntervalSeconds: 30
    # Provisioned dashboards may be edited from the Grafana UI.
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
      # Sub-directories under `path` become Grafana folders of the same name.
      foldersFromFilesStructure: true

View File

@@ -0,0 +1,13 @@
# Grafana datasource provisioning: the Prometheus instance queried by the dashboards.
apiVersion: 1

datasources:
  - name: Prometheus
    uid: prometheus-main
    type: prometheus
    url: http://prometheus:9090
    # proxy: queries are sent by the Grafana backend, not by the user's browser.
    access: proxy
    isDefault: true
    # Datasource cannot be modified from the Grafana UI.
    editable: false
    jsonData:
      httpMethod: POST
      # Lower bound for the query step Grafana derives automatically.
      timeInterval: "15s"

View File

@@ -0,0 +1,157 @@
# ============================================================
# Nginx Virtual Host — Turf SaaS
# ============================================================
# Upstream service pools.
# "keepalive N" is the number of idle connections to the upstream that each
# worker process keeps open for reuse.
upstream combined_api {
    server    combined-api:8790;
    keepalive 32;
}

upstream dashboard_api {
    server    dashboard-api:8791;
    keepalive 16;
}

upstream portal {
    server    portal:8792;
    keepalive 16;
}

upstream grafana {
    server    grafana:3000;
    keepalive 4;
}
# ----------------------------------------------------------
# Plain HTTP listener: redirects everything to HTTPS
# ----------------------------------------------------------
server {
    listen      80;
    server_name _;

    # Let's Encrypt ACME HTTP-01 challenge files are served directly from disk
    # so certificate issuance/renewal works over port 80.
    location /.well-known/acme-challenge/ {
        root /var/www/certbot;
    }

    # Every other request gets a permanent redirect to the same host and URI over HTTPS.
    location / {
        return 301 https://$host$request_uri;
    }
}
# ----------------------------------------------------------
# HTTPS main server
# ----------------------------------------------------------
server {
    listen 443 ssl;
    http2 on;
    # ${DOMAIN} is not an nginx variable — presumably substituted when this file
    # is rendered from a template (e.g. envsubst); confirm in the image setup.
    server_name ${DOMAIN};

    # TLS configuration
    ssl_certificate           /etc/letsencrypt/live/${DOMAIN}/fullchain.pem;
    ssl_certificate_key       /etc/letsencrypt/live/${DOMAIN}/privkey.pem;
    ssl_session_cache         shared:SSL:10m;
    ssl_session_timeout       10m;
    ssl_protocols             TLSv1.2 TLSv1.3;
    ssl_ciphers               ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384;
    ssl_prefer_server_ciphers on;
    # NOTE(review): the nginx docs call for a `resolver` directive so the OCSP
    # responder hostname can be resolved, and none is configured in this file.
    # Also verify that the issuing CA still publishes an OCSP URL; without one
    # nginx ignores stapling and logs a warning.
    ssl_stapling              on;
    ssl_stapling_verify       on;

    # Security headers ("always" = also attached to error responses).
    # These are inherited by every location below because none of them
    # declares its own add_header.
    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
    add_header X-Frame-Options DENY always;
    add_header X-Content-Type-Options nosniff always;
    add_header X-XSS-Protection "1; mode=block" always;
    add_header Referrer-Policy strict-origin-when-cross-origin always;
    add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data:;" always;

    # Limits
    client_max_body_size 10M;
    limit_conn conn_limit 20;

    # ----------------------------------------------------------
    # Proxy defaults shared by all locations
    # ----------------------------------------------------------
    # HTTP/1.1 plus an empty Connection header are both required for the
    # upstream `keepalive` pools to be used. proxy_set_header directives are
    # inherited only by locations that declare NO proxy_set_header of their
    # own, so a location that overrides one header must repeat all of them.
    proxy_http_version 1.1;
    proxy_set_header Host              $host;
    proxy_set_header X-Real-IP         $remote_addr;
    proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
    proxy_set_header Connection        "";

    # ----------------------------------------------------------
    # Portal (root)
    # ----------------------------------------------------------
    location / {
        limit_req zone=global burst=50 nodelay;
        proxy_pass http://portal;
        proxy_read_timeout 60s;
    }

    # ----------------------------------------------------------
    # Combined API
    # ----------------------------------------------------------
    # proxy_pass has no URI part, so the request path (including the /api/
    # prefix) is forwarded unchanged.
    location /api/ {
        limit_req zone=api burst=20 nodelay;
        proxy_pass http://combined_api;
        proxy_read_timeout 120s;
    }

    # ----------------------------------------------------------
    # Dashboard API
    # ----------------------------------------------------------
    # The trailing "/" on proxy_pass replaces the matched /dashboard-api/
    # prefix with "/", i.e. the upstream sees the path without the prefix.
    location /dashboard-api/ {
        limit_req zone=api burst=20 nodelay;
        proxy_pass http://dashboard_api/;
        proxy_read_timeout 120s;
    }

    # ----------------------------------------------------------
    # Grafana (restricted to internal/admin)
    # ----------------------------------------------------------
    # The path is forwarded with the /grafana/ prefix intact.
    # NOTE(review): this presumably relies on Grafana being configured to
    # serve from a sub-path — confirm in the Grafana container settings.
    location /grafana/ {
        # Only loopback and private (RFC 1918) client addresses may reach
        # Grafana; extend this list with specific admin IPs as needed.
        allow 127.0.0.1;
        allow 10.0.0.0/8;
        allow 172.16.0.0/12;
        allow 192.168.0.0/16;
        deny  all;

        proxy_pass http://grafana;

        # Grafana Live uses WebSockets, which need the Upgrade handshake
        # forwarded. allow/deny above are inherited by this nested location;
        # proxy_pass is not, and declaring any proxy_set_header here drops
        # the server-level ones, so they are repeated.
        location /grafana/api/live/ {
            proxy_pass http://grafana;
            proxy_set_header Host              $host;
            proxy_set_header X-Real-IP         $remote_addr;
            proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_set_header Upgrade           $http_upgrade;
            proxy_set_header Connection        "upgrade";
        }
    }

    # ----------------------------------------------------------
    # Health check (no request-rate limiting, not logged)
    # ----------------------------------------------------------
    location /health {
        proxy_pass http://combined_api/health;
        access_log off;
    }

    # Block common attack vectors
    # Any path containing a segment that starts with a dot (e.g. /.git, /.env).
    location ~ /\. {
        deny all;
        access_log off;
        log_not_found off;
    }

    # Files with sensitive extensions, matched case-insensitively.
    location ~* \.(env|git|bak|sql|log)$ {
        deny all;
        access_log off;
        log_not_found off;
    }
}

65
infra/nginx/nginx.conf Normal file
View File

@@ -0,0 +1,65 @@
# ============================================================
# Nginx — Main config
# ============================================================
# Worker processes run as the unprivileged "nginx" user; one worker per CPU core.
user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log warn;
pid /var/run/nginx.pid;

events {
    # Maximum simultaneous connections per worker process.
    worker_connections 1024;
    use epoll;
    # Accept all pending connections at once instead of one per event loop pass.
    multi_accept on;
}

http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;

    # Logging
    # One JSON object per request. escape=json escapes variable values so the
    # quoted fields stay valid JSON; status, body_bytes and duration are
    # emitted unquoted as numbers.
    log_format json_combined escape=json
        '{"time":"$time_iso8601",'
        '"remote_addr":"$remote_addr",'
        '"method":"$request_method",'
        '"uri":"$request_uri",'
        '"status":$status,'
        '"body_bytes":$body_bytes_sent,'
        '"duration":$request_time,'
        '"referrer":"$http_referer",'
        '"user_agent":"$http_user_agent",'
        '"x_forwarded_for":"$http_x_forwarded_for"}';
    access_log /var/log/nginx/access.log json_combined;

    # Performance
    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;
    keepalive_timeout 65;
    types_hash_max_size 2048;
    # Do not reveal the nginx version in headers and error pages.
    server_tokens off;

    # Gzip
    gzip on;
    gzip_vary on;
    # Responses shorter than 1024 bytes are not compressed.
    gzip_min_length 1024;
    # Compress responses to proxied requests as well.
    gzip_proxied any;
    gzip_comp_level 5;
    gzip_types
        text/plain
        text/css
        text/javascript
        application/javascript
        application/json
        application/xml
        image/svg+xml;

    # Rate limiting zones, all keyed by client IP address.
    #   api        — 30 requests per minute
    #   global     — 100 requests per minute
    #   conn_limit — concurrent-connection counter
    # The zones are only declared here; virtual hosts opt in with
    # limit_req / limit_conn.
    limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;
    limit_req_zone $binary_remote_addr zone=global:20m rate=100r/m;
    limit_conn_zone $binary_remote_addr zone=conn_limit:10m;

    # Answer throttled requests with 429 Too Many Requests. The nginx default
    # is 503, which is indistinguishable from a real outage for clients and
    # for anything watching 5xx rates in the access log.
    limit_req_status 429;
    limit_conn_status 429;

    # Include virtual hosts
    include /etc/nginx/conf.d/*.conf;
}

12
infra/postgres/init.sql Normal file
View File

@@ -0,0 +1,12 @@
-- ============================================================
-- Turf SaaS — PostgreSQL bootstrap
-- Executed on first container start (docker-entrypoint-initdb.d)
-- ============================================================

-- Extensions.
-- "uuid-ossp" must stay double-quoted because the name contains a hyphen.
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

-- Privileges for the application role, on the database and on its public schema.
GRANT ALL ON DATABASE turf_saas TO turf;
GRANT ALL PRIVILEGES ON SCHEMA public TO turf;

109
infra/prometheus/alerts.yml Normal file
View File

@@ -0,0 +1,109 @@
# ============================================================
# Prometheus Alert Rules — Turf SaaS
# ============================================================
groups:
  # ----------------------------------------------------------
  # HTTP / API Alerts
  # ----------------------------------------------------------
  - name: http_alerts
    rules:
      # Per-service share of requests answered with a 5xx status.
      # $value is a 0–1 ratio, which is what humanizePercentage expects.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (service)
          /
          sum(rate(http_requests_total[5m])) by (service)
          > 0.01
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High 5xx error rate on {{ $labels.service }}"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"

      # Per-service 95th percentile request duration, in seconds.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))
          > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High p95 latency on {{ $labels.service }}"
          description: "p95 latency is {{ $value | humanizeDuration }} (threshold: 2s)"

      # `up` is 0 for any scrape target Prometheus failed to scrape.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.instance }} has been unreachable for >1 minute"

  # ----------------------------------------------------------
  # Database Alerts
  # ----------------------------------------------------------
  - name: database_alerts
    rules:
      - alert: PostgresDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "Cannot connect to PostgreSQL database"

      # Database size converted from bytes to GiB before comparing.
      - alert: PostgresDiskUsageHigh
        expr: |
          (pg_database_size_bytes / (1024 * 1024 * 1024)) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL database size > 10GB"
          description: "Database {{ $labels.datname }} is {{ $value | humanize }}GB"

      # Used fraction of each filesystem, kept as a 0–1 ratio. The description
      # pipes $value through humanizePercentage, which multiplies by 100
      # itself, so the expression must not pre-multiply.
      - alert: DiskSpaceHigh
        expr: |
          (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes
          > 0.80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage > 80% on {{ $labels.instance }}"
          description: "{{ $labels.mountpoint }} is at {{ $value | humanizePercentage }}"

  # ----------------------------------------------------------
  # ML Prediction Alerts
  # ----------------------------------------------------------
  - name: ml_alerts
    rules:
      # The metric is a 0–1 ratio; 0.30 corresponds to 30%.
      - alert: MLAccuracyDegraded
        expr: ml_prediction_accuracy_ratio{accuracy_type="top1"} < 0.30
        for: 60m
        labels:
          severity: warning
        annotations:
          summary: "ML top-1 accuracy below 30%"
          description: "Current accuracy: {{ $value | humanizePercentage }}"

      - alert: MLPredictionDriftHigh
        expr: ml_prediction_drift_score > 0.5
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "ML feature drift detected"
          description: "Drift score for {{ $labels.feature_group }}: {{ $value }}"

      # The expression looks back 1h and must then hold for another 1h, so the
      # alert fires once no prediction has been counted for 2 hours in total,
      # matching the summary text.
      - alert: NoPredictionsGenerated
        expr: increase(ml_predictions_total[1h]) == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "No ML predictions generated in the last 2 hours"
          description: "Check if the scheduler is running and PMU data is being scraped"

View File

@@ -0,0 +1,68 @@
# ============================================================
# Prometheus Configuration — Turf SaaS
# ============================================================
# Defaults applied to every scrape job and to rule evaluation.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  # Labels attached to data leaving this server (alerts, federation, remote write).
  external_labels:
    project: turf-saas
    env: production

# Alertmanager — wire up when available (target list is intentionally empty).
alerting:
  alertmanagers:
    - static_configs:
        - targets: []

# Alert rule files to load
rule_files:
  - "alerts.yml"

# ============================================================
# Scrape targets
# ============================================================
scrape_configs:
  # Prometheus scraping itself
  - job_name: prometheus
    static_configs:
      - targets:
          - "localhost:9090"

  # Combined API
  - job_name: combined-api
    metrics_path: /metrics
    scrape_interval: 15s
    static_configs:
      - targets:
          - "combined-api:8790"

  # Dashboard API
  - job_name: dashboard-api
    metrics_path: /metrics
    scrape_interval: 15s
    static_configs:
      - targets:
          - "dashboard-api:8791"

  # Portal
  - job_name: portal
    metrics_path: /metrics
    scrape_interval: 30s
    static_configs:
      - targets:
          - "portal:8792"

  # PostgreSQL exporter (if deployed)
  - job_name: postgres
    scrape_interval: 30s
    static_configs:
      - targets:
          - "postgres-exporter:9187"

  # Redis exporter (if deployed)
  - job_name: redis
    scrape_interval: 30s
    static_configs:
      - targets:
          - "redis-exporter:9121"

  # Node exporter (host metrics)
  - job_name: node
    scrape_interval: 30s
    static_configs:
      - targets:
          - "host.docker.internal:9100"

45
infra/scripts/backup_db.sh Executable file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
# ============================================================
# Automated PostgreSQL Backup Script
# Run daily via cron: 0 2 * * * /opt/turf-saas/infra/scripts/backup_db.sh
#
# Environment:
#   POSTGRES_PASSWORD   (required) password passed to pg_dump
#   POSTGRES_DB         database name        (default: turf_saas)
#   POSTGRES_USER       database user        (default: turf)
#   POSTGRES_HOST       database host        (default: postgres)
#   BACKUP_DIR          target directory     (default: /opt/backups/turf-saas)
#   KEEP_DAYS           retention in days    (default: 30)
#   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID
#                       optional; when both are set a message is sent on
#                       success and on failure
#
# Exit status: 0 on success, non-zero if any step fails.
# ============================================================
set -euo pipefail

BACKUP_DIR="${BACKUP_DIR:-/opt/backups/turf-saas}"
KEEP_DAYS="${KEEP_DAYS:-30}"
DB_NAME="${POSTGRES_DB:-turf_saas}"
DB_USER="${POSTGRES_USER:-turf}"
DB_HOST="${POSTGRES_HOST:-postgres}"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/turf_saas_${TIMESTAMP}.sql.gz"
# The dump is written here first and renamed only once it is complete, so a
# failed run never leaves a truncated file that looks like a valid backup.
# The ".partial" suffix also keeps it out of the retention glob below.
TMP_FILE="${BACKUP_FILE}.partial"

# Send a best-effort Telegram message ($1 = text). Does nothing unless both
# TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID are set; never fails the script.
notify() {
    if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${TELEGRAM_CHAT_ID:-}" ]; then
        # --data-urlencode: the text contains spaces, parentheses and an emoji.
        # --max-time: a hung API call must not keep the cron job alive forever.
        curl -s --max-time 15 -X POST \
            "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
            --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
            --data-urlencode "text=$1" \
            > /dev/null || true
    fi
}

# Runs on every exit: removes a leftover partial dump and reports failures.
# The script's original exit status is preserved.
on_exit() {
    local status=$?
    rm -f "${TMP_FILE}"
    if [ "${status}" -ne 0 ]; then
        echo "[$(date -Iseconds)] Backup FAILED (exit ${status}): ${BACKUP_FILE}" >&2
        notify "❌ DB Backup FAILED: turf_saas ${TIMESTAMP} (exit ${status})"
    fi
}
trap on_exit EXIT

# Fail with an explicit message when the password variable is missing.
# An empty value is still accepted, as before.
: "${POSTGRES_PASSWORD?POSTGRES_PASSWORD must be set}"

echo "[$(date -Iseconds)] Starting backup: ${BACKUP_FILE}"

# Ensure backup directory exists
mkdir -p "${BACKUP_DIR}"

# Perform backup. With pipefail, a pg_dump failure aborts the script even
# though gzip itself exits successfully.
PGPASSWORD="${POSTGRES_PASSWORD}" pg_dump \
    -h "${DB_HOST}" \
    -U "${DB_USER}" \
    -d "${DB_NAME}" \
    --no-owner \
    --no-acl \
    | gzip > "${TMP_FILE}"

# Publish the finished dump under its final name.
mv "${TMP_FILE}" "${BACKUP_FILE}"

SIZE=$(du -sh "${BACKUP_FILE}" | cut -f1)
echo "[$(date -Iseconds)] Backup complete: ${BACKUP_FILE} (${SIZE})"

# Remove backups older than KEEP_DAYS
find "${BACKUP_DIR}" -type f -name "turf_saas_*.sql.gz" -mtime "+${KEEP_DAYS}" -delete
echo "[$(date -Iseconds)] Old backups cleaned (kept last ${KEEP_DAYS} days)"

# Optional: notify on completion
notify "✅ DB Backup OK: turf_saas ${TIMESTAMP} (${SIZE})"