feat(devops): CI/CD + Docker + Monitoring infrastructure

- Multi-stage Dockerfile (builder+runner, <500MB target)
- docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx
- .env.example documenting every required secret as a placeholder (secrets are never hardcoded)
- requirements.txt with all dependencies including prometheus-client, alembic
- GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push
- GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback
- Alembic migration setup + initial PostgreSQL schema (001_initial_schema)
- SQLite→PostgreSQL data migration script
- Prometheus metrics module (HTTP, ML, DB, business metrics)
- Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy)
- Grafana dashboard (overview: req/s, 5xx error rate, p50/p95/p99 latency, ML accuracy and prediction volume, DB query p95)
- Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers)
- Structured JSON logging module
- Automated daily DB backup script (pg_dump + 30-day retention)

Branch: feature/devops-cicd

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
DevOps Engineer
2026-04-25 17:32:02 +02:00
parent ed07c8a3d1
commit dce1e9b744
25 changed files with 2659 additions and 0 deletions

View File

@@ -0,0 +1,174 @@
{
"title": "Turf SaaS — Overview",
"uid": "turf-saas-overview",
"schemaVersion": 38,
"version": 1,
"refresh": "30s",
"time": { "from": "now-6h", "to": "now" },
"tags": ["turf-saas"],
"panels": [
{
"id": 1,
"type": "stat",
"title": "Request Rate (req/s)",
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"targets": [
{
"datasource": "Prometheus",
"expr": "sum(rate(http_requests_total[5m]))",
"legendFormat": "req/s"
}
],
"options": { "colorMode": "background", "graphMode": "area" }
},
{
"id": 2,
"type": "stat",
"title": "Error Rate (5xx)",
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"targets": [
{
"datasource": "Prometheus",
"expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
"legendFormat": "error %"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "red", "value": 1 }
]
}
}
}
},
{
"id": 3,
"type": "stat",
"title": "p95 Latency",
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"targets": [
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p95"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 2 }
]
}
}
}
},
{
"id": 4,
"type": "stat",
"title": "ML Top-1 Accuracy",
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"targets": [
{
"datasource": "Prometheus",
"expr": "ml_prediction_accuracy_ratio{accuracy_type=\"top1\"} * 100",
"legendFormat": "top-1 %"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 25 },
{ "color": "green", "value": 35 }
]
}
}
}
},
{
"id": 5,
"type": "timeseries",
"title": "HTTP Requests by Service",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"targets": [
{
"datasource": "Prometheus",
"expr": "sum(rate(http_requests_total[5m])) by (service)",
"legendFormat": "{{ service }}"
}
],
"fieldConfig": {
"defaults": { "unit": "reqps" }
}
},
{
"id": 6,
"type": "timeseries",
"title": "Request Duration p50/p95/p99",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"targets": [
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p50"
},
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p95"
},
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p99"
}
],
"fieldConfig": {
"defaults": { "unit": "s" }
}
},
{
"id": 7,
"type": "timeseries",
"title": "ML Predictions per Hour",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"targets": [
{
"datasource": "Prometheus",
"expr": "sum(increase(ml_predictions_total[1h])) by (model_type)",
"legendFormat": "{{ model_type }}"
}
]
},
{
"id": 8,
"type": "timeseries",
"title": "DB Query Duration p95 by Operation",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"targets": [
{
"datasource": "Prometheus",
"expr": "histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le, operation))",
"legendFormat": "{{ operation }} p95"
}
],
"fieldConfig": {
"defaults": { "unit": "s" }
}
}
]
}