- Multi-stage Dockerfile (builder+runner, <500MB target)
- docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx
- .env.example with all required secrets (never hardcoded)
- requirements.txt with all dependencies including prometheus-client, alembic
- GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push
- GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback
- Alembic migration setup + initial PostgreSQL schema (001_initial_schema)
- SQLite→PostgreSQL data migration script
- Prometheus metrics module (HTTP, ML, DB, business metrics)
- Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy)
- Grafana dashboard (overview: req/s, p95, ML accuracy, error rate)
- Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers)
- Structured JSON logging module
- Automated daily DB backup script (pg_dump + 30-day retention)

Branch: feature/devops-cicd
Co-Authored-By: Paperclip <noreply@paperclip.ing>
175 lines
4.7 KiB
JSON
175 lines
4.7 KiB
JSON
{
  "title": "Turf SaaS — Overview",
  "uid": "turf-saas-overview",
  "schemaVersion": 38,
  "version": 1,
  "refresh": "30s",
  "time": { "from": "now-6h", "to": "now" },
  "tags": ["turf-saas"],
  "panels": [
    {
      "id": 1,
      "type": "stat",
      "title": "Request Rate (req/s)",
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(http_requests_total[5m]))",
          "legendFormat": "req/s"
        }
      ],
      "options": { "colorMode": "background", "graphMode": "area" }
    },
    {
      "id": 2,
      "type": "stat",
      "title": "Error Rate (5xx)",
      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
          "legendFormat": "error %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 0.5 },
              { "color": "red", "value": 1 }
            ]
          }
        }
      }
    },
    {
      "id": 3,
      "type": "stat",
      "title": "p95 Latency",
      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p95"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 },
              { "color": "red", "value": 2 }
            ]
          }
        }
      }
    },
    {
      "id": 4,
      "type": "stat",
      "title": "ML Top-1 Accuracy",
      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "ml_prediction_accuracy_ratio{accuracy_type=\"top1\"} * 100",
          "legendFormat": "top-1 %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "red", "value": null },
              { "color": "yellow", "value": 25 },
              { "color": "green", "value": 35 }
            ]
          }
        }
      }
    },
    {
      "id": 5,
      "type": "timeseries",
      "title": "HTTP Requests by Service",
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(http_requests_total[5m])) by (service)",
          "legendFormat": "{{ service }}"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "reqps" }
      }
    },
    {
      "id": 6,
      "type": "timeseries",
      "title": "Request Duration p50/p95/p99",
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p50"
        },
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p95"
        },
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p99"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "s" }
      }
    },
    {
      "id": 7,
      "type": "timeseries",
      "title": "ML Predictions per Hour",
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(increase(ml_predictions_total[1h])) by (model_type)",
          "legendFormat": "{{ model_type }}"
        }
      ]
    },
    {
      "id": 8,
      "type": "timeseries",
      "title": "DB Query Duration",
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le, operation))",
          "legendFormat": "{{ operation }} p95"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "s" }
      }
    }
  ]
}