- Multi-stage Dockerfile (builder+runner, <500MB target) - docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx - .env.example with all required secrets (never hardcoded) - requirements.txt with all dependencies including prometheus-client, alembic - GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push - GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback - Alembic migration setup + initial PostgreSQL schema (001_initial_schema) - SQLite→PostgreSQL data migration script - Prometheus metrics module (HTTP, ML, DB, business metrics) - Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy) - Grafana dashboard (overview: req/s, p95, ML accuracy, error rate) - Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers) - Structured JSON logging module - Automated daily DB backup script (pg_dump + 30-day retention) Branch: feature/devops-cicd Co-Authored-By: Paperclip <noreply@paperclip.ing>
110 lines
3.7 KiB
YAML
110 lines
3.7 KiB
YAML
# ============================================================
|
|
# Prometheus Alert Rules — Turf SaaS
|
|
# ============================================================
|
|
|
|
groups:
|
|
# ----------------------------------------------------------
|
|
# HTTP / API Alerts
|
|
# ----------------------------------------------------------
|
|
- name: http_alerts
  rules:
  # Per-service share of requests answered with a 5xx status over a 5-minute
  # rate window; fires once that ratio stays above 0.01 (1%) for 2 minutes.
  # The value is a 0-1 ratio, which is what humanizePercentage expects.
  - alert: HighErrorRate
    expr: |
      sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (service)
      /
      sum(rate(http_requests_total[5m])) by (service)
      > 0.01
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'High 5xx error rate on {{ $labels.service }}'
      description: 'Error rate is {{ $value | humanizePercentage }} (threshold: 1%)'

  # Per-service 95th-percentile request duration derived from the histogram
  # buckets over a 5-minute rate window; fires after 5 minutes above 2 seconds.
  - alert: HighLatency
    expr: |
      histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))
      > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High p95 latency on {{ $labels.service }}'
      description: 'p95 latency is {{ $value | humanizeDuration }} (threshold: 2s)'

  # Fires for any series whose `up` value has been 0 for 1 minute; the
  # annotations report the `job` and `instance` labels of that series.
  - alert: ServiceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Service {{ $labels.job }} is down'
      description: '{{ $labels.instance }} has been unreachable for >1 minute'
|
|
|
|
# ----------------------------------------------------------
|
|
# Database Alerts
|
|
# ----------------------------------------------------------
|
|
- name: database_alerts
  rules:
  # Fires when `pg_up` has been 0 for 1 minute.
  # NOTE(review): presumably exported by postgres_exporter - confirm the
  # exporter is part of the scrape config.
  - alert: PostgresDown
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'PostgreSQL is down'
      description: 'Cannot connect to PostgreSQL database'

  # Converts each database's size from bytes to units of 1024^3 bytes and
  # fires when a database stays above 10 of those units for 5 minutes.
  - alert: PostgresDiskUsageHigh
    expr: |
      (pg_database_size_bytes / (1024 * 1024 * 1024)) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'PostgreSQL database size > 10GB'
      description: 'Database {{ $labels.datname }} is {{ $value | humanize }}GB'

  # Used share of each filesystem expressed on a 0-100 scale; fires after
  # 5 minutes above 80.
  - alert: DiskSpaceHigh
    expr: |
      (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100
      > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Disk usage > 80% on {{ $labels.instance }}'
      # The expression already yields a 0-100 percentage. humanizePercentage
      # expects a 0-1 ratio and multiplies by 100, so it would render 85 as
      # "8500%". Format the number directly and append a literal percent sign.
      description: '{{ $labels.mountpoint }} is at {{ printf "%.1f" $value }}%'
|
|
|
|
# ----------------------------------------------------------
|
|
# ML Prediction Alerts
|
|
# ----------------------------------------------------------
|
|
- name: ml_alerts
  rules:
  # Fires when the top-1 accuracy ratio stays below 0.30 for 60 minutes.
  # The metric is compared as a 0-1 ratio, matching humanizePercentage.
  - alert: MLAccuracyDegraded
    expr: ml_prediction_accuracy_ratio{accuracy_type="top1"} < 0.30
    for: 60m
    labels:
      severity: warning
    annotations:
      summary: 'ML top-1 accuracy below 30%'
      description: 'Current accuracy: {{ $value | humanizePercentage }}'

  # Fires when a drift score stays above 0.5 for 30 minutes; the description
  # reports the `feature_group` label of the offending series.
  - alert: MLPredictionDriftHigh
    expr: ml_prediction_drift_score > 0.5
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: 'ML feature drift detected'
      description: 'Drift score for {{ $labels.feature_group }}: {{ $value }}'

  # Fires when the prediction counter shows no increase over a 1-hour window
  # and that condition holds for a further 2 hours.
  # NOTE(review): window plus `for` adds up to roughly 3 hours without
  # predictions, while the summary says 2 hours - confirm which is intended.
  # NOTE(review): the comparison returns nothing if the metric has no series
  # at all, so this rule cannot fire when the counter was never exported.
  - alert: NoPredictionsGenerated
    expr: increase(ml_predictions_total[1h]) == 0
    for: 2h
    labels:
      severity: warning
    annotations:
      summary: 'No ML predictions generated in the last 2 hours'
      description: 'Check if the scheduler is running and PMU data is being scraped'
|