Files
turf_saas/infra/prometheus/alerts.yml
DevOps Engineer dce1e9b744 feat(devops): CI/CD + Docker + Monitoring infrastructure
- Multi-stage Dockerfile (builder+runner, <500MB target)
- docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx
- .env.example with all required secrets (never hardcoded)
- requirements.txt with all dependencies including prometheus-client, alembic
- GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push
- GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback
- Alembic migration setup + initial PostgreSQL schema (001_initial_schema)
- SQLite→PostgreSQL data migration script
- Prometheus metrics module (HTTP, ML, DB, business metrics)
- Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy)
- Grafana dashboard (overview: req/s, p95, ML accuracy, error rate)
- Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers)
- Structured JSON logging module
- Automated daily DB backup script (pg_dump + 30-day retention)

Branch: feature/devops-cicd

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-25 17:32:02 +02:00

110 lines
3.7 KiB
YAML

# ============================================================
# Prometheus Alert Rules — Turf SaaS
# ============================================================
groups:
# ----------------------------------------------------------
# HTTP / API Alerts
# ----------------------------------------------------------
# Request-level health: 5xx ratio, p95 latency and scrape-target availability.
- name: http_alerts
  rules:
  # Per-service share of 5xx responses among all requests, computed from
  # 5-minute rates; must stay above 1% for 2 minutes before firing.
  - alert: HighErrorRate
    expr: |
      sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (service)
      /
      sum(rate(http_requests_total[5m])) by (service)
      > 0.01
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'High 5xx error rate on {{ $labels.service }}'
      # $value is a 0-1 ratio here, which is what humanizePercentage expects.
      description: 'Error rate is {{ $value | humanizePercentage }} (threshold: 1%)'

  # Per-service 95th percentile of the request-duration histogram over
  # 5-minute rates; must stay above 2 (seconds, per the metric name) for
  # 5 minutes before firing.
  - alert: HighLatency
    expr: |
      histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))
      > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High p95 latency on {{ $labels.service }}'
      description: 'p95 latency is {{ $value | humanizeDuration }} (threshold: 2s)'

  # Any series of the `up` metric that has been 0 for a full minute.
  - alert: ServiceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Service {{ $labels.job }} is down'
      description: '{{ $labels.instance }} has been unreachable for >1 minute'
# ----------------------------------------------------------
# Database Alerts
# ----------------------------------------------------------
- name: database_alerts
  rules:
  # pg_up has reported 0 for at least 1 minute.
  - alert: PostgresDown
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "PostgreSQL is down"
      description: "Cannot connect to PostgreSQL database"

  # Database size, converted from bytes by dividing by 1024^3, has been
  # above 10 for 5 minutes. $value is therefore already in those units.
  - alert: PostgresDiskUsageHigh
    expr: |
      (pg_database_size_bytes / (1024 * 1024 * 1024)) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PostgreSQL database size > 10GB"
      description: "Database {{ $labels.datname }} is {{ $value | humanize }}GB"

  # Used share of each filesystem, expressed on a 0-100 scale, has been
  # above 80 for 5 minutes.
  # NOTE(review): the selector has no fstype/mountpoint filter, so every
  # filesystem series exported for a host is evaluated — confirm this is
  # intended.
  - alert: DiskSpaceHigh
    expr: |
      (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100
      > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Disk usage > 80% on {{ $labels.instance }}"
      # The expr already multiplies by 100, so $value is e.g. 85, not 0.85.
      # humanizePercentage expects a 0-1 ratio and would render "8500%";
      # format the number directly and append the percent sign instead.
      description: '{{ $labels.mountpoint }} is at {{ $value | printf "%.1f" }}%'
# ----------------------------------------------------------
# ML Prediction Alerts
# ----------------------------------------------------------
- name: ml_alerts
  rules:
  # Top-1 accuracy ratio has been below 0.30 for 60 minutes.
  - alert: MLAccuracyDegraded
    expr: ml_prediction_accuracy_ratio{accuracy_type="top1"} < 0.30
    for: 60m
    labels:
      severity: warning
    annotations:
      summary: "ML top-1 accuracy below 30%"
      # $value is a 0-1 ratio, which is what humanizePercentage expects.
      description: "Current accuracy: {{ $value | humanizePercentage }}"

  # Drift score has been above 0.5 for 30 minutes; one alert per series
  # (the description reads the feature_group label).
  - alert: MLPredictionDriftHigh
    expr: ml_prediction_drift_score > 0.5
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: "ML feature drift detected"
      description: "Drift score for {{ $labels.feature_group }}: {{ $value }}"

  # The prediction counter has not increased within a 1-hour window, and
  # that condition has held for 1 hour — i.e. no predictions for 2 hours in
  # total, matching the summary. (A `for: 2h` on top of the 1h window would
  # delay the alert to roughly 3 hours of silence.)
  # NOTE(review): if ml_predictions_total is missing entirely the expression
  # yields no series and this alert cannot fire — consider a companion
  # absent() rule if that case matters.
  - alert: NoPredictionsGenerated
    expr: increase(ml_predictions_total[1h]) == 0
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "No ML predictions generated in the last 2 hours"
      description: "Check if the scheduler is running and PMU data is being scraped"