turf_saas/infra/prometheus/alerts.yml

# ============================================================
# Prometheus Alert Rules — Turf SaaS
# ============================================================

groups:
  # ----------------------------------------------------------
  # HTTP / API Alerts
  # ----------------------------------------------------------
  - name: http_alerts
    rules:
      # Per-service share of requests answered with a 5xx status code, computed
      # from 5-minute rates of http_requests_total. Goes critical once the share
      # has stayed above 1% for two minutes. A service with no traffic in the
      # window yields 0/0 = NaN, which does not satisfy the comparison.
      - alert: HighErrorRate
        expr: >-
          sum by (service) (rate(http_requests_total{status_code=~"5.."}[5m]))
          /
          sum by (service) (rate(http_requests_total[5m]))
          > 0.01
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'High 5xx error rate on {{ $labels.service }}'
          # $value is a 0-1 ratio, so humanizePercentage renders it correctly.
          description: 'Error rate is {{ $value | humanizePercentage }} (threshold: 1%)'

      # Per-service p95 request duration estimated from the
      # http_request_duration_seconds histogram buckets (5-minute rate).
      # Warns once the estimate has stayed above 2 for five minutes.
      - alert: HighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le, service) (rate(http_request_duration_seconds_bucket[5m]))
          )
          > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High p95 latency on {{ $labels.service }}'
          # The metric name indicates seconds, which is what humanizeDuration
          # expects as input.
          description: 'p95 latency is {{ $value | humanizeDuration }} (threshold: 2s)'

      # Goes critical when any series of the `up` metric has been 0 for one
      # minute. The annotations report the affected job and instance labels.
      - alert: ServiceDown
        expr: 'up == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.job }} is down'
          description: '{{ $labels.instance }} has been unreachable for >1 minute'

  # ----------------------------------------------------------
  # Database Alerts
  # ----------------------------------------------------------
  - name: database_alerts
    rules:
      # Goes critical when pg_up has reported 0 for one minute.
      # NOTE(review): pg_up is presumably exported by postgres_exporter; if the
      # exporter itself is unreachable the series disappears and this rule
      # stays silent — confirm that case is covered by ServiceDown.
      - alert: PostgresDown
        expr: 'pg_up == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'PostgreSQL is down'
          description: 'Cannot connect to PostgreSQL database'

      # Warns when a database's size (pg_database_size_bytes) has been above 10
      # units of 1024^3 bytes for five minutes. The divisor makes the unit GiB,
      # although the annotation text labels it "GB".
      - alert: PostgresDiskUsageHigh
        expr: 'pg_database_size_bytes / (1024 ^ 3) > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'PostgreSQL database size > 10GB'
          # $value is the already-divided size, not raw bytes.
          description: 'Database {{ $labels.datname }} is {{ $value | humanize }}GB'

      # Warns when used space on a filesystem has been above 80% of its size
      # for five minutes. Used space is derived as size minus free bytes.
      # NOTE(review): the expression has no fstype/mountpoint filter, so every
      # filesystem series exported is evaluated — confirm that is intended.
      - alert: DiskSpaceHigh
        expr: |
          (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100
          > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Disk usage > 80% on {{ $labels.instance }}'
          # The expression already scales to 0-100, so $value must not go
          # through humanizePercentage (which multiplies by 100 again and would
          # render 85 as "8500%"). Format the number directly instead.
          description: '{{ $labels.mountpoint }} is at {{ $value | printf "%.1f" }}%'

  # ----------------------------------------------------------
  # ML Prediction Alerts
  # ----------------------------------------------------------
  - name: ml_alerts
    rules:
      # Warns when the top-1 series of ml_prediction_accuracy_ratio has stayed
      # below 0.30 for a full hour.
      - alert: MLAccuracyDegraded
        expr: 'ml_prediction_accuracy_ratio{accuracy_type="top1"} < 0.3'
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 'ML top-1 accuracy below 30%'
          # The threshold of 0.30 treats the metric as a 0-1 ratio, matching
          # what humanizePercentage expects.
          description: 'Current accuracy: {{ $value | humanizePercentage }}'

      # Warns when ml_prediction_drift_score has stayed above 0.5 for thirty
      # minutes. Each series is evaluated separately; the annotation reads the
      # feature_group label of the offending series.
      - alert: MLPredictionDriftHigh
        expr: 'ml_prediction_drift_score > 0.5'
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: 'ML feature drift detected'
          description: 'Drift score for {{ $labels.feature_group }}: {{ $value }}'

      # Warns when no predictions are being produced.
      #
      # Two conditions are OR-ed together:
      #   1. the counter exists but did not increase over the trailing hour;
      #   2. the counter has no series at all. Without this branch the first
      #      comparison returns an empty vector when the metric is missing
      #      (e.g. the process exporting it is down), and the alert could
      #      never fire in the situation the description refers to.
      # Either condition must hold continuously for two hours before firing.
      - alert: NoPredictionsGenerated
        expr: |
          increase(ml_predictions_total[1h]) == 0
          or
          absent(ml_predictions_total)
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "No ML predictions generated in the last 2 hours"
          description: "Check if the scheduler is running and PMU data is being scraped"