feat(devops): CI/CD + Docker + Monitoring infrastructure

- Multi-stage Dockerfile (builder+runner, <500MB target)
- docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx
- .env.example with all required secrets (never hardcoded)
- requirements.txt with all dependencies including prometheus-client, alembic
- GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push
- GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback
- Alembic migration setup + initial PostgreSQL schema (001_initial_schema)
- SQLite→PostgreSQL data migration script
- Prometheus metrics module (HTTP, ML, DB, business metrics)
- Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy)
- Grafana dashboard (overview: req/s, p95, ML accuracy, error rate)
- Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers)
- Structured JSON logging module
- Automated daily DB backup script (pg_dump + 30-day retention)

Branch: feature/devops-cicd

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-25 17:32:02 +02:00
parent ed07c8a3d1
commit dce1e9b744
25 changed files with 2659 additions and 0 deletions
--- a/metrics.py
+++ b/metrics.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""
+Prometheus metrics instrumentation for Turf SaaS.
+Import this module in Flask apps to expose /metrics endpoint.
+"""
+
+import time
+import functools
+import logging
+from typing import Callable, Any
+
+try:
+    from prometheus_client import (
+        Counter,
+        Histogram,
+        Gauge,
+        Summary,
+        generate_latest,
+        CONTENT_TYPE_LATEST,
+        CollectorRegistry,
+        multiprocess,
+        REGISTRY,
+    )
+
+    PROMETHEUS_AVAILABLE = True
+except ImportError:
+    PROMETHEUS_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+# ============================================================
+# Metric definitions
+# ============================================================
+
+if PROMETHEUS_AVAILABLE:
+    # Every collector below is created at import time and, since no
+    # ``registry`` argument is passed, lands in prometheus_client's default
+    # registry. Metric names and label names are part of the contract with
+    # dashboards and alert rules, so treat them as a public interface.
+
+    # HTTP metrics
+    # Populated by the request hooks that init_metrics() installs on the app.
+    HTTP_REQUESTS_TOTAL = Counter(
+        "http_requests_total",
+        "Total number of HTTP requests",
+        ["method", "endpoint", "status_code", "service"],
+    )
+
+    # Bucket upper bounds are in seconds (10 ms .. 10 s).
+    HTTP_REQUEST_DURATION = Histogram(
+        "http_request_duration_seconds",
+        "HTTP request duration in seconds",
+        ["method", "endpoint", "service"],
+        buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0],
+    )
+
+    HTTP_REQUESTS_IN_PROGRESS = Gauge(
+        "http_requests_in_progress",
+        "Number of HTTP requests currently being processed",
+        ["method", "endpoint", "service"],
+    )
+
+    # ML prediction metrics
+    # Counter and histogram are fed by the track_ml_prediction() decorator.
+    ML_PREDICTIONS_TOTAL = Counter(
+        "ml_predictions_total",
+        "Total ML prediction requests",
+        ["model_type", "race_type"],
+    )
+
+    # Bucket upper bounds are in seconds (10 ms .. 5 s).
+    ML_PREDICTION_DURATION = Histogram(
+        "ml_prediction_duration_seconds",
+        "ML prediction duration in seconds",
+        ["model_type"],
+        buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0],
+    )
+
+    # Written by update_ml_accuracy() with accuracy_type "top1" / "top3".
+    ML_PREDICTION_ACCURACY = Gauge(
+        "ml_prediction_accuracy_ratio",
+        "Rolling ML prediction accuracy (top-1, top-3)",
+        ["accuracy_type"],
+    )
+
+    # NOTE(review): no helper visible in this module sets this gauge —
+    # presumably it is updated by the caller directly; confirm.
+    ML_PREDICTION_DRIFT = Gauge(
+        "ml_prediction_drift_score",
+        "Feature drift score for ML models (0=no drift, 1=full drift)",
+        ["feature_group"],
+    )
+
+    # Database metrics
+    # Counter and histogram are fed by the track_db_query() decorator; note
+    # that only the counter carries the ``table`` label.
+    DB_QUERIES_TOTAL = Counter(
+        "db_queries_total", "Total database queries", ["operation", "table"]
+    )
+
+    # Bucket upper bounds are in seconds (1 ms .. 1 s).
+    DB_QUERY_DURATION = Histogram(
+        "db_query_duration_seconds",
+        "Database query duration",
+        ["operation"],
+        buckets=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0],
+    )
+
+    # Unlabelled gauge. NOTE(review): nothing visible in this module sets it.
+    DB_CONNECTION_POOL_SIZE = Gauge(
+        "db_connection_pool_size", "Current database connection pool size"
+    )
+
+    # Business metrics
+    # NOTE(review): nothing visible in this module increments this counter.
+    RACES_SCRAPED_TOTAL = Counter(
+        "races_scraped_total", "Total number of races scraped", ["source", "discipline"]
+    )
+
+    # NOTE(review): a ``date`` label presumably creates a new time series per
+    # day and race_type, which would grow without bound in a long-lived
+    # process — confirm how callers populate it and whether old series are
+    # removed.
+    PREDICTIONS_ACCURACY_DAILY = Gauge(
+        "predictions_accuracy_daily_ratio",
+        "Daily prediction accuracy ratio",
+        ["date", "race_type"],
+    )
+
+    # Written by update_subscription_count(), one series per plan.
+    # NOTE(review): the ``_total`` suffix is conventionally reserved for
+    # counters; renaming would break existing queries, so it is left as is.
+    ACTIVE_SUBSCRIPTIONS = Gauge(
+        "active_subscriptions_total", "Number of active SaaS subscriptions", ["plan"]
+    )
+
+    # App health
+    # Constant-1 gauge whose labels carry build metadata; set by init_metrics().
+    APP_INFO = Gauge(
+        "app_info", "Application build information", ["version", "service", "env"]
+    )
+
+
+# ============================================================
+# Flask integration
+# ============================================================
+
+
+def init_metrics(app, service_name: str = "unknown"):
+    """
+    Register Prometheus metrics middleware on a Flask app.
+
+    Installs request hooks that maintain the HTTP request counter, the
+    request-duration histogram and the in-progress gauge, sets the
+    ``app_info`` gauge, and adds the ``/metrics`` and ``/health`` routes.
+
+    The ``endpoint`` label is the matched route pattern (for example
+    ``/races/<int:race_id>``) rather than the raw URL path, so that URLs
+    containing identifiers do not create one time series per distinct URL.
+    Requests that match no route are labelled ``unmatched``.
+
+    Args:
+        app: Flask application to instrument.
+        service_name: Value used for the ``service`` label on every metric
+            and echoed by the ``/health`` endpoint.
+
+    Returns:
+        None. When prometheus_client is not installed, a warning is logged
+        and the app is left untouched (no hooks, no routes).
+
+    Usage:
+        from metrics import init_metrics
+        init_metrics(app, service_name="combined-api")
+    """
+    if not PROMETHEUS_AVAILABLE:
+        logger.warning("prometheus_client not installed — metrics disabled")
+        return
+
+    from flask import Response, g, jsonify, request
+
+    # Set app info gauge
+    APP_INFO.labels(
+        version=app.config.get("VERSION", "unknown"),
+        service=service_name,
+        env=app.config.get("ENV", "unknown"),
+    ).set(1)
+
+    def _endpoint_label() -> str:
+        """Return a bounded-cardinality label for the current request."""
+        rule = request.url_rule
+        return rule.rule if rule is not None else "unmatched"
+
+    @app.before_request
+    def before_request():
+        labels = {
+            "method": request.method,
+            "endpoint": _endpoint_label(),
+            "service": service_name,
+        }
+        HTTP_REQUESTS_IN_PROGRESS.labels(**labels).inc()
+        # Remember exactly what was incremented so teardown can undo it,
+        # and only it.
+        g._metrics_in_progress = labels
+        # perf_counter is monotonic; time.time() can jump when the system
+        # clock is adjusted and would then yield bogus durations.
+        g._metrics_start = time.perf_counter()
+
+    @app.after_request
+    def after_request(response):
+        started = g.get("_metrics_start")
+        # The start time is missing when an earlier before_request hook
+        # short-circuited the request; record a zero duration in that case.
+        duration = 0.0 if started is None else time.perf_counter() - started
+        endpoint = _endpoint_label()
+
+        HTTP_REQUESTS_TOTAL.labels(
+            method=request.method,
+            endpoint=endpoint,
+            status_code=str(response.status_code),
+            service=service_name,
+        ).inc()
+
+        HTTP_REQUEST_DURATION.labels(
+            method=request.method, endpoint=endpoint, service=service_name
+        ).observe(duration)
+
+        return response
+
+    @app.teardown_request
+    def teardown_request(_exc):
+        # Teardown runs even when after_request hooks are skipped because of
+        # an unhandled exception, so the gauge cannot leak upwards; popping
+        # the stored labels ensures it is never decremented without a
+        # matching increment.
+        labels = g.pop("_metrics_in_progress", None)
+        if labels is not None:
+            HTTP_REQUESTS_IN_PROGRESS.labels(**labels).dec()
+
+    @app.route("/metrics")
+    def metrics_endpoint():
+        """Prometheus metrics scrape endpoint."""
+        # NOTE(review): this serialises the default in-process registry only.
+        # If the app is served by several worker processes, each scrape sees
+        # a single worker's numbers unless prometheus_client's multiprocess
+        # mode is configured — confirm against the deployment.
+        return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)
+
+    @app.route("/health")
+    def health_endpoint():
+        """Docker / load-balancer health check endpoint."""
+        return jsonify({"status": "ok", "service": service_name})
+
+    logger.info("Prometheus metrics initialized for service: %s", service_name)
+
+
+# ============================================================
+# Decorator helpers
+# ============================================================
+
+
+def track_ml_prediction(model_type: str = "xgboost", race_type: str = "flat"):
+    """Decorator to track ML prediction calls.
+
+    Each call of the wrapped function is timed and the elapsed time is
+    recorded in the ML prediction-duration histogram whether the call
+    returns or raises. The predictions counter is incremented only when the
+    call returns normally. Exceptions raised by the wrapped function
+    propagate unchanged.
+
+    When prometheus_client is not installed the wrapped function is called
+    directly with no instrumentation.
+
+    Args:
+        model_type: Value for the ``model_type`` label on both metrics.
+        race_type: Value for the ``race_type`` label on the counter.
+
+    Returns:
+        A decorator that wraps a callable and preserves its metadata.
+    """
+
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            if not PROMETHEUS_AVAILABLE:
+                return func(*args, **kwargs)
+            # Use a monotonic clock: time.time() follows the wall clock and
+            # can move backwards, producing negative or inflated durations.
+            start = time.perf_counter()
+            try:
+                result = func(*args, **kwargs)
+                ML_PREDICTIONS_TOTAL.labels(
+                    model_type=model_type, race_type=race_type
+                ).inc()
+                return result
+            finally:
+                ML_PREDICTION_DURATION.labels(model_type=model_type).observe(
+                    time.perf_counter() - start
+                )
+
+        return wrapper
+
+    return decorator
+
+
+def track_db_query(operation: str = "select", table: str = "unknown"):
+    """Decorator to track DB query calls.
+
+    Each call of the wrapped function is timed and the elapsed time is
+    recorded in the query-duration histogram whether the call returns or
+    raises. The query counter is incremented only when the call returns
+    normally. Exceptions raised by the wrapped function propagate unchanged.
+
+    When prometheus_client is not installed the wrapped function is called
+    directly with no instrumentation.
+
+    Args:
+        operation: Value for the ``operation`` label on both metrics.
+        table: Value for the ``table`` label on the counter.
+
+    Returns:
+        A decorator that wraps a callable and preserves its metadata.
+    """
+
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            if not PROMETHEUS_AVAILABLE:
+                return func(*args, **kwargs)
+            # Use a monotonic clock: time.time() follows the wall clock and
+            # can move backwards, producing negative or inflated durations.
+            start = time.perf_counter()
+            try:
+                result = func(*args, **kwargs)
+                DB_QUERIES_TOTAL.labels(operation=operation, table=table).inc()
+                return result
+            finally:
+                DB_QUERY_DURATION.labels(operation=operation).observe(
+                    time.perf_counter() - start
+                )
+
+        return wrapper
+
+    return decorator
+
+
+def update_ml_accuracy(top1_accuracy: float, top3_accuracy: float):
+    """Publish the latest top-1 and top-3 accuracy values.
+
+    Sets the ML accuracy gauge for ``accuracy_type="top1"`` and then for
+    ``accuracy_type="top3"``. Does nothing when prometheus_client is not
+    installed. Intended to be called from a scheduler.
+    """
+    if PROMETHEUS_AVAILABLE:
+        readings = (("top1", top1_accuracy), ("top3", top3_accuracy))
+        for kind, value in readings:
+            ML_PREDICTION_ACCURACY.labels(accuracy_type=kind).set(value)
+
+
+def update_subscription_count(plan_counts: dict):
+    """Publish the number of active subscriptions for each plan.
+
+    For every ``plan -> count`` entry in ``plan_counts`` the subscriptions
+    gauge labelled with that plan is set to the count. Plans absent from
+    ``plan_counts`` are left at whatever value they last had. Does nothing
+    when prometheus_client is not installed.
+    """
+    if PROMETHEUS_AVAILABLE:
+        for plan_name, active in plan_counts.items():
+            plan_gauge = ACTIVE_SUBSCRIPTIONS.labels(plan=plan_name)
+            plan_gauge.set(active)