feat(devops): CI/CD + Docker + Monitoring infrastructure

- Multi-stage Dockerfile (builder+runner, <500MB target)
- docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx
- .env.example with all required secrets (never hardcoded)
- requirements.txt with all dependencies including prometheus-client, alembic
- GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push
- GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback
- Alembic migration setup + initial PostgreSQL schema (001_initial_schema)
- SQLite→PostgreSQL data migration script
- Prometheus metrics module (HTTP, ML, DB, business metrics)
- Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy)
- Grafana dashboard (overview: req/s, p95, ML accuracy, error rate)
- Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers)
- Structured JSON logging module
- Automated daily DB backup script (pg_dump + 30-day retention)

Branch: feature/devops-cicd

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
DevOps Engineer
2026-04-25 17:32:02 +02:00
parent ed07c8a3d1
commit dce1e9b744
25 changed files with 2659 additions and 0 deletions

255
metrics.py Normal file
View File

@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""
Prometheus metrics instrumentation for Turf SaaS.
Import this module in Flask apps to expose /metrics endpoint.
"""
import time
import functools
import logging
from typing import Callable, Any
try:
from prometheus_client import (
Counter,
Histogram,
Gauge,
Summary,
generate_latest,
CONTENT_TYPE_LATEST,
CollectorRegistry,
multiprocess,
REGISTRY,
)
PROMETHEUS_AVAILABLE = True
except ImportError:
PROMETHEUS_AVAILABLE = False
logger = logging.getLogger(__name__)
# ============================================================
# Metric definitions
# ============================================================
if PROMETHEUS_AVAILABLE:
    # Collectors are created once, at import time, and only when
    # prometheus_client could be imported. No explicit registry is passed,
    # so they land on the library's default registry.

    # ---- HTTP traffic -------------------------------------------------
    HTTP_REQUESTS_TOTAL = Counter(
        name="http_requests_total",
        documentation="Total number of HTTP requests",
        labelnames=["method", "endpoint", "status_code", "service"],
    )
    HTTP_REQUEST_DURATION = Histogram(
        name="http_request_duration_seconds",
        documentation="HTTP request duration in seconds",
        labelnames=["method", "endpoint", "service"],
        # Bucket upper bounds in seconds (10 ms .. 10 s).
        buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0],
    )
    HTTP_REQUESTS_IN_PROGRESS = Gauge(
        name="http_requests_in_progress",
        documentation="Number of HTTP requests currently being processed",
        labelnames=["method", "endpoint", "service"],
    )

    # ---- ML predictions -----------------------------------------------
    ML_PREDICTIONS_TOTAL = Counter(
        name="ml_predictions_total",
        documentation="Total ML prediction requests",
        labelnames=["model_type", "race_type"],
    )
    ML_PREDICTION_DURATION = Histogram(
        name="ml_prediction_duration_seconds",
        documentation="ML prediction duration in seconds",
        labelnames=["model_type"],
        # Bucket upper bounds in seconds (10 ms .. 5 s).
        buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0],
    )
    ML_PREDICTION_ACCURACY = Gauge(
        name="ml_prediction_accuracy_ratio",
        documentation="Rolling ML prediction accuracy (top-1, top-3)",
        labelnames=["accuracy_type"],
    )
    ML_PREDICTION_DRIFT = Gauge(
        name="ml_prediction_drift_score",
        documentation="Feature drift score for ML models (0=no drift, 1=full drift)",
        labelnames=["feature_group"],
    )

    # ---- Database -----------------------------------------------------
    DB_QUERIES_TOTAL = Counter(
        name="db_queries_total",
        documentation="Total database queries",
        labelnames=["operation", "table"],
    )
    DB_QUERY_DURATION = Histogram(
        name="db_query_duration_seconds",
        documentation="Database query duration",
        labelnames=["operation"],
        # Bucket upper bounds in seconds (1 ms .. 1 s).
        buckets=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0],
    )
    DB_CONNECTION_POOL_SIZE = Gauge(
        name="db_connection_pool_size",
        documentation="Current database connection pool size",
    )

    # ---- Business -----------------------------------------------------
    RACES_SCRAPED_TOTAL = Counter(
        name="races_scraped_total",
        documentation="Total number of races scraped",
        labelnames=["source", "discipline"],
    )
    # NOTE(review): the "date" label adds a new time series per distinct
    # date value; confirm that this growth is intended.
    PREDICTIONS_ACCURACY_DAILY = Gauge(
        name="predictions_accuracy_daily_ratio",
        documentation="Daily prediction accuracy ratio",
        labelnames=["date", "race_type"],
    )
    # NOTE(review): this is a gauge although the series name ends in
    # "_total"; the name is kept as-is because dashboards may depend on it.
    ACTIVE_SUBSCRIPTIONS = Gauge(
        name="active_subscriptions_total",
        documentation="Number of active SaaS subscriptions",
        labelnames=["plan"],
    )

    # ---- App health ---------------------------------------------------
    APP_INFO = Gauge(
        name="app_info",
        documentation="Application build information",
        labelnames=["version", "service", "env"],
    )
# ============================================================
# Flask integration
# ============================================================
def init_metrics(app, service_name: str = "unknown"):
    """
    Register Prometheus metrics middleware on a Flask app.

    Sets the ``app_info`` gauge from ``app.config`` ("VERSION" and "ENV",
    each defaulting to "unknown"), installs request hooks that record the
    request count, request duration and number of in-flight requests, and
    registers the ``/metrics`` and ``/health`` routes on ``app``.

    When ``prometheus_client`` is not installed, a warning is logged and the
    app is left untouched (no hooks, no routes).

    Args:
        app: Flask application to instrument.
        service_name: Value used for the ``service`` label on every HTTP
            metric and echoed by the ``/health`` endpoint.

    Usage:
        from metrics import init_metrics
        init_metrics(app, service_name="combined-api")
    """
    if not PROMETHEUS_AVAILABLE:
        logger.warning("prometheus_client not installed — metrics disabled")
        return

    # Imported lazily so this module can be imported without Flask installed.
    from flask import Response, jsonify, request

    # Set app info gauge
    APP_INFO.labels(
        version=app.config.get("VERSION", "unknown"),
        service=service_name,
        env=app.config.get("ENV", "unknown"),
    ).set(1)

    def _endpoint_label() -> str:
        """Return a bounded-cardinality ``endpoint`` label for this request.

        The matched route template (e.g. "/races/<int:race_id>") is used
        instead of the raw path: raw paths create one time series per
        distinct URL, including every path a scanner probes. Requests that
        matched no route share a single fixed label.
        """
        rule = request.url_rule
        return rule.rule if rule is not None else "unmatched"

    @app.before_request
    def before_request():
        # perf_counter is monotonic; wall-clock time can jump backwards.
        request._start_time = time.perf_counter()
        in_progress = HTTP_REQUESTS_IN_PROGRESS.labels(
            method=request.method,
            endpoint=_endpoint_label(),
            service=service_name,
        )
        in_progress.inc()
        # Remember the exact child that was incremented so that teardown
        # decrements the same series.
        request._in_progress_gauge = in_progress

    @app.after_request
    def after_request(response):
        now = time.perf_counter()
        # If before_request never ran for this request, report a zero
        # duration rather than failing.
        duration = now - getattr(request, "_start_time", now)
        endpoint = _endpoint_label()
        HTTP_REQUESTS_TOTAL.labels(
            method=request.method,
            endpoint=endpoint,
            status_code=str(response.status_code),
            service=service_name,
        ).inc()
        HTTP_REQUEST_DURATION.labels(
            method=request.method, endpoint=endpoint, service=service_name
        ).observe(duration)
        return response

    @app.teardown_request
    def teardown_request(exc):
        # Teardown callbacks run even when after_request is skipped (for
        # example when an exception propagates), so decrementing here keeps
        # the in-progress gauge from drifting upwards.
        in_progress = getattr(request, "_in_progress_gauge", None)
        if in_progress is not None:
            in_progress.dec()

    @app.route("/metrics")
    def metrics_endpoint():
        """Prometheus metrics scrape endpoint."""
        return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)

    @app.route("/health")
    def health_endpoint():
        """Docker / load-balancer health check endpoint."""
        return jsonify({"status": "ok", "service": service_name})

    logger.info("Prometheus metrics initialized for service: %s", service_name)
# ============================================================
# Decorator helpers
# ============================================================
def track_ml_prediction(model_type: str = "xgboost", race_type: str = "flat"):
    """Decorator factory that records metrics for an ML prediction function.

    For every call of the decorated function the elapsed time is observed
    on ``ML_PREDICTION_DURATION`` (whether the call returns or raises), and
    ``ML_PREDICTIONS_TOTAL`` is incremented only when the call returns
    normally. When prometheus_client is unavailable the function is called
    straight through with no instrumentation.

    Args:
        model_type: Value of the ``model_type`` label on both metrics.
        race_type: Value of the ``race_type`` label on the counter.

    Returns:
        A decorator that wraps a callable and preserves its metadata.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if not PROMETHEUS_AVAILABLE:
                return func(*args, **kwargs)
            # Monotonic clock: wall-clock adjustments must not produce
            # negative or inflated durations.
            start = time.perf_counter()
            try:
                result = func(*args, **kwargs)
                ML_PREDICTIONS_TOTAL.labels(
                    model_type=model_type, race_type=race_type
                ).inc()
                return result
            finally:
                # Runs on success and on exception alike.
                ML_PREDICTION_DURATION.labels(model_type=model_type).observe(
                    time.perf_counter() - start
                )

        return wrapper

    return decorator
def track_db_query(operation: str = "select", table: str = "unknown"):
    """Decorator factory that records metrics for a database query function.

    For every call of the decorated function the elapsed time is observed
    on ``DB_QUERY_DURATION`` (whether the call returns or raises), and
    ``DB_QUERIES_TOTAL`` is incremented only when the call returns normally.
    When prometheus_client is unavailable the function is called straight
    through with no instrumentation.

    Args:
        operation: Value of the ``operation`` label on both metrics.
        table: Value of the ``table`` label on the counter.

    Returns:
        A decorator that wraps a callable and preserves its metadata.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if not PROMETHEUS_AVAILABLE:
                return func(*args, **kwargs)
            # Monotonic clock: wall-clock adjustments must not produce
            # negative or inflated durations.
            start = time.perf_counter()
            try:
                result = func(*args, **kwargs)
                DB_QUERIES_TOTAL.labels(operation=operation, table=table).inc()
                return result
            finally:
                # Runs on success and on exception alike.
                DB_QUERY_DURATION.labels(operation=operation).observe(
                    time.perf_counter() - start
                )

        return wrapper

    return decorator
def update_ml_accuracy(top1_accuracy: float, top3_accuracy: float):
    """Publish the latest top-1 and top-3 accuracy values (call from scheduler).

    Writes ``top1_accuracy`` and ``top3_accuracy`` to the
    ``ML_PREDICTION_ACCURACY`` gauge under the ``accuracy_type`` labels
    "top1" and "top3". Does nothing when prometheus_client is unavailable.
    """
    if PROMETHEUS_AVAILABLE:
        readings = (("top1", top1_accuracy), ("top3", top3_accuracy))
        for accuracy_type, value in readings:
            ML_PREDICTION_ACCURACY.labels(accuracy_type=accuracy_type).set(value)
def update_subscription_count(plan_counts: dict):
    """Set the active-subscription gauge for each plan in ``plan_counts``.

    Each key of ``plan_counts`` is used as the ``plan`` label of
    ``ACTIVE_SUBSCRIPTIONS`` and its value becomes the gauge value. Plans
    absent from the mapping are left untouched. Does nothing when
    prometheus_client is unavailable.
    """
    if PROMETHEUS_AVAILABLE:
        for plan_name, active_count in plan_counts.items():
            gauge_for_plan = ACTIVE_SUBSCRIPTIONS.labels(plan=plan_name)
            gauge_for_plan.set(active_count)