- Multi-stage Dockerfile (builder+runner, <500MB target) - docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx - .env.example with all required secrets (never hardcoded) - requirements.txt with all dependencies including prometheus-client, alembic - GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push - GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback - Alembic migration setup + initial PostgreSQL schema (001_initial_schema) - SQLite→PostgreSQL data migration script - Prometheus metrics module (HTTP, ML, DB, business metrics) - Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy) - Grafana dashboard (overview: req/s, p95, ML accuracy, error rate) - Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers) - Structured JSON logging module - Automated daily DB backup script (pg_dump + 30-day retention) Branch: feature/devops-cicd Co-Authored-By: Paperclip <noreply@paperclip.ing>
256 lines
7.3 KiB
Python
256 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Prometheus metrics instrumentation for Turf SaaS.
|
|
Import this module in a Flask app and call init_metrics(app) to expose the /metrics endpoint.
|
|
"""
|
|
|
|
import time
|
|
import functools
|
|
import logging
|
|
from typing import Callable, Any
|
|
|
|
try:
|
|
from prometheus_client import (
|
|
Counter,
|
|
Histogram,
|
|
Gauge,
|
|
Summary,
|
|
generate_latest,
|
|
CONTENT_TYPE_LATEST,
|
|
CollectorRegistry,
|
|
multiprocess,
|
|
REGISTRY,
|
|
)
|
|
|
|
PROMETHEUS_AVAILABLE = True
|
|
except ImportError:
|
|
PROMETHEUS_AVAILABLE = False
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ============================================================
|
|
# Metric definitions
|
|
# ============================================================
|
|
|
|
if PROMETHEUS_AVAILABLE:
    # All collectors below are created once at import time and attach to
    # prometheus_client's default registry.

    # ---- HTTP metrics ----
    HTTP_REQUESTS_TOTAL = Counter(
        "http_requests_total",
        "Total number of HTTP requests",
        labelnames=("method", "endpoint", "status_code", "service"),
    )
    HTTP_REQUEST_DURATION = Histogram(
        "http_request_duration_seconds",
        "HTTP request duration in seconds",
        labelnames=("method", "endpoint", "service"),
        buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0),
    )
    HTTP_REQUESTS_IN_PROGRESS = Gauge(
        "http_requests_in_progress",
        "Number of HTTP requests currently being processed",
        labelnames=("method", "endpoint", "service"),
    )

    # ---- ML prediction metrics ----
    ML_PREDICTIONS_TOTAL = Counter(
        "ml_predictions_total",
        "Total ML prediction requests",
        labelnames=("model_type", "race_type"),
    )
    ML_PREDICTION_DURATION = Histogram(
        "ml_prediction_duration_seconds",
        "ML prediction duration in seconds",
        labelnames=("model_type",),
        buckets=(0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0),
    )
    ML_PREDICTION_ACCURACY = Gauge(
        "ml_prediction_accuracy_ratio",
        "Rolling ML prediction accuracy (top-1, top-3)",
        labelnames=("accuracy_type",),
    )
    ML_PREDICTION_DRIFT = Gauge(
        "ml_prediction_drift_score",
        "Feature drift score for ML models (0=no drift, 1=full drift)",
        labelnames=("feature_group",),
    )

    # ---- Database metrics ----
    DB_QUERIES_TOTAL = Counter(
        "db_queries_total",
        "Total database queries",
        labelnames=("operation", "table"),
    )
    DB_QUERY_DURATION = Histogram(
        "db_query_duration_seconds",
        "Database query duration",
        labelnames=("operation",),
        buckets=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0),
    )
    # Unlabelled gauge: a single value for the whole process.
    DB_CONNECTION_POOL_SIZE = Gauge(
        "db_connection_pool_size",
        "Current database connection pool size",
    )

    # ---- Business metrics ----
    RACES_SCRAPED_TOTAL = Counter(
        "races_scraped_total",
        "Total number of races scraped",
        labelnames=("source", "discipline"),
    )
    PREDICTIONS_ACCURACY_DAILY = Gauge(
        "predictions_accuracy_daily_ratio",
        "Daily prediction accuracy ratio",
        labelnames=("date", "race_type"),
    )
    ACTIVE_SUBSCRIPTIONS = Gauge(
        "active_subscriptions_total",
        "Number of active SaaS subscriptions",
        labelnames=("plan",),
    )

    # ---- App health ----
    # Build metadata is carried in the labels of this gauge.
    APP_INFO = Gauge(
        "app_info",
        "Application build information",
        labelnames=("version", "service", "env"),
    )
|
|
|
|
|
|
# ============================================================
|
|
# Flask integration
|
|
# ============================================================
|
|
|
|
|
|
def init_metrics(app, service_name: str = "unknown"):
    """
    Register Prometheus metrics middleware on a Flask app.

    Installs request hooks that feed the HTTP counter, duration histogram and
    in-progress gauge, and registers two routes on ``app``: ``/metrics``
    (Prometheus scrape endpoint) and ``/health`` (JSON health check).  When
    ``prometheus_client`` could not be imported, a warning is logged and
    nothing is registered (including ``/health``).

    The ``endpoint`` label is the matched route rule (for example
    ``/races/<int:race_id>``) rather than the raw request path, and requests
    that match no route share a single ``unmatched`` label.  This keeps the
    number of label combinations bounded.

    Args:
        app: Flask application to instrument.
        service_name: Value used for the ``service`` label on every metric
            and echoed by ``/health``.

    Returns:
        None.

    Usage:
        from metrics import init_metrics
        init_metrics(app, service_name="combined-api")
    """
    if not PROMETHEUS_AVAILABLE:
        logger.warning("prometheus_client not installed — metrics disabled")
        return

    # Imported here so importing this module does not itself require Flask.
    from flask import Response, jsonify, request

    # Set app info gauge
    APP_INFO.labels(
        version=app.config.get("VERSION", "unknown"),
        service=service_name,
        env=app.config.get("ENV", "unknown"),
    ).set(1)

    def _endpoint_label() -> str:
        """Return a bounded-cardinality endpoint label for the current request."""
        rule = request.url_rule
        # url_rule is None when no route matched (e.g. 404); collapse those
        # into one label so arbitrary probed paths do not create new series.
        return rule.rule if rule is not None else "unmatched"

    def _leave_in_progress() -> None:
        """Decrement the in-progress gauge at most once per tracked request."""
        labels = getattr(request, "_metrics_in_progress", None)
        if labels is None:
            # Either before_request never ran for this request, or the gauge
            # was already decremented; decrementing again would skew it.
            return
        request._metrics_in_progress = None
        HTTP_REQUESTS_IN_PROGRESS.labels(**labels).dec()

    @app.before_request
    def before_request():
        # Wall-clock stamp kept under its original attribute name in case
        # other code reads it (cannot tell from this file); the duration
        # itself is measured with a monotonic clock below.
        request._start_time = time.time()
        request._metrics_started = time.perf_counter()

        labels = {
            "method": request.method,
            "endpoint": _endpoint_label(),
            "service": service_name,
        }
        HTTP_REQUESTS_IN_PROGRESS.labels(**labels).inc()
        # Remember exactly which series was incremented so the matching
        # decrement targets the same one.
        request._metrics_in_progress = labels

    @app.after_request
    def after_request(response):
        started = getattr(request, "_metrics_started", None)
        # If before_request did not run there is nothing to measure; record
        # a zero duration, as the original fallback effectively did.
        duration = time.perf_counter() - started if started is not None else 0.0
        endpoint = _endpoint_label()

        HTTP_REQUESTS_TOTAL.labels(
            method=request.method,
            endpoint=endpoint,
            status_code=str(response.status_code),
            service=service_name,
        ).inc()

        HTTP_REQUEST_DURATION.labels(
            method=request.method, endpoint=endpoint, service=service_name
        ).observe(duration)

        _leave_in_progress()

        return response

    @app.teardown_request
    def teardown_request(exc):
        # Safety net: after_request may be skipped when an exception
        # propagates, which would otherwise leave the gauge incremented.
        _leave_in_progress()

    @app.route("/metrics")
    def metrics_endpoint():
        """Prometheus metrics scrape endpoint."""
        return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)

    @app.route("/health")
    def health_endpoint():
        """Docker / load-balancer health check endpoint."""
        return jsonify({"status": "ok", "service": service_name})

    logger.info("Prometheus metrics initialized for service: %s", service_name)
|
|
|
|
|
|
# ============================================================
|
|
# Decorator helpers
|
|
# ============================================================
|
|
|
|
|
|
def track_ml_prediction(model_type: str = "xgboost", race_type: str = "flat"):
    """Build a decorator that records ML prediction metrics for a function.

    Every call to the decorated function is timed, and the elapsed seconds
    are observed on ``ML_PREDICTION_DURATION`` whether the call returns or
    raises.  ``ML_PREDICTIONS_TOTAL`` is incremented only when the call
    returns normally.  When ``prometheus_client`` is unavailable the function
    is invoked with no instrumentation.

    Args:
        model_type: Value for the ``model_type`` label on both metrics.
        race_type: Value for the ``race_type`` label on the counter.

    Returns:
        A decorator that wraps a callable and preserves its metadata.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if not PROMETHEUS_AVAILABLE:
                return func(*args, **kwargs)

            # perf_counter is monotonic, so a system clock adjustment during
            # the call cannot produce a negative or inflated duration.
            start = time.perf_counter()
            try:
                result = func(*args, **kwargs)
                ML_PREDICTIONS_TOTAL.labels(
                    model_type=model_type, race_type=race_type
                ).inc()
                return result
            finally:
                # Runs on success and on exception alike.
                ML_PREDICTION_DURATION.labels(model_type=model_type).observe(
                    time.perf_counter() - start
                )

        return wrapper

    return decorator
|
|
|
|
|
|
def track_db_query(operation: str = "select", table: str = "unknown"):
    """Build a decorator that records database query metrics for a function.

    Every call to the decorated function is timed, and the elapsed seconds
    are observed on ``DB_QUERY_DURATION`` whether the call returns or raises.
    ``DB_QUERIES_TOTAL`` is incremented only when the call returns normally.
    When ``prometheus_client`` is unavailable the function is invoked with no
    instrumentation.

    Args:
        operation: Value for the ``operation`` label on both metrics.
        table: Value for the ``table`` label on the counter.

    Returns:
        A decorator that wraps a callable and preserves its metadata.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if not PROMETHEUS_AVAILABLE:
                return func(*args, **kwargs)

            # perf_counter is monotonic, so a system clock adjustment during
            # the call cannot produce a negative or inflated duration.
            start = time.perf_counter()
            try:
                result = func(*args, **kwargs)
                DB_QUERIES_TOTAL.labels(operation=operation, table=table).inc()
                return result
            finally:
                # Runs on success and on exception alike.
                DB_QUERY_DURATION.labels(operation=operation).observe(
                    time.perf_counter() - start
                )

        return wrapper

    return decorator
|
|
|
|
|
|
def update_ml_accuracy(top1_accuracy: float, top3_accuracy: float):
    """Publish the latest top-1 and top-3 accuracy values (call from scheduler).

    Sets the ``top1`` and ``top3`` series of ``ML_PREDICTION_ACCURACY``, in
    that order.  Does nothing when ``prometheus_client`` is unavailable.
    """
    if PROMETHEUS_AVAILABLE:
        readings = (("top1", top1_accuracy), ("top3", top3_accuracy))
        for accuracy_kind, value in readings:
            ML_PREDICTION_ACCURACY.labels(accuracy_type=accuracy_kind).set(value)
|
|
|
|
|
|
def update_subscription_count(plan_counts: dict):
    """Set the active-subscription gauge for each plan in ``plan_counts``.

    Each key becomes the ``plan`` label and its value the gauge reading.
    Does nothing when ``prometheus_client`` is unavailable.
    """
    if PROMETHEUS_AVAILABLE:
        for plan_name, active_count in plan_counts.items():
            plan_gauge = ACTIVE_SUBSCRIPTIONS.labels(plan=plan_name)
            plan_gauge.set(active_count)
|