feat(devops): CI/CD + Docker + Monitoring infrastructure
- Multi-stage Dockerfile (builder+runner, <500MB target)
- docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx
- .env.example with all required secrets (never hardcoded)
- requirements.txt with all dependencies including prometheus-client, alembic
- GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push
- GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback
- Alembic migration setup + initial PostgreSQL schema (001_initial_schema)
- SQLite→PostgreSQL data migration script
- Prometheus metrics module (HTTP, ML, DB, business metrics)
- Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy)
- Grafana dashboard (overview: req/s, p95, ML accuracy, error rate)
- Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers)
- Structured JSON logging module
- Automated daily DB backup script (pg_dump + 30-day retention)

Branch: feature/devops-cicd
Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
255
metrics.py
Normal file
255
metrics.py
Normal file
@@ -0,0 +1,255 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Prometheus metrics instrumentation for Turf SaaS.
|
||||
Import this module in Flask apps to expose /metrics endpoint.
|
||||
"""
|
||||
|
||||
import time
|
||||
import functools
|
||||
import logging
|
||||
from typing import Callable, Any
|
||||
|
||||
try:
|
||||
from prometheus_client import (
|
||||
Counter,
|
||||
Histogram,
|
||||
Gauge,
|
||||
Summary,
|
||||
generate_latest,
|
||||
CONTENT_TYPE_LATEST,
|
||||
CollectorRegistry,
|
||||
multiprocess,
|
||||
REGISTRY,
|
||||
)
|
||||
|
||||
PROMETHEUS_AVAILABLE = True
|
||||
except ImportError:
|
||||
PROMETHEUS_AVAILABLE = False
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ============================================================
|
||||
# Metric definitions
|
||||
# ============================================================
|
||||
|
||||
if PROMETHEUS_AVAILABLE:
    # All collectors below are created once at import time and only when
    # prometheus_client could be imported.

    # --- HTTP traffic -------------------------------------------------
    HTTP_REQUESTS_TOTAL = Counter(
        "http_requests_total",
        documentation="Total number of HTTP requests",
        labelnames=("method", "endpoint", "status_code", "service"),
    )

    HTTP_REQUEST_DURATION = Histogram(
        "http_request_duration_seconds",
        documentation="HTTP request duration in seconds",
        labelnames=("method", "endpoint", "service"),
        buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0),
    )

    HTTP_REQUESTS_IN_PROGRESS = Gauge(
        "http_requests_in_progress",
        documentation="Number of HTTP requests currently being processed",
        labelnames=("method", "endpoint", "service"),
    )

    # --- ML predictions -----------------------------------------------
    ML_PREDICTIONS_TOTAL = Counter(
        "ml_predictions_total",
        documentation="Total ML prediction requests",
        labelnames=("model_type", "race_type"),
    )

    ML_PREDICTION_DURATION = Histogram(
        "ml_prediction_duration_seconds",
        documentation="ML prediction duration in seconds",
        labelnames=("model_type",),
        buckets=(0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0),
    )

    ML_PREDICTION_ACCURACY = Gauge(
        "ml_prediction_accuracy_ratio",
        documentation="Rolling ML prediction accuracy (top-1, top-3)",
        labelnames=("accuracy_type",),
    )

    ML_PREDICTION_DRIFT = Gauge(
        "ml_prediction_drift_score",
        documentation="Feature drift score for ML models (0=no drift, 1=full drift)",
        labelnames=("feature_group",),
    )

    # --- Database -----------------------------------------------------
    DB_QUERIES_TOTAL = Counter(
        "db_queries_total",
        documentation="Total database queries",
        labelnames=("operation", "table"),
    )

    DB_QUERY_DURATION = Histogram(
        "db_query_duration_seconds",
        documentation="Database query duration",
        labelnames=("operation",),
        buckets=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0),
    )

    DB_CONNECTION_POOL_SIZE = Gauge(
        "db_connection_pool_size",
        documentation="Current database connection pool size",
    )

    # --- Business -----------------------------------------------------
    RACES_SCRAPED_TOTAL = Counter(
        "races_scraped_total",
        documentation="Total number of races scraped",
        labelnames=("source", "discipline"),
    )

    # NOTE(review): a "date" label presumably yields a new time series per
    # day; confirm the retention/cardinality impact is acceptable.
    PREDICTIONS_ACCURACY_DAILY = Gauge(
        "predictions_accuracy_daily_ratio",
        documentation="Daily prediction accuracy ratio",
        labelnames=("date", "race_type"),
    )

    # NOTE(review): this is a gauge whose name ends in "_total", a suffix
    # Prometheus naming conventions reserve for counters. Left unchanged
    # because dashboards/alerts may already reference the name.
    ACTIVE_SUBSCRIPTIONS = Gauge(
        "active_subscriptions_total",
        documentation="Number of active SaaS subscriptions",
        labelnames=("plan",),
    )

    # --- App health ---------------------------------------------------
    APP_INFO = Gauge(
        "app_info",
        documentation="Application build information",
        labelnames=("version", "service", "env"),
    )
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Flask integration
|
||||
# ============================================================
|
||||
|
||||
|
||||
def init_metrics(app, service_name: str = "unknown"):
    """
    Register Prometheus metrics middleware on a Flask app.

    Sets the ``app_info`` gauge, installs request hooks that feed the HTTP
    request counter, latency histogram and in-progress gauge, and adds two
    routes: ``/metrics`` (Prometheus scrape) and ``/health`` (JSON health
    check). Logs a warning and does nothing when prometheus_client is not
    installed.

    The ``endpoint`` label is the matched URL rule (for example
    ``/races/<int:race_id>``) rather than the raw request path, so
    parameterised or unknown URLs cannot create an unbounded number of time
    series. Requests that match no rule are reported as ``<unmatched>``.

    Args:
        app: Flask application to instrument.
        service_name: Value of the ``service`` label on every HTTP metric.

    Usage:
        from metrics import init_metrics
        init_metrics(app, service_name="combined-api")
    """
    if not PROMETHEUS_AVAILABLE:
        logger.warning("prometheus_client not installed — metrics disabled")
        return

    # Imported lazily so the module can be imported without Flask installed.
    from flask import Response, jsonify, request

    # Set app info gauge
    APP_INFO.labels(
        version=app.config.get("VERSION", "unknown"),
        service=service_name,
        env=app.config.get("ENV", "unknown"),
    ).set(1)

    def _endpoint_label() -> str:
        """Return a bounded-cardinality endpoint label for the current request."""
        rule = request.url_rule
        return rule.rule if rule is not None else "<unmatched>"

    @app.before_request
    def before_request():
        # Wall-clock stamp kept under its original attribute name in case
        # other code reads it; the metrics themselves use the monotonic
        # clock stored below.
        request._start_time = time.time()
        in_progress = HTTP_REQUESTS_IN_PROGRESS.labels(
            method=request.method, endpoint=_endpoint_label(), service=service_name
        )
        in_progress.inc()
        # Remember the exact gauge child that was incremented so that the
        # matching decrement can never hit a different label set.
        request._metrics_state = (time.perf_counter(), in_progress)

    @app.after_request
    def after_request(response):
        state = getattr(request, "_metrics_state", None)
        # No state means before_request above never ran for this request
        # (e.g. an earlier hook returned a response); report zero duration.
        duration = time.perf_counter() - state[0] if state is not None else 0.0
        endpoint = _endpoint_label()

        HTTP_REQUESTS_TOTAL.labels(
            method=request.method,
            endpoint=endpoint,
            status_code=str(response.status_code),
            service=service_name,
        ).inc()

        HTTP_REQUEST_DURATION.labels(
            method=request.method, endpoint=endpoint, service=service_name
        ).observe(duration)

        return response

    @app.teardown_request
    def teardown_request(exc):
        # Teardown runs even when the response hooks are skipped, so the
        # gauge is released exactly once per increment and never goes
        # negative for requests that were not counted in the first place.
        state = getattr(request, "_metrics_state", None)
        if state is not None:
            request._metrics_state = None
            state[1].dec()

    @app.route("/metrics")
    def metrics_endpoint():
        """Prometheus metrics scrape endpoint."""
        return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)

    @app.route("/health")
    def health_endpoint():
        """Docker / load-balancer health check endpoint."""
        return jsonify({"status": "ok", "service": service_name})

    logger.info("Prometheus metrics initialized for service: %s", service_name)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Decorator helpers
|
||||
# ============================================================
|
||||
|
||||
|
||||
def track_ml_prediction(model_type: str = "xgboost", race_type: str = "flat"):
    """Decorator to track ML prediction calls.

    The wrapped function's duration is observed in the ML prediction
    histogram on every call (including calls that raise); the prediction
    counter is incremented only when the call returns normally. When
    prometheus_client is unavailable the function is called untouched.

    Args:
        model_type: Value of the ``model_type`` label on both metrics.
        race_type: Value of the ``race_type`` label on the counter.

    Returns:
        A decorator that wraps a callable and preserves its metadata.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if not PROMETHEUS_AVAILABLE:
                return func(*args, **kwargs)
            # Monotonic clock: wall-clock time can jump (NTP, manual
            # adjustment) and would yield wrong or negative durations.
            started = time.perf_counter()
            try:
                result = func(*args, **kwargs)
                ML_PREDICTIONS_TOTAL.labels(
                    model_type=model_type, race_type=race_type
                ).inc()
                return result
            finally:
                ML_PREDICTION_DURATION.labels(model_type=model_type).observe(
                    time.perf_counter() - started
                )

        return wrapper

    return decorator
|
||||
|
||||
|
||||
def track_db_query(operation: str = "select", table: str = "unknown"):
    """Decorator to track DB query calls.

    The wrapped function's duration is observed in the DB query histogram
    on every call (including calls that raise); the query counter is
    incremented only when the call returns normally. When prometheus_client
    is unavailable the function is called untouched.

    Args:
        operation: Value of the ``operation`` label on both metrics.
        table: Value of the ``table`` label on the counter.

    Returns:
        A decorator that wraps a callable and preserves its metadata.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if not PROMETHEUS_AVAILABLE:
                return func(*args, **kwargs)
            # Monotonic clock: wall-clock time can jump (NTP, manual
            # adjustment) and would yield wrong or negative durations.
            started = time.perf_counter()
            try:
                result = func(*args, **kwargs)
                DB_QUERIES_TOTAL.labels(operation=operation, table=table).inc()
                return result
            finally:
                DB_QUERY_DURATION.labels(operation=operation).observe(
                    time.perf_counter() - started
                )

        return wrapper

    return decorator
|
||||
|
||||
|
||||
def update_ml_accuracy(top1_accuracy: float, top3_accuracy: float):
    """Publish the latest top-1 and top-3 accuracy values to the accuracy gauge.

    Intended to be called from a scheduler. Does nothing when
    prometheus_client is unavailable.
    """
    if PROMETHEUS_AVAILABLE:
        readings = (("top1", top1_accuracy), ("top3", top3_accuracy))
        for accuracy_kind, value in readings:
            ML_PREDICTION_ACCURACY.labels(accuracy_type=accuracy_kind).set(value)
|
||||
|
||||
|
||||
def update_subscription_count(plan_counts: dict):
    """Set the active-subscriptions gauge for each plan in ``plan_counts``.

    ``plan_counts`` maps a plan name to its current subscription count.
    Does nothing when prometheus_client is unavailable.
    """
    if not PROMETHEUS_AVAILABLE:
        return
    gauge_for = ACTIVE_SUBSCRIPTIONS.labels
    for plan_name, active in plan_counts.items():
        gauge_for(plan=plan_name).set(active)
|
||||
Reference in New Issue
Block a user