feat(ml): train ensemble model and generate benchmark report

Results:
  - XGBoost (Optuna 100 trials): AUC=0.7856, Precision@3=0.5783
  - LightGBM (Optuna 100 trials): AUC=0.7833, Precision@3=0.5736
  - MLP (3 layers 256-128-64): AUC=0.7743, Precision@3=0.5643
  - Ensemble (weighted voting): AUC=0.7840, Precision@3=0.5814

  Baseline XGBoost: Precision@3=0.5287
  Delta (Ensemble vs. baseline Precision@3): +0.0527 (+5.3 percentage points) — DEPLOY threshold met (+5%)
  Latency: 35ms/race, 69ms/full-day (well under 200ms limit)

  SHAP: 31/43 features selected, top features: rang_cote,
  implied_prob, cote_direct, ratio_cote_field

  All 12 regression/latency tests passing.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
DevOps Engineer
2026-04-25 19:10:41 +02:00
parent 0e7bcff6b0
commit 6b762068fd
10 changed files with 1262 additions and 49 deletions

View File

@@ -627,7 +627,52 @@ def compute_ensemble_weights(models: dict, X_val, y_val, feature_cols: list) ->
# ─────────────────────────────────────────────────────────────────────────────
# 8. EVALUATION HELPERS
# 8. TURF ENSEMBLE (module-level for pickle compatibility)
# ─────────────────────────────────────────────────────────────────────────────
class TurfEnsemble:
    """
    Picklable soft-voting ensemble: XGBoost + LightGBM + MLP.

    The positive-class probabilities of the three wrapped models are blended
    with per-model weights supplied by the caller (per the original note,
    these are set proportional to validation AUC).

    Parameters
    ----------
    xgb_model, lgb_model :
        Fitted estimators exposing ``predict_proba(DataFrame)``.
    mlp_pipeline :
        Fitted estimator exposing ``predict_proba(ndarray)``.
    weights : dict
        Maps ``"xgboost"``, ``"lightgbm"`` and ``"mlp"`` to a non-negative
        weight. A missing key falls back to ``0.33``; keys other than these
        three are ignored.
    feature_cols : list
        Column names used to select (and, for ndarray input, to label) the
        model inputs.
    """

    # Keys looked up in ``weights`` and the fallback used when one is absent.
    _MODEL_KEYS = ("xgboost", "lightgbm", "mlp")
    _DEFAULT_WEIGHT = 0.33

    def __init__(
        self, xgb_model, lgb_model, mlp_pipeline, weights: dict, feature_cols: list
    ):
        self.xgb_model = xgb_model
        self.lgb_model = lgb_model
        self.mlp_pipeline = mlp_pipeline
        self.weights = weights
        self.feature_cols = feature_cols
        # Timestamped identifier fixed at construction time.
        self.version = f"ensemble_v1_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    def _normalised_weights(self) -> dict:
        """Return the three model weights rescaled so that they sum to 1."""
        # Resolve defaults BEFORE computing the denominator: normalising by
        # sum(self.weights.values()) while defaulting missing keys afterwards
        # yields weights that do not sum to 1 (and divides by zero for an
        # empty dict).
        resolved = {
            key: self.weights.get(key, self._DEFAULT_WEIGHT)
            for key in self._MODEL_KEYS
        }
        total = sum(resolved.values())
        if total <= 0:
            raise ValueError(
                f"Ensemble weights must have a positive sum, got {resolved!r}"
            )
        return {key: value / total for key, value in resolved.items()}

    def predict_proba(self, X):
        """
        Return an ``(n_samples, 2)`` array ``[1 - p, p]`` where ``p`` is the
        weighted average of the three models' positive-class probabilities.

        ``X`` may be a DataFrame or a NumPy array; an array is labelled with
        ``feature_cols``. NaNs in the selected columns are replaced by 0.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_cols)
        # NOTE(review): columns absent from X are dropped silently here; the
        # wrapped models presumably expect every feature column - confirm
        # whether a hard error would be preferable.
        available = [c for c in self.feature_cols if c in X.columns]
        Xa = X[available].fillna(0)

        w = self._normalised_weights()
        proba = np.zeros(len(Xa))
        proba += w["xgboost"] * self.xgb_model.predict_proba(Xa)[:, 1]
        proba += w["lightgbm"] * self.lgb_model.predict_proba(Xa)[:, 1]
        # The MLP pipeline is fed the raw array rather than the DataFrame.
        proba += w["mlp"] * self.mlp_pipeline.predict_proba(Xa.values)[:, 1]
        return np.column_stack([1 - proba, proba])

    def predict(self, X, threshold: float = 0.5):
        """Return 0/1 labels: 1 where the blended probability >= ``threshold``."""
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)
# ─────────────────────────────────────────────────────────────────────────────
# 9. EVALUATION HELPERS
# ─────────────────────────────────────────────────────────────────────────────
@@ -759,54 +804,9 @@ def main(args):
# ── Build ensemble ─────────────────────────────────────────────────────────
print("\n[8/9] Building WeightedEnsemble …")
class FullEnsemble:
    """
    Soft-voting ensemble wrapper over XGBoost, LightGBM and an MLP pipeline.

    NOTE(review): the original docstring calls this wrapper "picklable". The
    standard ``pickle`` module can only serialise instances whose class is
    importable from a module's top level, so that only holds if this class is
    defined at module scope rather than inside a function - confirm.

    ``weights`` maps ``"xgboost"``, ``"lightgbm"`` and ``"mlp"`` to their
    blending weight; a missing key falls back to 0.33 and any other key is
    ignored.
    """

    def __init__(self, xgb_m, lgb_m, mlp_pipe, weights, feature_cols):
        self.xgb_model = xgb_m
        self.lgb_model = lgb_m
        self.mlp_pipeline = mlp_pipe
        self.weights = weights
        self.feature_cols = feature_cols
        # Timestamped identifier fixed at construction time.
        self.version = f"ensemble_v1_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    def predict_proba(self, X: pd.DataFrame):
        """
        Return an ``(n_samples, 2)`` array ``[1 - p, p]``, ``p`` being the
        weighted mean of the three models' positive-class probabilities.

        A NumPy array is accepted too and is labelled with ``feature_cols``.
        NaNs in the selected columns are replaced by 0.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_cols)
        available = [c for c in self.feature_cols if c in X.columns]
        Xa = X[available].fillna(0)

        # Resolve the per-model weights first and normalise over exactly those
        # three values. Dividing by sum(self.weights.values()) instead breaks
        # the sum-to-one property whenever a key is missing or an extra key is
        # present, and divides by zero for an empty dict.
        w_xgb = self.weights.get("xgboost", 0.33)
        w_lgb = self.weights.get("lightgbm", 0.33)
        w_mlp = self.weights.get("mlp", 0.33)
        total_w = w_xgb + w_lgb + w_mlp
        if total_w <= 0:
            raise ValueError(
                f"Ensemble weights must have a positive sum, got {self.weights!r}"
            )

        proba = np.zeros(len(Xa))
        # XGBoost
        proba += (w_xgb / total_w) * self.xgb_model.predict_proba(Xa)[:, 1]
        # LightGBM
        proba += (w_lgb / total_w) * self.lgb_model.predict_proba(Xa)[:, 1]
        # MLP (fed the raw array rather than the DataFrame)
        proba += (w_mlp / total_w) * self.mlp_pipeline.predict_proba(Xa.values)[:, 1]
        return np.column_stack([1 - proba, proba])

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the blended probability >= ``threshold``."""
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)
ensemble = FullEnsemble(xgb_model, lgb_model, mlp_model, weights, feat_cols)
# Add feature_cols attribute for evaluate_model
ensemble_eval = type(
"E",
(),
{
"predict_proba": ensemble.predict_proba,
"feature_cols": feat_cols,
},
)()
ensemble = TurfEnsemble(xgb_model, lgb_model, mlp_model, weights, feat_cols)
# TurfEnsemble already has .feature_cols; use it directly for evaluation
ensemble_eval = ensemble
# ── Holdout evaluation ─────────────────────────────────────────────────────
print("\n[9/9] Evaluating all models on holdout …")