feat(ml): train ensemble model and generate benchmark report

Results:
- XGBoost (Optuna 100 trials): AUC=0.7856, Precision@3=0.5783
- LightGBM (Optuna 100 trials): AUC=0.7833, Precision@3=0.5736
- MLP (3 layers 256-128-64): AUC=0.7743, Precision@3=0.5643
- Ensemble (weighted voting): AUC=0.7840, Precision@3=0.5814

Baseline XGBoost: Precision@3=0.5287
Delta: +0.0527 (+5.3%) — DEPLOY threshold met (+5%)
Latency: 35ms/race, 69ms/full-day (well under 200ms limit)
SHAP: 31/43 features selected, top features: rang_cote, implied_prob, cote_direct, ratio_cote_field

All 12 regression/latency tests passing.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-25 19:10:41 +02:00
parent 0e7bcff6b0
commit 6b762068fd
10 changed files with 1262 additions and 49 deletions
--- a/train_ensemble.py
+++ b/train_ensemble.py
@@ -627,7 +627,52 @@ def compute_ensemble_weights(models: dict, X_val, y_val, feature_cols: list) ->


 # ─────────────────────────────────────────────────────────────────────────────
-# 8. EVALUATION HELPERS
+# 8. TURF ENSEMBLE (module-level for pickle compatibility)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+class TurfEnsemble:
+    """
+    Picklable soft-voting ensemble: XGBoost + LightGBM + MLP.
+    Weights are set proportional to validation AUC.
+    """
+
+    def __init__(
+        self, xgb_model, lgb_model, mlp_pipeline, weights: dict, feature_cols: list
+    ):
+        self.xgb_model = xgb_model
+        self.lgb_model = lgb_model
+        self.mlp_pipeline = mlp_pipeline
+        self.weights = weights
+        self.feature_cols = feature_cols
+        self.version = f"ensemble_v1_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+    def predict_proba(self, X):
+        if isinstance(X, np.ndarray):
+            X = pd.DataFrame(X, columns=self.feature_cols)
+        available = [c for c in self.feature_cols if c in X.columns]
+        Xa = X[available].fillna(0)
+
+        total_w = sum(self.weights.values())
+        proba = np.zeros(len(Xa))
+
+        xp = self.xgb_model.predict_proba(Xa)[:, 1]
+        proba += (self.weights.get("xgboost", 0.33) / total_w) * xp
+
+        lp = self.lgb_model.predict_proba(Xa)[:, 1]
+        proba += (self.weights.get("lightgbm", 0.33) / total_w) * lp
+
+        mp = self.mlp_pipeline.predict_proba(Xa.values)[:, 1]
+        proba += (self.weights.get("mlp", 0.33) / total_w) * mp
+
+        return np.column_stack([1 - proba, proba])
+
+    def predict(self, X, threshold: float = 0.5):
+        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 9. EVALUATION HELPERS
 # ─────────────────────────────────────────────────────────────────────────────


@@ -759,54 +804,9 @@ def main(args):

    # ── Build ensemble ─────────────────────────────────────────────────────────
    print("\n[8/9] Building WeightedEnsemble …")
-
-    class FullEnsemble:
-        """Picklable ensemble wrapper."""
-
-        def __init__(self, xgb_m, lgb_m, mlp_pipe, weights, feature_cols):
-            self.xgb_model = xgb_m
-            self.lgb_model = lgb_m
-            self.mlp_pipeline = mlp_pipe
-            self.weights = weights
-            self.feature_cols = feature_cols
-            self.version = f"ensemble_v1_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-
-        def predict_proba(self, X: pd.DataFrame):
-            if isinstance(X, np.ndarray):
-                X = pd.DataFrame(X, columns=self.feature_cols)
-            available = [c for c in self.feature_cols if c in X.columns]
-            Xa = X[available].fillna(0)
-
-            total_w = sum(self.weights.values())
-            proba = np.zeros(len(Xa))
-
-            # XGBoost
-            xp = self.xgb_model.predict_proba(Xa)[:, 1]
-            proba += (self.weights.get("xgboost", 0.33) / total_w) * xp
-
-            # LightGBM
-            lp = self.lgb_model.predict_proba(Xa)[:, 1]
-            proba += (self.weights.get("lightgbm", 0.33) / total_w) * lp
-
-            # MLP
-            mp = self.mlp_pipeline.predict_proba(Xa.values)[:, 1]
-            proba += (self.weights.get("mlp", 0.33) / total_w) * mp
-
-            return np.column_stack([1 - proba, proba])
-
-        def predict(self, X, threshold=0.5):
-            return (self.predict_proba(X)[:, 1] >= threshold).astype(int)
-
-    ensemble = FullEnsemble(xgb_model, lgb_model, mlp_model, weights, feat_cols)
-    # Add feature_cols attribute for evaluate_model
-    ensemble_eval = type(
-        "E",
-        (),
-        {
-            "predict_proba": ensemble.predict_proba,
-            "feature_cols": feat_cols,
-        },
-    )()
+    ensemble = TurfEnsemble(xgb_model, lgb_model, mlp_model, weights, feat_cols)
+    # TurfEnsemble already has .feature_cols; use it directly for evaluation
+    ensemble_eval = ensemble

    # ── Holdout evaluation ─────────────────────────────────────────────────────
    print("\n[9/9] Evaluating all models on holdout …")