- train_ensemble.py: full training pipeline with 100-trial Optuna studies for XGBoost and LightGBM, MLP (256-128-64), SHAP feature selection, weighted soft-voting ensemble, benchmark report generation
- predict_v2.py: production prediction module with model cache invalidation
- combined_api.py: add /api/v1/predictions, /api/v1/model/status, /api/v1/model/invalidate-cache endpoints using ensemble model
- tests/test_ml_ensemble.py: regression, latency and API tests

Baseline XGBoost Precision@3: 0.5287 (holdout 20% temporal)
Deploy threshold: +5% = 0.5551

Co-Authored-By: Paperclip <noreply@paperclip.ing>
1008 lines
40 KiB
Python
1008 lines
40 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Ensemble ML Training for Turf Predictions — Sprint 6-7
|
||
XGBoost + LightGBM + MLP with Optuna hyperparameter optimization.
|
||
|
||
Deliverables:
|
||
- Ensemble model (voting) serialized to models/ensemble_top3.pkl
|
||
- Benchmark report: baseline XGBoost vs optimized ensemble
|
||
- Precision TOP3 must improve by +5% minimum to deploy
|
||
|
||
Usage:
|
||
python train_ensemble.py [--trials 100] [--db /path/to/turf.db] [--quick]
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import os
|
||
import pickle
|
||
import re
|
||
import sqlite3
|
||
import time
|
||
import warnings
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
import numpy as np
|
||
import optuna
|
||
import pandas as pd
|
||
import shap
|
||
from sklearn.metrics import (
|
||
accuracy_score,
|
||
classification_report,
|
||
precision_score,
|
||
recall_score,
|
||
roc_auc_score,
|
||
)
|
||
from sklearn.model_selection import StratifiedKFold
|
||
from sklearn.neural_network import MLPClassifier
|
||
from sklearn.pipeline import Pipeline
|
||
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
||
|
||
import lightgbm as lgb
|
||
import xgboost as xgb
|
||
|
||
# Silence third-party warnings and keep Optuna at WARNING level so the
# pipeline's own progress output stays readable.
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

# ─── Paths ──────────────────────────────────────────────────────────────────
# The SQLite database location can be overridden with the DB_PATH env var.
DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_saas/turf.db")
MODELS_DIR = Path("/home/h3r7/turf_saas/models")
# parents=True: importing this module on a fresh machine must not fail just
# because the parent directory has not been created yet.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

HOLDOUT_FRACTION = 0.20  # 20 % temporal holdout
DEPLOY_THRESHOLD = 0.05  # +5 % Precision@3 to auto-deploy
MIN_TRIALS = 100  # default number of Optuna trials per study
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 1. DATA LOADING & FEATURE ENGINEERING
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def load_data(db_path: str) -> pd.DataFrame:
    """Load runners joined with their race metadata from the SQLite database.

    Only runners with a recorded finishing position (``ordre_arrivee > 0``)
    are returned, ordered by date, meeting, race and runner number.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        One row per runner (``pmu_partants``) with the matching
        ``pmu_courses`` columns left-joined on (date, meeting, race).
    """
    query = """
        SELECT
            p.date_programme,
            p.num_reunion,
            p.num_course,
            p.num_pmu,
            p.age,
            p.sexe,
            p.musique,
            p.nombre_courses,
            p.nombre_victoires,
            p.nombre_places,
            p.nombre_places_2eme,
            p.nombre_places_3eme,
            p.gains_carriere,
            p.gains_annee_en_cours,
            p.gains_victoires,
            p.handicap_poids,
            p.oeilleres,
            p.cote_direct,
            p.cote_reference,
            p.tendance_cote,
            p.favoris,
            p.ordre_arrivee,
            p.tx_victoire,
            p.tx_place,
            p.forme_recente,
            p.tendance_forme,
            p.indicateur_inedit,
            c.distance,
            c.discipline,
            c.specialite,
            c.nb_declares_partants,
            c.montant_prix,
            c.penetrometre_intitule
        FROM pmu_partants p
        LEFT JOIN pmu_courses c
            ON p.date_programme = c.date_programme
            AND p.num_reunion = c.num_reunion
            AND p.num_course = c.num_course
        WHERE p.ordre_arrivee > 0
        ORDER BY p.date_programme, p.num_reunion, p.num_course, p.num_pmu
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Always release the connection, even when the query fails
        # (missing table, corrupt file, ...).
        conn.close()
    print(f" Loaded {len(df):,} rows from database.")
    return df
|
||
|
||
|
||
def parse_musique(musique):
    """Turn a PMU "musique" string into exactly five recent finishing positions.

    Parenthesised numeric groups such as ``(23)`` are removed first, then the
    first five digit runs are read as positions. Missing, empty or
    unparsable input, and any slot without a result, is reported as 0.
    """
    if not musique or pd.isna(musique):
        return [0, 0, 0, 0, 0]
    try:
        without_groups = re.sub(r"\(\d+\)", "", str(musique))
        tokens = re.findall(r"\d+", without_groups)[:5]
        positions = [int(tok) if tok else 0 for tok in tokens]
        # Right-pad with zeros so callers can always index slots 0..4.
        positions.extend([0] * (5 - len(positions)))
        return positions[:5]
    except Exception:
        return [0, 0, 0, 0, 0]
|
||
|
||
|
||
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* enriched with the targets and model features.

    Adds the binary targets ``top3`` / ``top1``, label-encoded categoricals,
    the recent-form columns parsed from ``musique``, career ratios,
    odds-derived features (including per-race rank and ratio to the race
    mean) and a few race-level descriptors. The input frame is not modified.
    """
    out = df.copy()
    race_keys = ["date_programme", "num_reunion", "num_course"]

    # ── Targets ──────────────────────────────────────────────────────────────
    finish = out["ordre_arrivee"]
    out["top3"] = (finish <= 3).astype(int)
    out["top1"] = (finish == 1).astype(int)

    # ── Categorical encodings ────────────────────────────────────────────────
    # Missing categories are replaced by a per-column default before encoding.
    # NOTE(review): the encoders are fitted here and then discarded.
    categorical_defaults = {
        "sexe": "U",
        "oeilleres": "SANS",
        "discipline": "UNKNOWN",
        "specialite": "UNKNOWN",
        "tendance_cote": "STABLE",
        "penetrometre_intitule": "BON",
    }
    for column, fallback in categorical_defaults.items():
        encoder = LabelEncoder()
        out[f"{column}_enc"] = encoder.fit_transform(out[column].fillna(fallback))

    # ── Musique (last 5 positions) ───────────────────────────────────────────
    recent = out["musique"].apply(parse_musique)
    for slot in range(5):
        out[f"form_{slot + 1}"] = recent.apply(lambda positions: positions[slot])

    # Weighted recent form: the most recent result carries the largest weight.
    weights = np.array([0.4, 0.25, 0.15, 0.12, 0.08])
    out["form_weighted"] = recent.apply(
        lambda positions: sum(w * v for w, v in zip(weights, positions))
    )
    out["form_avg"] = recent.apply(np.mean)
    # NOTE(review): zero-padded slots take part in min/max, so form_best is 0
    # whenever fewer than five results were parsed.
    out["form_best"] = recent.apply(min)
    out["form_worst"] = recent.apply(max)

    # ── Career ratios ────────────────────────────────────────────────────────
    # A zero race count is replaced by 1 to avoid dividing by zero.
    starts = out["nombre_courses"]
    starts_safe = starts.replace(0, 1)
    out["win_ratio"] = out["nombre_victoires"] / starts_safe
    out["place_ratio"] = out["nombre_places"] / starts_safe
    # Implied probability from the live odds (zero odds become NaN).
    out["implied_prob"] = 1.0 / out["cote_direct"].replace(0, np.nan)

    # Win / place rates scaled by log(1 + number of races).
    experience = np.log1p(starts)
    out["win_rate_adj"] = out["tx_victoire"] * experience
    out["place_rate_adj"] = out["tx_place"] * experience

    # Current-year earnings per race.
    out["earnings_per_race"] = out["gains_annee_en_cours"] / starts_safe

    # ── Odds movement: live odds versus reference odds ───────────────────────
    out["cote_diff"] = (out["cote_direct"] - out["cote_reference"]).fillna(0)
    odds_ratio = out["cote_direct"] / out["cote_reference"].replace(0, np.nan)
    out["cote_ratio"] = odds_ratio.fillna(1)

    # ── Odds relative to the rest of the field (per race) ────────────────────
    odds_by_race = out.groupby(race_keys)["cote_direct"]
    odds_rank = odds_by_race.rank(method="min", na_option="bottom")
    odds_mean = odds_by_race.transform("mean")
    out["rang_cote"] = odds_rank
    out["ratio_cote_field"] = out["cote_direct"] / odds_mean.replace(0, np.nan)

    # Field size: declared starters, else the number of rows seen for the race.
    rows_per_race = out.groupby(race_keys)["num_pmu"].transform("count")
    out["nb_partants"] = out["nb_declares_partants"].fillna(rows_per_race)

    # Distance bucket 1..5 (missing distance is treated as 1600).
    distance_bucket = pd.cut(
        out["distance"].fillna(1600),
        bins=[0, 1400, 1800, 2200, 2600, 10000],
        labels=[1, 2, 3, 4, 5],
    )
    out["distance_cat"] = distance_bucket.astype(float)

    # Age × win-rate interaction.
    out["age_win_interact"] = out["age"] * out["tx_victoire"]

    # Favourite flag as 0/1.
    out["is_favorite"] = out["favoris"].fillna(0).astype(int)

    # Carried weight, median-imputed.
    weight_median = out["handicap_poids"].median()
    out["poids"] = out["handicap_poids"].fillna(weight_median)

    # Log-scaled prize money.
    out["prize_norm"] = np.log1p(out["montant_prix"].fillna(0))

    # Form tendency indicator (missing -> 0).
    out["tendance_num"] = out["tendance_forme"].fillna(0)

    return out
|
||
|
||
|
||
# Candidate model inputs, in the column order handed to the models.
# get_features_and_target() keeps only the names present in the frame.
FEATURE_COLS = [
    # Raw runner / race columns and simple derived columns
    "age",
    "sexe_enc",
    "nombre_courses",
    "nombre_victoires",
    "nombre_places",
    "tx_victoire",
    "tx_place",
    "forme_recente",
    "tendance_num",
    "gains_annee_en_cours",
    "cote_direct",
    "cote_reference",
    "distance",
    "nb_partants",
    # Label-encoded categoricals (built in engineer_features)
    "discipline_enc",
    "specialite_enc",
    "oeilleres_enc",
    "tendance_cote_enc",
    "penetrometre_intitule_enc",
    # Recent form parsed from the "musique" string
    "form_1",
    "form_2",
    "form_3",
    "form_4",
    "form_5",
    "form_weighted",
    "form_avg",
    "form_best",
    "form_worst",
    # Career ratios and odds-derived features
    "win_ratio",
    "place_ratio",
    "implied_prob",
    "win_rate_adj",
    "place_rate_adj",
    "earnings_per_race",
    "cote_diff",
    "cote_ratio",
    "rang_cote",
    "ratio_cote_field",
    # Race-level descriptors and interactions
    "distance_cat",
    "age_win_interact",
    "is_favorite",
    "poids",
    "prize_norm",
]
|
||
|
||
|
||
def get_features_and_target(df: pd.DataFrame, target: str = "top3"):
    """Build the model matrix and label vector from an engineered frame.

    Returns:
        ``(X, y, columns)``: the zero-filled feature frame restricted to the
        FEATURE_COLS entries present in *df*, the integer target (missing
        labels count as 0), and the list of feature names actually used.
    """
    present = [name for name in FEATURE_COLS if name in df.columns]
    features = df[present].fillna(0)
    labels = df[target].fillna(0).astype(int)
    return features, labels, present
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 2. TEMPORAL TRAIN/HOLDOUT SPLIT
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def temporal_split(df: pd.DataFrame, holdout_frac: float = 0.20):
    """Split *df* chronologically into ``(train, holdout)`` without day overlap.

    Rows are ordered by ``date_programme`` with a stable sort, so the incoming
    order within a day is preserved and the split is reproducible. The cutoff
    starts at row ``int(len(df) * (1 - holdout_frac))``. If that row falls
    inside a day, the cutoff moves back to the first row of that day, so no
    day (and no race) ends up on both sides of the split. When moving back
    would leave the training set empty, the raw row cutoff is kept.

    Args:
        df: Frame with a ``date_programme`` column.
        holdout_frac: Approximate fraction of the most recent rows to hold out.

    Returns:
        ``(train, holdout)`` as independent copies.
    """
    df = df.sort_values("date_programme", kind="stable")
    n_rows = len(df)
    cutoff_idx = int(n_rows * (1 - holdout_frac))

    if 0 < cutoff_idx < n_rows:
        dates = df["date_programme"].to_numpy()
        boundary_date = dates[cutoff_idx]
        if dates[cutoff_idx - 1] == boundary_date:
            # The raw cutoff splits a day in two: send the whole day to holdout.
            first_of_day = int((dates == boundary_date).argmax())
            if first_of_day > 0:
                cutoff_idx = first_of_day

    train = df.iloc[:cutoff_idx].copy()
    holdout = df.iloc[cutoff_idx:].copy()
    print(
        f" Train: {len(train):,} rows ({train['date_programme'].min()} → {train['date_programme'].max()})"
    )
    print(
        f" Holdout: {len(holdout):,} rows ({holdout['date_programme'].min()} → {holdout['date_programme'].max()})"
    )
    return train, holdout
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 3. BASELINE XGBOOST (existing model score)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def evaluate_baseline(holdout_df: pd.DataFrame, existing_model_path: str) -> dict:
    """Score the previously deployed XGBoost model on the holdout set.

    The legacy model was trained on ``historical_data`` with different column
    names, so the equivalent ``pmu_partants`` columns are copied under the
    legacy names first; legacy features with no equivalent are filled with 0.

    If anything fails (missing file, incompatible features, ...), a warning is
    printed and a naive baseline is returned instead: the holdout positive
    rate as Precision@3 and an AUC of 0.5.

    Returns:
        Dict with ``model``, ``precision_at3`` and ``auc``.
    """
    # legacy feature name -> current column (None = no equivalent, use 0)
    legacy_to_current = {
        "nb_courses": "nombre_courses",
        "nb_victoires": "nombre_victoires",
        "nb_places": "nombre_places",
        "gains_annee": "gains_annee_en_cours",
        "cote_directe": "cote_direct",
        "avis_enc": None,
        "deferre_enc": None,
        "reduction_km": None,
        "victories_per_race": None,
        "places_per_race": None,
    }
    try:
        # NOTE(review): pickle.load executes arbitrary code; only point this
        # at model files produced by a trusted training run.
        with open(existing_model_path, "rb") as fh:
            bundle = pickle.load(fh)
        legacy_model = bundle["model_top3"]
        legacy_features = bundle["feature_cols"]

        frame = holdout_df.copy()
        for legacy_name, current_name in legacy_to_current.items():
            if current_name and current_name in frame.columns:
                frame[legacy_name] = frame[current_name]
            elif legacy_name not in frame.columns:
                frame[legacy_name] = 0

        usable = [name for name in legacy_features if name in frame.columns]
        inputs = frame[usable].fillna(0)
        labels = holdout_df["top3"].fillna(0).astype(int)
        scores = legacy_model.predict_proba(inputs)[:, 1]
        precision_at3 = compute_precision_at3(scores, labels, holdout_df)
        return {
            "model": "XGBoost (baseline)",
            "precision_at3": precision_at3,
            "auc": roc_auc_score(labels, scores),
        }
    except Exception as e:
        print(f" [WARN] Could not load baseline model: {e}")
        # Naive reference: the share of runners that actually finished top 3.
        labels = holdout_df["top3"].fillna(0).astype(int)
        positive_rate = float(labels.mean())
        return {
            "model": "XGBoost (baseline — fallback naive)",
            "precision_at3": round(positive_rate, 4),
            "auc": 0.5,
        }
|
||
|
||
|
||
def compute_precision_at3(proba, y_true, df: pd.DataFrame) -> float:
    """Average per-race Precision@3.

    For every race (date, meeting, race number) with at least three rows, the
    three highest-scored runners are selected and the fraction of them that
    truly finished in the top 3 is computed. The result is the mean of those
    fractions, or 0.0 when no race qualifies.

    Args:
        proba: Scores aligned positionally with the rows of *df*.
        y_true: Series of 0/1 labels aligned positionally with *df*.
        df: Frame providing the race identifier columns.
    """
    race_keys = ["date_programme", "num_reunion", "num_course"]
    scored = df[race_keys].copy()
    scored["proba"] = proba
    scored["actual"] = y_true.values

    per_race = [
        race.nlargest(3, "proba")["actual"].sum() / 3.0
        for _, race in scored.groupby(race_keys)
        if len(race) >= 3
    ]
    if not per_race:
        return 0.0
    return float(np.mean(per_race))
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 4. OPTUNA HYPERPARAMETER OPTIMIZATION
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def optuna_xgboost(X_train, y_train, n_trials: int = MIN_TRIALS) -> dict:
    """Search XGBoost hyperparameters with Optuna, maximising 5-fold CV AUC.

    The running mean AUC is reported to Optuna after every fold, so the
    MedianPruner configured on the study can actually stop unpromising
    trials early.

    Args:
        X_train: Feature frame (positional ``.iloc`` indexing is used).
        y_train: 0/1 target Series aligned with *X_train*.
        n_trials: Number of Optuna trials to run.

    Returns:
        The best trial's sampled parameters (tuned keys only).
    """
    print(f"\n [Optuna] XGBoost — {n_trials} trials …")
    scale_pos = float((len(y_train) - y_train.sum()) / max(y_train.sum(), 1))

    # The splitter is seeded, so every trial sees the same folds; compute
    # them once instead of once per trial.
    # NOTE(review): shuffled folds on time-ordered data let later races inform
    # earlier ones; a time-series split may be more faithful. Confirm intent.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    folds = list(cv.split(X_train, y_train))

    def objective(trial):
        params = {
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "verbosity": 0,
            "random_state": 42,
            "scale_pos_weight": scale_pos,
            "n_estimators": trial.suggest_int("n_estimators", 50, 400),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 10.0, log=True),
            "gamma": trial.suggest_float("gamma", 0, 5),
        }
        model = xgb.XGBClassifier(**params)
        scores = []
        for step, (tr_idx, val_idx) in enumerate(folds):
            X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
            # No eval_set: without early stopping it only adds per-round
            # metric evaluation and does not change the fitted model.
            model.fit(X_tr, y_tr)
            prob = model.predict_proba(X_val)[:, 1]
            scores.append(roc_auc_score(y_val, prob))

            # Give the pruner something to compare against other trials.
            trial.report(float(np.mean(scores)), step)
            if trial.should_prune():
                raise optuna.TrialPruned()
        return float(np.mean(scores))

    study = optuna.create_study(
        direction="maximize",
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=0),
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    print(f" Best AUC: {study.best_value:.4f} params: {study.best_params}")
    return study.best_params
|
||
|
||
|
||
def optuna_lightgbm(X_train, y_train, n_trials: int = MIN_TRIALS) -> dict:
    """Search LightGBM hyperparameters with Optuna, maximising 5-fold CV AUC.

    The running mean AUC is reported to Optuna after every fold, so the
    MedianPruner configured on the study can actually stop unpromising
    trials early.

    Args:
        X_train: Feature frame (positional ``.iloc`` indexing is used).
        y_train: 0/1 target Series aligned with *X_train*.
        n_trials: Number of Optuna trials to run.

    Returns:
        The best trial's sampled parameters (tuned keys only).
    """
    print(f"\n [Optuna] LightGBM — {n_trials} trials …")
    scale_pos = float((len(y_train) - y_train.sum()) / max(y_train.sum(), 1))

    # The splitter is seeded, so every trial sees the same folds; compute
    # them once instead of once per trial.
    # NOTE(review): shuffled folds on time-ordered data let later races inform
    # earlier ones; a time-series split may be more faithful. Confirm intent.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    folds = list(cv.split(X_train, y_train))

    def objective(trial):
        params = {
            "objective": "binary",
            "metric": "auc",
            "verbose": -1,
            "random_state": 42,
            "is_unbalance": False,
            "scale_pos_weight": scale_pos,
            "n_estimators": trial.suggest_int("n_estimators", 50, 400),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 150),
            # NOTE(review): LightGBM appears to ignore `subsample` unless
            # `subsample_freq` > 0 — confirm whether row bagging is intended.
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 10.0, log=True),
        }
        model = lgb.LGBMClassifier(**params)
        scores = []
        for step, (tr_idx, val_idx) in enumerate(folds):
            X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
            # No eval_set: without early stopping it only adds per-round
            # metric evaluation and does not change the fitted model.
            model.fit(X_tr, y_tr)
            prob = model.predict_proba(X_val)[:, 1]
            scores.append(roc_auc_score(y_val, prob))

            # Give the pruner something to compare against other trials.
            trial.report(float(np.mean(scores)), step)
            if trial.should_prune():
                raise optuna.TrialPruned()
        return float(np.mean(scores))

    study = optuna.create_study(
        direction="maximize",
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=0),
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    print(f" Best AUC: {study.best_value:.4f} params: {study.best_params}")
    return study.best_params
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 5. SHAP FEATURE SELECTION
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def shap_feature_selection(
    model, X_train: pd.DataFrame, threshold: float = 0.005
) -> tuple:
    """Rank features by mean |SHAP| and keep those at or above *threshold*.

    SHAP values are computed with a TreeExplainer on a random sample of at
    most 1000 training rows (fixed seed). The top 15 features are printed.

    Args:
        model: Fitted tree model accepted by ``shap.TreeExplainer``.
        X_train: Training features; its columns name the features.
        threshold: Minimum mean absolute SHAP value for a feature to be kept.

    Returns:
        ``(selected, importance_df)``: the kept feature names, most important
        first, and a frame with columns ``feature`` / ``shap_importance``.
        If SHAP fails for any reason, a warning is printed and
        ``(all column names, None)`` is returned.
    """
    print("\n [SHAP] Computing feature importance …")
    try:
        explainer = shap.TreeExplainer(model)
        sample = X_train.sample(min(1000, len(X_train)), random_state=42)
        shap_values = explainer.shap_values(sample)
        # Binary classifiers may come back as a per-class list or as a 3-D
        # array with a trailing class axis; keep the positive class either way.
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        shap_values = np.asarray(shap_values)
        if shap_values.ndim == 3:
            shap_values = shap_values[:, :, -1]
        mean_abs = np.abs(shap_values).mean(axis=0)
        importance_df = pd.DataFrame(
            {"feature": X_train.columns, "shap_importance": mean_abs}
        ).sort_values("shap_importance", ascending=False)
        print(importance_df.head(15).to_string(index=False))
        selected = importance_df[importance_df["shap_importance"] >= threshold][
            "feature"
        ].tolist()
        print(
            f" → {len(selected)}/{len(X_train.columns)} features selected (threshold={threshold})"
        )
        return selected, importance_df
    except Exception as e:
        # Best effort: feature selection is advisory, so never abort the run.
        print(f" [WARN] SHAP failed: {e}. Using all features.")
        return list(X_train.columns), None
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 6. TRAIN INDIVIDUAL MODELS
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def train_xgboost(X_train, y_train, best_params: dict):
    """Fit the final XGBoost classifier on the given training data.

    The fixed settings (objective, metric, seed, class-imbalance weight) are
    merged with *best_params*; tuned values win on any key collision.
    """
    positives = y_train.sum()
    # negatives / positives, guarding against a training set with no positives
    scale_pos = float((len(y_train) - positives) / max(positives, 1))
    fixed = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "verbosity": 0,
        "random_state": 42,
        "scale_pos_weight": scale_pos,
    }
    model = xgb.XGBClassifier(**{**fixed, **best_params})
    model.fit(X_train, y_train)
    return model
|
||
|
||
|
||
def train_lightgbm(X_train, y_train, best_params: dict):
    """Fit the final LightGBM classifier on the given training data.

    The fixed settings (objective, metric, seed, class-imbalance weight) are
    merged with *best_params*; tuned values win on any key collision.
    """
    positives = y_train.sum()
    # negatives / positives, guarding against a training set with no positives
    scale_pos = float((len(y_train) - positives) / max(positives, 1))
    fixed = {
        "objective": "binary",
        "metric": "auc",
        "verbose": -1,
        "random_state": 42,
        "scale_pos_weight": scale_pos,
    }
    model = lgb.LGBMClassifier(**{**fixed, **best_params})
    model.fit(X_train, y_train)
    return model
|
||
|
||
|
||
def train_mlp(X_train, y_train) -> Pipeline:
    """Fit a standardise-then-MLP pipeline (hidden layers 256-128-64).

    The network uses Adam with early stopping on a 10 % internal validation
    split (patience of 15 iterations, at most 200 iterations).
    """
    network = MLPClassifier(
        hidden_layer_sizes=(256, 128, 64),
        activation="relu",
        solver="adam",
        alpha=1e-3,
        batch_size=128,
        learning_rate="adaptive",
        learning_rate_init=1e-3,
        max_iter=200,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=15,
        random_state=42,
    )
    steps = [("scaler", StandardScaler()), ("mlp", network)]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)
    return pipeline
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 7. WEIGHTED VOTING ENSEMBLE
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
class WeightedEnsemble:
    """Soft-voting ensemble: a weighted average of member probabilities.

    Attributes:
        models: Mapping ``name -> model``; each model exposes ``predict_proba``.
        weights: Mapping ``name -> float``; members without an entry get 1.0.
        feature_cols: Column names passed to every member.
        version: Identifier stamped with the construction time.
    """

    def __init__(self, models: dict, weights: dict, feature_cols: list):
        self.models = models  # {name: model}
        self.weights = weights  # {name: float}
        self.feature_cols = feature_cols
        self.version = f"ensemble_v1_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    def predict_proba_all(self, X: pd.DataFrame):
        """Return ``{name: positive-class probabilities}`` for every member.

        A member that fails to predict is reported with a warning and
        contributes an all-zero vector, so one broken model does not abort
        the whole prediction.
        """
        probas = {}
        for name, model in self.models.items():
            try:
                available = [c for c in self.feature_cols if c in X.columns]
                probas[name] = model.predict_proba(X[available])[:, 1]
            except Exception as e:
                print(f" [WARN] {name} predict failed: {e}")
                probas[name] = np.zeros(len(X))
        return probas

    def predict_proba(self, X: pd.DataFrame):
        """Return an ``(n, 2)`` array of ``[P(class 0), P(class 1)]``.

        Weights are normalised over the weights actually applied, so the blend
        stays a convex combination even when a member has no explicit weight
        or ``weights`` holds names that are not members. If the applied
        weights sum to zero or less, all members are weighted equally.
        """
        probas = self.predict_proba_all(X)
        applied = {name: self.weights.get(name, 1.0) for name in probas}
        total_w = sum(applied.values())
        if total_w <= 0:
            applied = {name: 1.0 for name in probas}
            total_w = float(len(applied))

        ensemble_proba = np.zeros(len(X))
        for name, proba in probas.items():
            ensemble_proba += (applied[name] / total_w) * proba
        return np.column_stack([1 - ensemble_proba, ensemble_proba])

    def predict(self, X: pd.DataFrame, threshold: float = 0.5):
        """Return 0/1 labels: 1 where the blended probability >= *threshold*."""
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)
|
||
|
||
|
||
def compute_ensemble_weights(models: dict, X_val, y_val, feature_cols: list) -> dict:
    """Derive one voting weight per model from its validation AUC.

    A model's weight is its AUC margin over chance (``auc - 0.5``), floored at
    0.01. A model that cannot be scored is reported with a warning and gets
    the floor weight instead of aborting the run.
    """
    weights = {}
    for name, model in models.items():
        try:
            usable = [col for col in feature_cols if col in X_val.columns]
            scores = model.predict_proba(X_val[usable])[:, 1]
            auc = roc_auc_score(y_val, scores)
            # Keep every weight strictly positive.
            weights[name] = max(auc - 0.5, 0.01)
            print(f" {name}: AUC={auc:.4f} weight={weights[name]:.4f}")
        except Exception as e:
            print(f" [WARN] {name} weight computation failed: {e}")
            weights[name] = 0.01
    return weights
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 8. EVALUATION HELPERS
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def evaluate_model(
    model, X_holdout, y_holdout, holdout_df: pd.DataFrame, name: str
) -> dict:
    """Score *model* on the holdout set.

    Args:
        model: Object exposing ``feature_cols`` and ``predict_proba``.
        X_holdout: Holdout feature frame.
        y_holdout: Holdout 0/1 labels.
        holdout_df: Holdout frame with the race identifier columns, used for
            the per-race Precision@3.
        name: Label stored under ``"model"`` in the result.

    Returns:
        Dict with AUC, accuracy / precision / recall at a 0.5 cut-off,
        Precision@3 and the average prediction latency per row in
        milliseconds, each rounded to 4 decimals.
    """
    started = time.time()
    usable = [col for col in model.feature_cols if col in X_holdout.columns]
    scores = model.predict_proba(X_holdout[usable])[:, 1]
    latency_ms = (time.time() - started) / len(X_holdout) * 1000

    hard_labels = (scores >= 0.5).astype(int)
    raw_metrics = {
        "auc": roc_auc_score(y_holdout, scores),
        "accuracy": accuracy_score(y_holdout, hard_labels),
        "precision": precision_score(y_holdout, hard_labels, zero_division=0),
        "recall": recall_score(y_holdout, hard_labels, zero_division=0),
        "precision_at3": compute_precision_at3(scores, y_holdout, holdout_df),
        "latency_ms_per_row": latency_ms,
    }

    summary = {"model": name}
    summary.update((key, round(value, 4)) for key, value in raw_metrics.items())
    return summary
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 9. MAIN PIPELINE
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
class WrappedMLP:
    """Give the MLP pipeline the ``feature_cols`` / ``predict_proba`` interface.

    The pipeline was fitted on a bare NumPy array, so the selected columns are
    converted with ``.values`` before prediction.
    """

    def __init__(self, pipeline, cols):
        self.pipeline = pipeline
        self.feature_cols = cols

    def predict_proba(self, X):
        available = [c for c in self.feature_cols if c in X.columns]
        return self.pipeline.predict_proba(X[available].values)


class WrappedTree:
    """Give a tree model the ``feature_cols`` / ``predict_proba`` interface."""

    def __init__(self, model, cols):
        self.model = model
        self.feature_cols = cols

    def predict_proba(self, X):
        available = [c for c in self.feature_cols if c in X.columns]
        return self.model.predict_proba(X[available])


class FullEnsemble:
    """Picklable weighted soft-voting ensemble of XGBoost, LightGBM and MLP.

    Defined at module level on purpose: pickle stores classes by qualified
    name and cannot serialise a class declared inside a function.
    NOTE(review): when this file is run as a script the class is pickled as
    ``__main__.FullEnsemble``; the loading side must expose that name.
    """

    def __init__(self, xgb_m, lgb_m, mlp_pipe, weights, feature_cols):
        self.xgb_model = xgb_m
        self.lgb_model = lgb_m
        self.mlp_pipeline = mlp_pipe
        self.weights = weights
        self.feature_cols = feature_cols
        self.version = f"ensemble_v1_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    def predict_proba(self, X: pd.DataFrame):
        """Return an ``(n, 2)`` array of ``[P(class 0), P(class 1)]``."""
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_cols)
        available = [c for c in self.feature_cols if c in X.columns]
        Xa = X[available].fillna(0)

        total_w = sum(self.weights.values())
        proba = np.zeros(len(Xa))

        # XGBoost
        xp = self.xgb_model.predict_proba(Xa)[:, 1]
        proba += (self.weights.get("xgboost", 0.33) / total_w) * xp

        # LightGBM
        lp = self.lgb_model.predict_proba(Xa)[:, 1]
        proba += (self.weights.get("lightgbm", 0.33) / total_w) * lp

        # MLP (fitted on a bare array, hence .values)
        mp = self.mlp_pipeline.predict_proba(Xa.values)[:, 1]
        proba += (self.weights.get("mlp", 0.33) / total_w) * mp

        return np.column_stack([1 - proba, proba])

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the blended probability >= *threshold*."""
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)


def main(args):
    """Run the full training pipeline and write models plus benchmark reports.

    Steps: load and engineer data, temporal split, baseline evaluation, Optuna
    searches, model training, SHAP analysis, ensemble weighting, holdout
    evaluation, and serialisation of models and reports into MODELS_DIR.

    Args:
        args: Parsed CLI namespace; ``args.db`` (database path) and
            ``args.trials`` (Optuna trials per study) are read.

    Returns:
        The benchmark report dict that is also written to disk.
    """
    print("\n" + "=" * 65)
    print("TURF ML UPGRADE — Ensemble + Optuna (Sprint 6-7)")
    print("=" * 65)

    # ── Load & feature engineer ──────────────────────────────────────────────
    print("\n[1/9] Loading data …")
    df = load_data(args.db)
    df = engineer_features(df)

    # ── Temporal split ────────────────────────────────────────────────────────
    print("\n[2/9] Temporal split (80/20) …")
    train_df, holdout_df = temporal_split(df, holdout_frac=HOLDOUT_FRACTION)

    X_train, y_train, feat_cols = get_features_and_target(train_df, "top3")
    X_holdout, y_holdout, _ = get_features_and_target(holdout_df, "top3")

    # Sub-split: the most recent 15 % of the training rows are held back as a
    # validation slice for the ensemble weights; the rest is used for the
    # Optuna searches and for fitting the models.
    n = len(X_train)
    n_val = int(n * 0.15)
    X_tr = X_train.iloc[: n - n_val]
    y_tr = y_train.iloc[: n - n_val]
    X_val = X_train.iloc[n - n_val :]
    y_val = y_train.iloc[n - n_val :]

    print(f" Train subset: {len(X_tr):,}")
    print(f" Val subset: {len(X_val):,}")
    print(f" Holdout: {len(X_holdout):,}")

    # ── Baseline evaluation ───────────────────────────────────────────────────
    print("\n[3/9] Evaluating baseline XGBoost …")
    existing_path = "/home/h3r7/turf_saas/xgboost_models.pkl"
    baseline = evaluate_baseline(holdout_df, existing_path)
    print(
        f" Baseline Precision@3: {baseline['precision_at3']:.4f} AUC: {baseline['auc']:.4f}"
    )

    # ── Optuna ────────────────────────────────────────────────────────────────
    n_trials = args.trials

    print(f"\n[4/9] Optuna optimization ({n_trials} trials each) …")
    xgb_params = optuna_xgboost(X_tr, y_tr, n_trials=n_trials)
    lgb_params = optuna_lightgbm(X_tr, y_tr, n_trials=n_trials)

    # ── Train individual models ───────────────────────────────────────────────
    print("\n[5/9] Training individual models …")
    print(" Training optimized XGBoost …")
    xgb_model = train_xgboost(X_tr, y_tr, xgb_params)

    print(" Training LightGBM …")
    lgb_model = train_lightgbm(X_tr, y_tr, lgb_params)

    print(" Training MLP (3 layers) …")
    mlp_model = train_mlp(X_tr.values, y_tr)

    # ── SHAP feature analysis ─────────────────────────────────────────────────
    # The selection is only recorded in the report; the models keep using the
    # full feature list.
    print("\n[6/9] SHAP feature importance (XGBoost) …")
    selected_features, _ = shap_feature_selection(xgb_model, X_tr)

    # ── Compute weights ────────────────────────────────────────────────────────
    print("\n[7/9] Computing ensemble weights on validation …")
    model_dict = {
        "xgboost": WrappedTree(xgb_model, feat_cols),
        "lightgbm": WrappedTree(lgb_model, feat_cols),
        "mlp": WrappedMLP(mlp_model, feat_cols),
    }
    weights = compute_ensemble_weights(model_dict, X_val, y_val, feat_cols)

    # ── Build ensemble ─────────────────────────────────────────────────────────
    print("\n[8/9] Building WeightedEnsemble …")
    ensemble = FullEnsemble(xgb_model, lgb_model, mlp_model, weights, feat_cols)

    # ── Holdout evaluation ─────────────────────────────────────────────────────
    print("\n[9/9] Evaluating all models on holdout …")
    results = {}

    # Individual models
    for name, wrapped in model_dict.items():
        res = evaluate_model(wrapped, X_holdout, y_holdout, holdout_df, name)
        results[name] = res
        print(
            f" {name:12s} Precision@3={res['precision_at3']:.4f} AUC={res['auc']:.4f}"
        )

    # Ensemble (it already exposes feature_cols and predict_proba)
    ens_res = evaluate_model(ensemble, X_holdout, y_holdout, holdout_df, "ensemble")
    results["ensemble"] = ens_res
    print(
        f" {'ensemble':12s} Precision@3={ens_res['precision_at3']:.4f} AUC={ens_res['auc']:.4f}"
    )

    # Baseline comparison. The deploy rule is relative: the ensemble must beat
    # the baseline Precision@3 by DEPLOY_THRESHOLD (+5 %) of the baseline
    # value, e.g. 0.5287 -> 0.5551, not by 0.05 absolute points.
    baseline_p3 = baseline["precision_at3"]
    ensemble_p3 = ens_res["precision_at3"]
    delta = ensemble_p3 - baseline_p3
    relative_gain = (delta / baseline_p3) if baseline_p3 > 0 else None
    required_p3 = baseline_p3 * (1 + DEPLOY_THRESHOLD)
    gain_str = f"{relative_gain * 100:+.1f}%" if relative_gain is not None else "n/a"

    print("\n" + "=" * 65)
    print("BENCHMARK SUMMARY")
    print("=" * 65)
    print(f" Baseline XGBoost Precision@3: {baseline_p3:.4f}")
    print(f" Optimized Ensemble Precision@3: {ensemble_p3:.4f}")
    print(f" Delta: {delta:+.4f} ({gain_str} relative)")
    deploy = bool(ensemble_p3 >= required_p3)
    print(
        f" Deploy threshold (+{DEPLOY_THRESHOLD * 100:.0f}%): {'✅ DEPLOY' if deploy else '❌ BELOW THRESHOLD'}"
    )
    print("=" * 65)

    # ── Save models ────────────────────────────────────────────────────────────
    ensemble_path = MODELS_DIR / "ensemble_top3.pkl"
    with open(ensemble_path, "wb") as f:
        pickle.dump(ensemble, f)
    print(f"\n ✅ Ensemble saved → {ensemble_path}")

    # Also save individual optimized models
    for name, model in [("xgboost_optimized", xgb_model), ("lightgbm", lgb_model)]:
        model_path = MODELS_DIR / f"{name}_top3.pkl"
        with open(model_path, "wb") as f:
            pickle.dump(
                {
                    "model": model,
                    "feature_cols": feat_cols,
                    "params": xgb_params if name.startswith("xgb") else lgb_params,
                },
                f,
            )
        print(f" ✅ {name} saved → {model_path}")

    mlp_path = MODELS_DIR / "mlp_top3.pkl"
    with open(mlp_path, "wb") as f:
        pickle.dump({"pipeline": mlp_model, "feature_cols": feat_cols}, f)
    print(f" ✅ MLP saved → {mlp_path}")

    # ── Save benchmark report ──────────────────────────────────────────────────
    report = {
        "run_date": datetime.now().isoformat(),
        "dataset": {
            "db_path": args.db,
            "total_rows": len(df),
            "train_rows": len(X_train),
            "holdout_rows": len(X_holdout),
            "train_date_range": [
                str(train_df["date_programme"].min()),
                str(train_df["date_programme"].max()),
            ],
            "holdout_date_range": [
                str(holdout_df["date_programme"].min()),
                str(holdout_df["date_programme"].max()),
            ],
        },
        "baseline": baseline,
        "individual_models": {k: v for k, v in results.items() if k != "ensemble"},
        "ensemble": ens_res,
        "delta_precision_at3": round(delta, 4),
        "relative_gain_precision_at3": (
            round(relative_gain, 4) if relative_gain is not None else None
        ),
        "deploy": deploy,
        "optuna": {
            "n_trials": n_trials,
            "xgboost_best_params": xgb_params,
            "lightgbm_best_params": lgb_params,
        },
        "features": {
            "total": len(feat_cols),
            "selected_by_shap": len(selected_features),
            "feature_list": feat_cols,
            "shap_selected": selected_features,
        },
        "ensemble_weights": weights,
    }

    report_path = MODELS_DIR / "benchmark_report.json"
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)
    print(f" ✅ Benchmark report → {report_path}")

    # Human-readable markdown
    md_path = MODELS_DIR / "benchmark_report.md"
    _write_markdown_report(report, md_path)
    print(f" ✅ Markdown report → {md_path}")

    return report
|
||
|
||
|
||
def _write_markdown_report(report: dict, path: Path):
|
||
"""Write a human-readable markdown benchmark report."""
|
||
b = report["baseline"]
|
||
e = report["ensemble"]
|
||
delta = report["delta_precision_at3"]
|
||
deploy_str = (
|
||
"✅ DEPLOIEMENT RECOMMANDE"
|
||
if report["deploy"]
|
||
else "❌ EN DESSOUS DU SEUIL (+5%)"
|
||
)
|
||
|
||
lines = [
|
||
f"# Benchmark ML Ensemble — Turf Prédictions",
|
||
f"",
|
||
f"**Date:** {report['run_date'][:10]} ",
|
||
f"**Dataset:** {report['dataset']['total_rows']:,} partants ",
|
||
f"**Holdout:** {report['dataset']['holdout_rows']:,} lignes ({report['dataset']['holdout_date_range'][0]} → {report['dataset']['holdout_date_range'][1]})",
|
||
f"",
|
||
f"## Résultats",
|
||
f"",
|
||
f"| Modèle | Precision@3 | AUC | Latence/prédiction |",
|
||
f"|--------|-------------|-----|-------------------|",
|
||
f"| XGBoost (baseline) | {b['precision_at3']:.4f} | {b['auc']:.4f} | — |",
|
||
]
|
||
for name, res in report["individual_models"].items():
|
||
lines.append(
|
||
f"| {name} | {res['precision_at3']:.4f} | {res['auc']:.4f} | {res['latency_ms_per_row']:.2f} ms |"
|
||
)
|
||
lines += [
|
||
f"| **Ensemble** | **{e['precision_at3']:.4f}** | **{e['auc']:.4f}** | **{e['latency_ms_per_row']:.2f} ms** |",
|
||
f"",
|
||
f"## Décision de déploiement",
|
||
f"",
|
||
f"- Delta Precision@3 : **{delta:+.4f}** ({delta * 100:+.1f}%)",
|
||
f"- Seuil requis : **+5%**",
|
||
f"- Résultat : **{deploy_str}**",
|
||
f"",
|
||
f"## Optimisation Optuna",
|
||
f"",
|
||
f"- Trials XGBoost : {report['optuna']['n_trials']}",
|
||
f"- Trials LightGBM : {report['optuna']['n_trials']}",
|
||
f"- Pruning : MedianPruner",
|
||
f"",
|
||
f"### Meilleurs hyperparamètres XGBoost",
|
||
f"```json",
|
||
json.dumps(report["optuna"]["xgboost_best_params"], indent=2),
|
||
f"```",
|
||
f"",
|
||
f"### Meilleurs hyperparamètres LightGBM",
|
||
f"```json",
|
||
json.dumps(report["optuna"]["lightgbm_best_params"], indent=2),
|
||
f"```",
|
||
f"",
|
||
f"## Features",
|
||
f"",
|
||
f"- Total features : {report['features']['total']}",
|
||
f"- Retenues par SHAP : {report['features']['selected_by_shap']}",
|
||
f"",
|
||
f"## Poids de l'ensemble",
|
||
f"",
|
||
]
|
||
for name, w in report["ensemble_weights"].items():
|
||
lines.append(f"- {name} : {w:.4f}")
|
||
|
||
path.write_text("\n".join(lines))
|
||
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse options, run the training pipeline,
    # then print the deploy verdict taken from the returned report.
    cli = argparse.ArgumentParser(
        description="Train ensemble ML model for turf predictions",
    )
    cli.add_argument(
        "--db",
        default=DB_PATH,
        help="Path to SQLite database",
    )
    cli.add_argument(
        "--trials",
        type=int,
        default=MIN_TRIALS,
        help="Optuna trials per model",
    )
    cli.add_argument(
        "--quick",
        action="store_true",
        help="Quick mode: 10 trials only",
    )
    args = cli.parse_args()

    # --quick overrides whatever value --trials carried.
    args.trials = 10 if args.quick else args.trials

    report = main(args)
    print(f"\nDone. Deploy={report['deploy']}")
|