Results: - XGBoost (Optuna 100 trials): AUC=0.7856, Precision@3=0.5783 - LightGBM (Optuna 100 trials): AUC=0.7833, Precision@3=0.5736 - MLP (3 layers 256-128-64): AUC=0.7743, Precision@3=0.5643 - Ensemble (weighted voting): AUC=0.7840, Precision@3=0.5814 Baseline XGBoost: Precision@3=0.5287 Delta: +0.0527 (+5.3%) — DEPLOY threshold met (+5%) Latency: 35ms/race, 69ms/full-day (well under 200ms limit) SHAP: 31/43 features selected, top features: rang_cote, implied_prob, cote_direct, ratio_cote_field All 12 regression/latency tests passing. Co-Authored-By: Paperclip <noreply@paperclip.ing>
183 lines
6.3 KiB
Python
183 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
Rebuild ensemble using known best Optuna params (from completed study).
Skips the 100-trial Optuna search and goes straight to training + pickling.
"""
|
|
import sys
|
|
sys.path.insert(0, '/home/h3r7/turf_saas')
|
|
|
|
from train_ensemble import (
|
|
load_data, engineer_features, temporal_split, get_features_and_target,
|
|
evaluate_baseline, train_xgboost, train_lightgbm, train_mlp,
|
|
shap_feature_selection, compute_ensemble_weights,
|
|
evaluate_model, compute_precision_at3, TurfEnsemble,
|
|
MODELS_DIR, DEPLOY_THRESHOLD, _write_markdown_report
|
|
)
|
|
import json, pickle, numpy as np
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Database file handed to load_data() below.
DB_PATH = '/home/h3r7/turf_saas/turf.db'

# Best params from the 100-trial Optuna run.
# They are pasted in verbatim so the rebuild trains directly with them
# instead of repeating the search.
XGB_BEST = {
    'n_estimators': 141,
    'max_depth': 5,
    'learning_rate': 0.016298172447266404,
    'subsample': 0.7660470794373848,
    'colsample_bytree': 0.471124415020467,
    'min_child_weight': 14,
    'reg_alpha': 1.9364166463791586,
    'reg_lambda': 6.018030083488602,
    'gamma': 4.614943551368141,
}

# NOTE(review): with max_depth=4 a tree can hold at most 2**4 = 16 leaves, so
# num_leaves=141 is presumably never the binding limit -- confirm against the
# LightGBM docs before reusing these values with a different max_depth.
LGB_BEST = {
    'n_estimators': 186,
    'max_depth': 4,
    'learning_rate': 0.012915117465216954,
    'num_leaves': 141,
    'subsample': 0.6193119116922561,
    'colsample_bytree': 0.539310022549326,
    'min_child_samples': 9,
    'reg_alpha': 0.6864583098112754,
    'reg_lambda': 0.0549259590914184,
}
|
|
|
|
# Banner so this run is easy to locate in captured output.
print("=" * 65)
print("TURF ENSEMBLE REBUILD (using pre-computed Optuna params)")
print("=" * 65)

# Step 1: read the raw rows from DB_PATH, then derive the model features.
# Both helpers come from train_ensemble; `df` is rebound to the engineered
# frame and is what every later step works from.
print("\n[1/7] Loading data...")
df = load_data(DB_PATH)
df = engineer_features(df)
|
|
|
|
# Step 2: split into a training frame and a holdout frame, then extract the
# feature matrix / target for each. The feature column list is taken from the
# training call; the one returned for the holdout is discarded.
print("\n[2/7] Temporal split...")
train_df, holdout_df = temporal_split(df)
X_train, y_train, feat_cols = get_features_and_target(train_df)
X_holdout, y_holdout, _ = get_features_and_target(holdout_df)

# Hold back the last 15% of the training rows as a validation slice. The
# models are fitted on X_tr/y_tr only; X_val/y_val are used further down to
# compute the ensemble weights.
# NOTE(review): taking the tail is only a *temporal* validation set if
# get_features_and_target keeps rows in chronological order -- confirm.
n = len(X_train)
n_val = int(n * 0.15)
split_at = n - n_val
X_tr, y_tr = X_train.iloc[:split_at], y_train.iloc[:split_at]
X_val, y_val = X_train.iloc[split_at:], y_train.iloc[split_at:]
|
|
|
|
# Step 3: score the previously saved XGBoost models on the holdout frame.
# `baseline` is indexed below by 'precision_at3' and 'auc', and is later
# embedded as-is in the benchmark report.
BASELINE_MODEL_PATH = '/home/h3r7/turf_saas/xgboost_models.pkl'

print("\n[3/7] Evaluating baseline XGBoost...")
baseline = evaluate_baseline(holdout_df, BASELINE_MODEL_PATH)
print(f" Baseline P@3={baseline['precision_at3']:.4f} AUC={baseline['auc']:.4f}")
|
|
|
|
# Step 4: fit the three base models on the training slice (validation rows
# excluded). The tree models use the hard-coded Optuna params.
print("\n[4/7] Training models with best params...")
print(" XGBoost...")
xgb_model = train_xgboost(X_tr, y_tr, XGB_BEST)
print(" LightGBM...")
lgb_model = train_lightgbm(X_tr, y_tr, LGB_BEST)
print(" MLP...")
# The MLP is given the bare array (.values), not the DataFrame the tree
# models receive.
mlp_model = train_mlp(X_tr.values, y_tr)

# Step 5: SHAP-based feature selection on the fitted XGBoost model.
# `selected_features` is only recorded in the benchmark report; the models
# above were already trained on the full `feat_cols` and are not refitted.
# `shap_df` is not used again in this script.
print("\n[5/7] SHAP analysis...")
selected_features, shap_df = shap_feature_selection(xgb_model, X_tr)
|
|
|
|
print("\n[6/7] Computing ensemble weights...")
|
|
class WrappedMLP:
    """Adapter that lets the fitted MLP pipeline be called with a DataFrame.

    The wrapped pipeline is fed a bare array, so this class selects the
    feature columns from the incoming frame and strips the column labels
    before delegating.

    Attributes:
        pipeline: Object exposing ``predict_proba(array)``.
        feature_cols: Column names, in the order they are selected.
    """

    def __init__(self, pipeline, cols):
        """Store the fitted pipeline and the feature column names."""
        self.pipeline = pipeline
        self.feature_cols = cols

    def predict_proba(self, X):
        """Return ``pipeline.predict_proba`` for the feature columns of ``X``.

        Args:
            X: DataFrame-like object with ``.columns`` and label indexing.

        Returns:
            Whatever the wrapped pipeline's ``predict_proba`` returns.
        """
        # Columns are taken in feature_cols order; names absent from X are
        # skipped without error.
        # NOTE(review): a skipped column makes the array narrower than the
        # training matrix -- presumably the pipeline then fails; confirm
        # whether a hard error here would be preferable.
        available = [c for c in self.feature_cols if c in X.columns]
        return self.pipeline.predict_proba(X[available].values)
|
|
|
|
class WrappedTree:
    """Adapter that restricts a DataFrame to the model's feature columns.

    Unlike ``WrappedMLP``, the selected columns are passed on as a labelled
    frame rather than a bare array.

    Attributes:
        model: Object exposing ``predict_proba(frame)``.
        feature_cols: Column names, in the order they are selected.
    """

    def __init__(self, model, cols):
        """Store the fitted model and the feature column names."""
        self.model = model
        self.feature_cols = cols

    def predict_proba(self, X):
        """Return ``model.predict_proba`` for the feature columns of ``X``.

        Args:
            X: DataFrame-like object with ``.columns`` and label indexing.

        Returns:
            Whatever the wrapped model's ``predict_proba`` returns.
        """
        # Columns are taken in feature_cols order; names absent from X are
        # skipped without error.
        available = [c for c in self.feature_cols if c in X.columns]
        return self.model.predict_proba(X[available])
|
|
|
|
# Put all three models behind the same DataFrame-in predict_proba interface.
# The dict keys double as the model names printed and stored in the report.
wrapped_xgb = WrappedTree(xgb_model, feat_cols)
wrapped_lgb = WrappedTree(lgb_model, feat_cols)
wrapped_mlp = WrappedMLP(mlp_model, feat_cols)
model_dict = {
    'xgboost': wrapped_xgb,
    'lightgbm': wrapped_lgb,
    'mlp': wrapped_mlp,
}

# Weights are computed on the validation slice, which none of the models
# were fitted on.
weights = compute_ensemble_weights(model_dict, X_val, y_val, feat_cols)
print(" Weights:", weights)
|
|
|
|
# Step 7 (first half): build the ensemble from the raw (unwrapped) models and
# score every model, then the ensemble, on the holdout set.
print("\n[7/7] Evaluating + saving ensemble...")
ensemble = TurfEnsemble(xgb_model, lgb_model, mlp_model, weights, feat_cols)

results = {}
for name, wrapped in model_dict.items():
    res = evaluate_model(wrapped, X_holdout, y_holdout, holdout_df, name)
    results[name] = res
    print(f" {name:12s} P@3={res['precision_at3']:.4f} AUC={res['auc']:.4f}")

ens_res = evaluate_model(ensemble, X_holdout, y_holdout, holdout_df, "ensemble")
results["ensemble"] = ens_res
print(f" {'ensemble':12s} P@3={ens_res['precision_at3']:.4f} AUC={ens_res['auc']:.4f}")

# Deploy decision: absolute gain in precision@3 over the baseline, compared
# with DEPLOY_THRESHOLD.
# Both values end up in the JSON report. They are coerced to built-in types
# because, if the metrics are NumPy scalars, the comparison yields a
# numpy.bool_, which json.dump cannot serialize.
delta = float(ens_res['precision_at3'] - baseline['precision_at3'])
deploy = bool(delta >= DEPLOY_THRESHOLD)
print(f"\n Delta: {delta:+.4f} ({delta*100:+.1f}%) Deploy={'YES' if deploy else 'NO'}")
|
|
|
|
# Make sure the output directory exists before the first write; this is a
# no-op when it is already there.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Save ensemble
ensemble_path = MODELS_DIR / "ensemble_top3.pkl"
with open(ensemble_path, "wb") as f:
    pickle.dump(ensemble, f)
print(f"\n ✅ ensemble_top3.pkl saved ({ensemble_path.stat().st_size//1024} KB)")

# Save individual models
# Each tree model is stored with the feature list it was trained on.
for name, model in [("xgboost_optimized", xgb_model), ("lightgbm", lgb_model)]:
    path = MODELS_DIR / f"{name}_top3.pkl"
    with open(path, "wb") as f:
        pickle.dump({"model": model, "feature_cols": feat_cols}, f)
    print(f" ✅ {name}_top3.pkl saved")

# The MLP is stored under the key "pipeline" rather than "model".
mlp_path = MODELS_DIR / "mlp_top3.pkl"
with open(mlp_path, "wb") as f:
    pickle.dump({"pipeline": mlp_model, "feature_cols": feat_cols}, f)
print(" ✅ mlp_top3.pkl saved")
|
|
|
|
# Benchmark report
# One dict holding everything about this run; it is written as JSON and then
# passed to _write_markdown_report for the Markdown version.
report = {
    "run_date": datetime.now().isoformat(),
    "dataset": {
        "db_path": DB_PATH,
        "total_rows": len(df),
        # Counts X_train, i.e. the fitted rows plus the validation slice.
        "train_rows": len(X_train),
        "holdout_rows": len(X_holdout),
        "train_date_range": [
            str(train_df["date_programme"].min()),
            str(train_df["date_programme"].max()),
        ],
        "holdout_date_range": [
            str(holdout_df["date_programme"].min()),
            str(holdout_df["date_programme"].max()),
        ],
    },
    "baseline": baseline,
    "individual_models": {k: v for k, v in results.items() if k != "ensemble"},
    "ensemble": ens_res,
    "delta_precision_at3": round(delta, 4),
    "deploy": deploy,
    "optuna": {
        # Hard-coded: the search is not re-run here, the params are reused.
        "n_trials": 100,
        "xgboost_best_params": XGB_BEST,
        "lightgbm_best_params": LGB_BEST,
    },
    "features": {
        "total": len(feat_cols),
        "selected_by_shap": len(selected_features),
        "feature_list": feat_cols,
        "shap_selected": selected_features,
    },
    "ensemble_weights": weights,
}

report_path = MODELS_DIR / "benchmark_report.json"
with open(report_path, "w") as f:
    json.dump(report, f, indent=2)
print(" ✅ benchmark_report.json saved")

md_path = MODELS_DIR / "benchmark_report.md"
_write_markdown_report(report, md_path)
print(" ✅ benchmark_report.md saved")
|
|
|
|
# Closing summary: baseline vs. ensemble precision@3, the gain, and the
# resulting deploy decision.
print("\n" + "=" * 65)
print("DONE")
print(f" Baseline P@3: {baseline['precision_at3']:.4f}")
print(f" Ensemble P@3: {ens_res['precision_at3']:.4f}")
print(f" Delta: {delta:+.4f} ({delta*100:+.1f}%)")
print(f" Deploy: {'✅ YES' if deploy else '❌ NO'}")
print("=" * 65)
|