#!/usr/bin/env python3
"""
Rebuild ensemble using known best Optuna params (from completed study).
Skips the 100-trial Optuna search and goes straight to training + pickling.

Run as a script; all training and file output happens in ``main()``.
"""
import json
import pickle
import sys
from datetime import datetime
from pathlib import Path

import numpy as np

# The project directory must be on sys.path *before* train_ensemble is
# imported (presumably that is where the module lives — verify on deploy).
sys.path.insert(0, '/home/h3r7/turf_saas')

from train_ensemble import (  # noqa: E402  (import must follow sys.path edit)
    load_data, engineer_features, temporal_split, get_features_and_target,
    evaluate_baseline, train_xgboost, train_lightgbm, train_mlp,
    shap_feature_selection, compute_ensemble_weights,
    evaluate_model, compute_precision_at3, TurfEnsemble,
    MODELS_DIR, DEPLOY_THRESHOLD, _write_markdown_report
)

DB_PATH = '/home/h3r7/turf_saas/turf.db'
BASELINE_MODEL_PATH = '/home/h3r7/turf_saas/xgboost_models.pkl'

# Fraction of the (time-ordered) training rows held back, from the tail,
# for computing the ensemble weights.
VALIDATION_FRACTION = 0.15

# Size of the Optuna study the parameters below were taken from; only
# recorded in the benchmark report, no search is run here.
OPTUNA_N_TRIALS = 100

# Best params from the 100-trial Optuna run
XGB_BEST = {
    'n_estimators': 141,
    'max_depth': 5,
    'learning_rate': 0.016298172447266404,
    'subsample': 0.7660470794373848,
    'colsample_bytree': 0.471124415020467,
    'min_child_weight': 14,
    'reg_alpha': 1.9364166463791586,
    'reg_lambda': 6.018030083488602,
    'gamma': 4.614943551368141,
}
LGB_BEST = {
    'n_estimators': 186,
    'max_depth': 4,
    'learning_rate': 0.012915117465216954,
    'num_leaves': 141,
    'subsample': 0.6193119116922561,
    'colsample_bytree': 0.539310022549326,
    'min_child_samples': 9,
    'reg_alpha': 0.6864583098112754,
    'reg_lambda': 0.0549259590914184,
}


class WrappedMLP:
    """Adapter giving the MLP pipeline a DataFrame-based ``predict_proba``.

    The pipeline is fed a plain array (``X[...].values``), restricted to the
    columns of ``feature_cols`` that are present in ``X``, in that order.
    """

    def __init__(self, pipeline, cols):
        self.pipeline = pipeline
        self.feature_cols = cols

    def predict_proba(self, X):
        """Return ``pipeline.predict_proba`` for the known columns of ``X``."""
        available = [c for c in self.feature_cols if c in X.columns]
        return self.pipeline.predict_proba(X[available].values)


class WrappedTree:
    """Adapter giving a tree model a column-selecting ``predict_proba``.

    The model receives the DataFrame restricted to the columns of
    ``feature_cols`` that are present in ``X``, in that order.
    """

    def __init__(self, model, cols):
        self.model = model
        self.feature_cols = cols

    def predict_proba(self, X):
        """Return ``model.predict_proba`` for the known columns of ``X``."""
        available = [c for c in self.feature_cols if c in X.columns]
        return self.model.predict_proba(X[available])


def _json_default(obj):
    """Convert NumPy scalars/arrays for ``json.dump``; reject anything else.

    Metrics and weights produced by the training helpers may be NumPy values
    (e.g. ``float32`` or ``bool_``), which the stock JSON encoder refuses.

    Raises:
        TypeError: for any object that is not a NumPy scalar or array, so
            genuinely unserializable values still fail loudly.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(
        f"Object of type {type(obj).__name__} is not JSON serializable"
    )


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, overwriting any existing file."""
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def main():
    """Train the three models with the stored params, then evaluate and save.

    Writes the ensemble pickle, the per-model pickles and the JSON/Markdown
    benchmark reports into ``MODELS_DIR``, printing progress along the way.
    """
    print("=" * 65)
    print("TURF ENSEMBLE REBUILD (using pre-computed Optuna params)")
    print("=" * 65)

    print("\n[1/7] Loading data...")
    df = load_data(DB_PATH)
    df = engineer_features(df)

    print("\n[2/7] Temporal split...")
    train_df, holdout_df = temporal_split(df)
    X_train, y_train, feat_cols = get_features_and_target(train_df)
    X_holdout, y_holdout, _ = get_features_and_target(holdout_df)

    # The last VALIDATION_FRACTION of the training rows is kept out of
    # fitting and used only to compute the ensemble weights.
    n = len(X_train)
    n_val = int(n * VALIDATION_FRACTION)
    split_at = n - n_val
    X_tr, y_tr = X_train.iloc[:split_at], y_train.iloc[:split_at]
    X_val, y_val = X_train.iloc[split_at:], y_train.iloc[split_at:]

    print("\n[3/7] Evaluating baseline XGBoost...")
    baseline = evaluate_baseline(holdout_df, BASELINE_MODEL_PATH)
    print(f" Baseline P@3={baseline['precision_at3']:.4f} AUC={baseline['auc']:.4f}")

    print("\n[4/7] Training models with best params...")
    print(" XGBoost...")
    xgb_model = train_xgboost(X_tr, y_tr, XGB_BEST)
    print(" LightGBM...")
    lgb_model = train_lightgbm(X_tr, y_tr, LGB_BEST)
    print(" MLP...")
    mlp_model = train_mlp(X_tr.values, y_tr)

    print("\n[5/7] SHAP analysis...")
    # Only the selected feature names are reported; the SHAP table is unused.
    selected_features, _shap_df = shap_feature_selection(xgb_model, X_tr)

    print("\n[6/7] Computing ensemble weights...")
    model_dict = {
        'xgboost': WrappedTree(xgb_model, feat_cols),
        'lightgbm': WrappedTree(lgb_model, feat_cols),
        'mlp': WrappedMLP(mlp_model, feat_cols),
    }
    weights = compute_ensemble_weights(model_dict, X_val, y_val, feat_cols)
    print(" Weights:", weights)

    print("\n[7/7] Evaluating + saving ensemble...")
    ensemble = TurfEnsemble(xgb_model, lgb_model, mlp_model, weights, feat_cols)

    results = {}
    for name, wrapped in model_dict.items():
        res = evaluate_model(wrapped, X_holdout, y_holdout, holdout_df, name)
        results[name] = res
        print(f" {name:12s} P@3={res['precision_at3']:.4f} AUC={res['auc']:.4f}")

    ens_res = evaluate_model(ensemble, X_holdout, y_holdout, holdout_df, "ensemble")
    results["ensemble"] = ens_res
    print(f" {'ensemble':12s} P@3={ens_res['precision_at3']:.4f} AUC={ens_res['auc']:.4f}")

    # Cast to native types: if the metrics are NumPy scalars the comparison
    # yields numpy.bool_, which json.dump cannot serialize.
    delta = float(ens_res['precision_at3'] - baseline['precision_at3'])
    deploy = bool(delta >= DEPLOY_THRESHOLD)
    print(f"\n Delta: {delta:+.4f} ({delta*100:+.1f}%) Deploy={'YES' if deploy else 'NO'}")

    # Make sure the output directory exists before the first write.
    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    # Save ensemble
    ensemble_path = MODELS_DIR / "ensemble_top3.pkl"
    _dump_pickle(ensemble, ensemble_path)
    print(f"\n ✅ ensemble_top3.pkl saved ({ensemble_path.stat().st_size//1024} KB)")

    # Save individual models
    for name, model in [("xgboost_optimized", xgb_model), ("lightgbm", lgb_model)]:
        _dump_pickle(
            {"model": model, "feature_cols": feat_cols},
            MODELS_DIR / f"{name}_top3.pkl",
        )
        print(f" ✅ {name}_top3.pkl saved")

    _dump_pickle(
        {"pipeline": mlp_model, "feature_cols": feat_cols},
        MODELS_DIR / "mlp_top3.pkl",
    )
    print(" ✅ mlp_top3.pkl saved")

    # Benchmark report
    report = {
        "run_date": datetime.now().isoformat(),
        "dataset": {
            "db_path": DB_PATH,
            "total_rows": len(df),
            "train_rows": len(X_train),
            "holdout_rows": len(X_holdout),
            "train_date_range": [
                str(train_df["date_programme"].min()),
                str(train_df["date_programme"].max()),
            ],
            "holdout_date_range": [
                str(holdout_df["date_programme"].min()),
                str(holdout_df["date_programme"].max()),
            ],
        },
        "baseline": baseline,
        "individual_models": {k: v for k, v in results.items() if k != "ensemble"},
        "ensemble": ens_res,
        "delta_precision_at3": round(delta, 4),
        "deploy": deploy,
        "optuna": {
            "n_trials": OPTUNA_N_TRIALS,
            "xgboost_best_params": XGB_BEST,
            "lightgbm_best_params": LGB_BEST,
        },
        "features": {
            "total": len(feat_cols),
            "selected_by_shap": len(selected_features),
            "feature_list": feat_cols,
            "shap_selected": selected_features,
        },
        "ensemble_weights": weights,
    }

    report_path = MODELS_DIR / "benchmark_report.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, default=_json_default)
    print(" ✅ benchmark_report.json saved")

    md_path = MODELS_DIR / "benchmark_report.md"
    _write_markdown_report(report, md_path)
    print(" ✅ benchmark_report.md saved")

    print("\n" + "=" * 65)
    print("DONE")
    print(f" Baseline P@3: {baseline['precision_at3']:.4f}")
    print(f" Ensemble P@3: {ens_res['precision_at3']:.4f}")
    print(f" Delta: {delta:+.4f} ({delta*100:+.1f}%)")
    print(f" Deploy: {'✅ YES' if deploy else '❌ NO'}")
    print("=" * 65)


if __name__ == "__main__":
    main()