# turf_saas/rebuild_ensemble.py

#!/usr/bin/env python3
"""
Rebuild ensemble using known best Optuna params (from completed study).
Skips the 100-trial Optuna search and goes straight to training + pickling.
"""
import sys
sys.path.insert(0, '/home/h3r7/turf_saas')

from train_ensemble import (
    load_data, engineer_features, temporal_split, get_features_and_target,
    evaluate_baseline, train_xgboost, train_lightgbm, train_mlp,
    shap_feature_selection, compute_ensemble_weights,
    evaluate_model, compute_precision_at3, TurfEnsemble,
    MODELS_DIR, DEPLOY_THRESHOLD, _write_markdown_report
)
import json, pickle, numpy as np
from datetime import datetime
from pathlib import Path

# Database file handed to load_data() below.
DB_PATH = '/home/h3r7/turf_saas/turf.db'

# Hyper-parameters copied from the finished 100-trial Optuna study, so the
# models can be retrained without repeating the search.  Both dicts are also
# embedded as-is in the benchmark report, so the key order is kept stable.
XGB_BEST = dict(
    n_estimators=141,
    max_depth=5,
    learning_rate=0.016298172447266404,
    subsample=0.7660470794373848,
    colsample_bytree=0.471124415020467,
    min_child_weight=14,
    reg_alpha=1.9364166463791586,
    reg_lambda=6.018030083488602,
    gamma=4.614943551368141,
)
LGB_BEST = dict(
    n_estimators=186,
    max_depth=4,
    learning_rate=0.012915117465216954,
    num_leaves=141,
    subsample=0.6193119116922561,
    colsample_bytree=0.539310022549326,
    min_child_samples=9,
    reg_alpha=0.6864583098112754,
    reg_lambda=0.0549259590914184,
)

# Start-up banner (three lines, same as three separate prints).
print(
    "=" * 65,
    "TURF ENSEMBLE REBUILD (using pre-computed Optuna params)",
    "=" * 65,
    sep="\n",
)

# Stage 1: read the raw rows, then derive the model features from them.
print("\n[1/7] Loading data...")
df = engineer_features(load_data(DB_PATH))

# Stage 2: separate the holdout period, then build feature matrices/targets.
print("\n[2/7] Temporal split...")
train_df, holdout_df = temporal_split(df)
X_train, y_train, feat_cols = get_features_and_target(train_df)
X_holdout, y_holdout, _ = get_features_and_target(holdout_df)

# The positionally last 15% of the training rows are set aside as a
# validation slice; the models are fitted on the rest and the slice is used
# further down to derive the ensemble weights.
# NOTE(review): presumably the rows are in chronological order so this is
# the most recent part of the training period -- confirm in temporal_split.
n = len(X_train)
n_val = int(n * 0.15)
split_at = n - n_val
X_tr, X_val = X_train.iloc[:split_at], X_train.iloc[split_at:]
y_tr, y_val = y_train.iloc[:split_at], y_train.iloc[split_at:]

# Stage 3: score the previously saved XGBoost model on the holdout frame so
# the new ensemble has a reference point.
print("\n[3/7] Evaluating baseline XGBoost...")
baseline = evaluate_baseline(
    holdout_df,
    '/home/h3r7/turf_saas/xgboost_models.pkl',
)
print(f"  Baseline P@3={baseline['precision_at3']:.4f}  AUC={baseline['auc']:.4f}")

# Stage 4: fit the three base learners on the reduced training slice.  The
# tree models reuse the stored Optuna parameters; the MLP trainer receives
# the bare array (.values) rather than the DataFrame.
print("\n[4/7] Training models with best params...")

print("  XGBoost...")
xgb_model = train_xgboost(X_tr, y_tr, XGB_BEST)

print("  LightGBM...")
lgb_model = train_lightgbm(X_tr, y_tr, LGB_BEST)

print("  MLP...")
mlp_model = train_mlp(X_tr.values, y_tr)

# Stage 5: SHAP-based feature ranking from the XGBoost model.  The selection
# is only recorded in the report; the ensemble keeps using every feat_cols
# entry.
print("\n[5/7] SHAP analysis...")
selected_features, shap_df = shap_feature_selection(xgb_model, X_tr)

# Stage 6 banner; the adapter classes used for weighting are defined next.
print("\n[6/7] Computing ensemble weights...")
class WrappedMLP:
    """Adapter that lets the MLP pipeline be called with a DataFrame.

    The wrapped pipeline is given a bare array (``.values``), so this class
    first picks the feature columns out of the incoming frame.
    """

    def __init__(self, pipeline, cols):
        """Remember the pipeline and the feature columns it should be fed.

        Args:
            pipeline: Object exposing ``predict_proba(array)``.
            cols: Feature column names, in the order they are passed on.
        """
        self.pipeline = pipeline
        self.feature_cols = cols

    def predict_proba(self, X):
        """Return the pipeline's ``predict_proba`` for the feature columns of X.

        Columns are taken in ``feature_cols`` order.  Names listed in
        ``feature_cols`` but missing from ``X`` are skipped without an error,
        so the pipeline then receives fewer columns than were configured.

        Args:
            X: DataFrame-like object with ``.columns`` and label indexing.

        Returns:
            Whatever ``pipeline.predict_proba`` returns for the selected
            columns' ``.values``.
        """
        # No pandas import is needed here: only attributes of X are used.
        available = [c for c in self.feature_cols if c in X.columns]
        return self.pipeline.predict_proba(X[available].values)

class WrappedTree:
    """Adapter that restricts a DataFrame to the known feature columns
    before handing it to a tree model's ``predict_proba``."""

    def __init__(self, model, cols):
        """Keep the fitted model and the ordered feature column names."""
        self.feature_cols = cols
        self.model = model

    def predict_proba(self, X):
        """Score ``X`` using only the ``feature_cols`` that it contains.

        Columns are selected in ``feature_cols`` order; names absent from
        ``X`` are left out without raising.  The model receives a DataFrame
        slice (not a bare array).
        """
        frame_columns = X.columns
        usable = [name for name in self.feature_cols if name in frame_columns]
        subset = X[usable]
        return self.model.predict_proba(subset)

# Give all three models the same DataFrame-in / predict_proba interface so
# the weighting and evaluation helpers can treat them uniformly.
wrapped_xgb = WrappedTree(xgb_model, feat_cols)
wrapped_lgb = WrappedTree(lgb_model, feat_cols)
wrapped_mlp = WrappedMLP(mlp_model, feat_cols)
model_dict = {'xgboost': wrapped_xgb, 'lightgbm': wrapped_lgb, 'mlp': wrapped_mlp}

# Weights are derived on the validation slice, which the models were not
# fitted on.
weights = compute_ensemble_weights(model_dict, X_val, y_val, feat_cols)
print("  Weights:", weights)

print("\n[7/7] Evaluating + saving ensemble...")
# The ensemble is built from the raw (unwrapped) models plus the weights.
ensemble = TurfEnsemble(xgb_model, lgb_model, mlp_model, weights, feat_cols)

# Score every base model on the holdout set, keyed by model name.
results = {}
for name, wrapped in model_dict.items():
    res = evaluate_model(wrapped, X_holdout, y_holdout, holdout_df, name)
    results[name] = res
    print(f"  {name:12s} P@3={res['precision_at3']:.4f}  AUC={res['auc']:.4f}")

ens_res = evaluate_model(ensemble, X_holdout, y_holdout, holdout_df, "ensemble")
results["ensemble"] = ens_res
print(f"  {'ensemble':12s} P@3={ens_res['precision_at3']:.4f}  AUC={ens_res['auc']:.4f}")

# Deployment gate: the ensemble must beat the baseline precision@3 by at
# least DEPLOY_THRESHOLD.
# Both values are coerced to builtin float/bool because they are written to
# the JSON report later.  NOTE(review): evaluate_model is defined elsewhere;
# if it returns NumPy scalars, the comparison would yield numpy.bool_, which
# json.dump rejects with a TypeError after the pickles are already written.
# The coercion is a no-op when the values are already builtin types.
delta = float(ens_res['precision_at3'] - baseline['precision_at3'])
deploy = bool(delta >= DEPLOY_THRESHOLD)
print(f"\n  Delta: {delta:+.4f} ({delta*100:+.1f}%)  Deploy={'YES' if deploy else 'NO'}")

# Persist the combined ensemble.  This happens unconditionally: the deploy
# flag computed above does not gate the write.
ensemble_path = MODELS_DIR / "ensemble_top3.pkl"
with ensemble_path.open("wb") as fh:
    pickle.dump(ensemble, fh)
ensemble_kb = ensemble_path.stat().st_size // 1024
print(f"\n  ✅ ensemble_top3.pkl saved ({ensemble_kb} KB)")

# Persist the two tree models, each bundled with the feature list it needs.
tree_models = {"xgboost_optimized": xgb_model, "lightgbm": lgb_model}
for name, model in tree_models.items():
    path = MODELS_DIR / f"{name}_top3.pkl"
    bundle = {"model": model, "feature_cols": feat_cols}
    with path.open("wb") as fh:
        pickle.dump(bundle, fh)
    print(f"  ✅ {name}_top3.pkl saved")

# The MLP bundle stores its estimator under "pipeline" instead of "model".
mlp_path = MODELS_DIR / "mlp_top3.pkl"
mlp_bundle = {"pipeline": mlp_model, "feature_cols": feat_cols}
with mlp_path.open("wb") as fh:
    pickle.dump(mlp_bundle, fh)
print("  ✅ mlp_top3.pkl saved")

# Benchmark report: assembled section by section, then written as JSON and
# as Markdown.  Key order matches the layout of the JSON file.
train_dates = train_df["date_programme"]
holdout_dates = holdout_df["date_programme"]

# Per-model results are everything in `results` except the ensemble entry.
per_model_results = dict(results)
per_model_results.pop("ensemble", None)

dataset_section = {
    "db_path": DB_PATH,
    "total_rows": len(df),
    "train_rows": len(X_train),
    "holdout_rows": len(X_holdout),
    "train_date_range": [str(train_dates.min()), str(train_dates.max())],
    "holdout_date_range": [str(holdout_dates.min()), str(holdout_dates.max())],
}
optuna_section = {
    # Size of the original study whose best parameters are reused here.
    "n_trials": 100,
    "xgboost_best_params": XGB_BEST,
    "lightgbm_best_params": LGB_BEST,
}
features_section = {
    "total": len(feat_cols),
    "selected_by_shap": len(selected_features),
    "feature_list": feat_cols,
    "shap_selected": selected_features,
}

report = {
    "run_date": datetime.now().isoformat(),
    "dataset": dataset_section,
    "baseline": baseline,
    "individual_models": per_model_results,
    "ensemble": ens_res,
    "delta_precision_at3": round(delta, 4),
    "deploy": deploy,
    "optuna": optuna_section,
    "features": features_section,
    "ensemble_weights": weights,
}

# Machine-readable copy.
report_path = MODELS_DIR / "benchmark_report.json"
with open(report_path, "w") as fh:
    json.dump(report, fh, indent=2)
print("  ✅ benchmark_report.json saved")

# Human-readable copy, rendered by the helper from train_ensemble.
md_path = MODELS_DIR / "benchmark_report.md"
_write_markdown_report(report, md_path)
print("  ✅ benchmark_report.md saved")

# Closing summary: baseline vs. ensemble precision@3 and the deploy verdict,
# framed by the same 65-character rule as the start-up banner.
summary_lines = [
    "\n" + "=" * 65,
    "DONE",
    f"  Baseline P@3:  {baseline['precision_at3']:.4f}",
    f"  Ensemble P@3:  {ens_res['precision_at3']:.4f}",
    f"  Delta:         {delta:+.4f} ({delta*100:+.1f}%)",
    f"  Deploy:        {'✅ YES' if deploy else '❌ NO'}",
    "=" * 65,
]
print(*summary_lines, sep="\n")