#!/usr/bin/env python3
"""
XGBoost Training for Turf Predictions
- Predict top1 (winner) and top3 (placed)
- Cross-validation for robust evaluation
- Feature importance analysis
"""

import json
import os
import pickle
import re
import sqlite3
from contextlib import closing
from datetime import datetime
from operator import itemgetter

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db")
# Overridable through the environment, like DB_PATH; the default is unchanged.
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "/home/h3r7/turf_scraper")

# Source column -> (encoded feature name, placeholder used for missing values).
CATEGORICAL_COLUMNS = {
    'discipline': ('discipline_enc', 'UNKNOWN'),
    'sexe': ('sexe_enc', 'U'),
    'avis_entraineur': ('avis_enc', 'NEUTRE'),
    'oeilleres': ('oeilleres_enc', 'SANS'),
    'deferre': ('deferre_enc', 'NON'),
}

# Number of most recent results extracted from the "musique" string.
FORM_DEPTH = 5
FORM_COLUMNS = [f'form_{rank}' for rank in range(1, FORM_DEPTH + 1)]

# Candidate model inputs; columns absent from the frame are skipped at runtime.
FEATURE_COLUMNS = [
    'age', 'sexe_enc', 'nb_courses', 'nb_victoires', 'nb_places',
    'tx_victoire', 'tx_place', 'forme_recente', 'reduction_km',
    'gains_annee', 'cote_directe', 'distance', 'nb_partants',
    'discipline_enc', 'avis_enc', 'oeilleres_enc', 'deferre_enc',
    'form_1', 'form_2', 'form_3', 'form_4', 'form_5', 'form_avg',
    'win_rate_adj', 'place_rate_adj', 'implied_prob',
    'victories_per_race', 'places_per_race', 'earnings_per_race',
    'age_win_interact', 'distance_cat', 'is_favorite',
    'rang_cote', 'ratio_cote_field',
]

_DIGIT_RUNS = re.compile(r'\d+')

_HISTORICAL_QUERY = """
    SELECT
        date, hippodrome, distance, discipline, allocation, nb_partants,
        horse_name, horse_number, age, sexe, musique,
        nb_courses, nb_victoires, nb_places, nb_places_2, nb_places_3,
        gains_carriere, gains_annee, gains_victoires,
        reduction_km, avis_entraineur, oeilleres, deferre,
        cote_directe, cote_reference, indicateur_tendance, est_favori,
        tx_victoire, tx_place, forme_recente, tendance_forme, nb_disq,
        rang_cote, ratio_cote_field,
        ordre_arrivee, top1, top3, top5
    FROM historical_data
    WHERE ordre_arrivee > 0
"""


def load_data():
    """Load historical data from database.

    Reads every row of ``historical_data`` with ``ordre_arrivee > 0`` from the
    SQLite file at ``DB_PATH``.

    Returns:
        A DataFrame with one row per selected database row.
    """
    # sqlite3's own context manager only commits/rolls back; closing() makes
    # sure the connection is released even when the query raises.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        df = pd.read_sql_query(_HISTORICAL_QUERY, conn)

    print(f"✅ Loaded {len(df)} rows from historical_data")
    return df


def fit_encoders(df):
    """Fit one LabelEncoder per categorical column.

    Missing values are replaced by the column's placeholder before fitting, so
    the placeholder becomes a regular class whenever the data contains gaps.

    Args:
        df: Frame containing every key of ``CATEGORICAL_COLUMNS``.

    Returns:
        Dict mapping source column name to its fitted LabelEncoder.
    """
    return {
        column: LabelEncoder().fit(df[column].fillna(fill_value))
        for column, (_, fill_value) in CATEGORICAL_COLUMNS.items()
    }


def _parse_music(music):
    """Return the first FORM_DEPTH numbers found in a musique string.

    The string looks like "1a2a3a4a5a"; every run of digits is taken as one
    result. The list is padded with zeros so it always has FORM_DEPTH entries.
    """
    blank = [0] * FORM_DEPTH
    if not music or pd.isna(music):
        return blank
    try:
        results = [int(run) for run in _DIGIT_RUNS.findall(str(music))[:FORM_DEPTH]]
    except (TypeError, ValueError):
        # Best effort: an unparsable value is treated like a missing one.
        return blank
    return results + [0] * (FORM_DEPTH - len(results))


def create_features(df, encoders=None):
    """Create features for ML model.

    Args:
        df: Raw frame as returned by ``load_data``. It is not modified.
        encoders: Optional dict of fitted LabelEncoders keyed by source column
            (see ``fit_encoders``). When omitted, fresh encoders are fitted on
            ``df`` itself. Pass the training encoders to reproduce the same
            integer codes on new data.

    Returns:
        A copy of ``df`` with the encoded and engineered columns added.

    Raises:
        ValueError: If a supplied encoder meets a label it was not fitted on.
    """
    df = df.copy()
    if encoders is None:
        encoders = fit_encoders(df)

    # Encode categorical variables
    for column, (encoded_name, fill_value) in CATEGORICAL_COLUMNS.items():
        df[encoded_name] = encoders[column].transform(df[column].fillna(fill_value))

    # Parse musique (last 5 races form)
    parsed_music = df['musique'].apply(_parse_music)
    for position, form_column in enumerate(FORM_COLUMNS):
        df[form_column] = parsed_music.map(itemgetter(position))

    # Average form (lower is better in turf)
    df['form_avg'] = df[FORM_COLUMNS].mean(axis=1)

    # Win / place rates weighted by experience
    experience = np.log1p(df['nb_courses'])
    df['win_rate_adj'] = df['tx_victoire'] * experience
    df['place_rate_adj'] = df['tx_place'] * experience

    # Odds implied probability; odds of 0 become NaN instead of dividing by zero
    df['implied_prob'] = 1 / df['cote_directe'].replace(0, np.nan)

    # Per-race ratios; horses without any race divide by 1 so the ratio is 0
    races = df['nb_courses'].replace(0, 1)
    df['victories_per_race'] = df['nb_victoires'] / races
    df['places_per_race'] = df['nb_places'] / races
    df['earnings_per_race'] = df['gains_annee'] / races

    # Age-performance interaction
    df['age_win_interact'] = df['age'] * df['tx_victoire']

    # Distance category
    # NOTE(review): distances above 4000 fall outside the bins and become NaN
    # (0 after prepare_ml_data's fillna). Kept as is so existing consumers of
    # the saved models see the same feature; confirm whether that is intended.
    df['distance_cat'] = pd.cut(
        df['distance'],
        bins=[0, 1500, 2000, 2500, 4000],
        labels=[1, 2, 3, 4],
    ).astype(float)

    # Favoritism indicator
    df['is_favorite'] = (df['cote_directe'] < 5).astype(int)

    return df


def prepare_ml_data(df, target_col):
    """Prepare features and target for ML.

    Args:
        df: Frame produced by ``create_features``.
        target_col: Name of the binary target column (e.g. 'top1', 'top3').

    Returns:
        Tuple ``(X, y, feature_cols)``: the feature frame with NaN replaced by
        0, the integer target series, and the list of feature names used.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    # Filter valid features
    feature_cols = [name for name in FEATURE_COLUMNS if name in df.columns]
    skipped = [name for name in FEATURE_COLUMNS if name not in df.columns]
    if skipped:
        print(f"Warning: skipping missing feature columns: {skipped}")

    X = df[feature_cols].fillna(0)
    y = df[target_col].fillna(0).astype(int)

    return X, y, feature_cols


def train_xgboost_model(X, y, target_name):
    """Train XGBoost model with cross-validation.

    Fits a classifier on a stratified 80% split, reports 5-fold
    cross-validated AUC on the full data, and prints accuracy, AUC and a
    classification report for the held-out 20%.

    Args:
        X: Feature frame.
        y: Binary target series aligned with ``X``.
        target_name: Label used in the printed output.

    Returns:
        Tuple ``(model, X_test, y_test, mean_cv_auc)``.

    Raises:
        ValueError: If ``y`` does not contain both classes.
    """
    print(f"\n{'='*60}")
    print(f"Training XGBoost for {target_name}")
    print(f"{'='*60}")

    positives = int(y.sum())
    negatives = len(y) - positives
    if positives == 0 or negatives == 0:
        # Without both classes the class weight and AUC are undefined.
        raise ValueError(
            f"Target '{target_name}' needs both classes "
            f"(positives={positives}, negatives={negatives})"
        )

    # Split data
    # NOTE(review): the split is random per row, so runners of the same race
    # can land on both sides; consider a per-race split if that matters.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    print(f"Positive class: {y.sum()} ({y.mean()*100:.1f}%)")

    # XGBoost parameters
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': 6,
        'learning_rate': 0.1,
        'n_estimators': 100,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'scale_pos_weight': negatives / positives,  # Handle imbalance
        'random_state': 42,
        'verbosity': 0,
    }

    # Train model
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Cross-validation (cross_val_score fits clones; `model` stays as trained)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')

    print(f"\nCross-validation AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std()*2:.3f})")

    # Test predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"\nTest Accuracy: {accuracy:.3f}")
    print(f"Test AUC: {auc:.3f}")

    # Classification report
    print(f"\nClassification Report:")
    print(classification_report(
        y_test, y_pred, target_names=['Not ' + target_name, target_name]
    ))

    return model, X_test, y_test, cv_scores.mean()


def analyze_feature_importance(model, feature_cols, target_name):
    """Analyze and display feature importance.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_cols: Feature names in the order the model was trained on.
        target_name: Label used in the printed header.

    Returns:
        DataFrame with 'feature' and 'importance' columns, most important
        first. The top 15 rows are also printed.
    """
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_,
    }).sort_values('importance', ascending=False)

    print(f"\n{'='*60}")
    print(f"Top 15 Features for {target_name}:")
    print(f"{'='*60}")

    for row in importance_df.head(15).itertuples(index=False):
        print(f" {row.feature:25s} {row.importance:.4f}")

    return importance_df


def compare_with_baseline(y, target_name=None):
    """Compare with baseline (random) performance.

    The baseline is the positive rate of ``y``.

    Args:
        y: Binary target series.
        target_name: Optional label. When given, a single line labelled with
            it is printed. When omitted, the legacy output is kept: the same
            rate is printed under both the "top1" and "top3" labels.

    Returns:
        Tuple ``(rate, rate)``. Both entries are the positive rate of ``y``;
        the two-element shape is kept for existing callers.
    """
    positive_rate = y.mean()

    print(f"\n{'='*60}")
    print("Baseline Comparison:")
    print(f"{'='*60}")
    if target_name is None:
        print(f" Random baseline (top1): {positive_rate*100:.1f}%")
        print(f" Random baseline (top3): {positive_rate*100:.1f}%")
    else:
        print(f" Random baseline ({target_name}): {positive_rate*100:.1f}%")

    return positive_rate, positive_rate


def main():
    """Train the top1 and top3 models, print a summary, and save artifacts.

    Writes ``xgboost_models.pkl`` and two feature-importance CSV files into
    ``OUTPUT_DIR``.

    Returns:
        Dict with the cross-validated AUC and baseline rate of each target.
    """
    print(f"\n{'='*60}")
    print("XGBoost Training for Turf Predictions")
    print(f"{'='*60}")

    # Load data
    df = load_data()

    # Create features, keeping the fitted encoders so they can be saved
    encoders = fit_encoders(df)
    df = create_features(df, encoders)

    # Train model for top1 (winner)
    print("\n" + "="*60)
    print("MODEL 1: Predicting TOP 1 (Winner)")
    print("="*60)

    X, y_top1, feature_cols = prepare_ml_data(df, 'top1')
    model_top1, _, _, cv_auc_top1 = train_xgboost_model(X, y_top1, 'top1')
    importance_top1 = analyze_feature_importance(model_top1, feature_cols, 'top1')
    baseline_top1, _ = compare_with_baseline(y_top1, 'top1')

    # Train model for top3 (placed)
    print("\n" + "="*60)
    print("MODEL 2: Predicting TOP 3 (Placed)")
    print("="*60)

    _, y_top3, _ = prepare_ml_data(df, 'top3')
    model_top3, _, _, cv_auc_top3 = train_xgboost_model(X, y_top3, 'top3')
    importance_top3 = analyze_feature_importance(model_top3, feature_cols, 'top3')
    _, baseline_top3 = compare_with_baseline(y_top3, 'top3')

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    # The ":+" format prints the sign itself, so a negative delta reads "-1.2%".
    print(f"\nTop 1 (Winner) Prediction:")
    print(f" - CV AUC: {cv_auc_top1:.3f}")
    print(f" - Improvement over random: {(cv_auc_top1 - 0.5)*100:+.1f}%")

    print(f"\nTop 3 (Placed) Prediction:")
    print(f" - CV AUC: {cv_auc_top3:.3f}")
    print(f" - Improvement over random: {(cv_auc_top3 - 0.5)*100:+.1f}%")

    # Save models together with the encoders needed to rebuild the features
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    model_path = f"{OUTPUT_DIR}/xgboost_models.pkl"
    with open(model_path, 'wb') as f:
        pickle.dump({
            'model_top1': model_top1,
            'model_top3': model_top3,
            'feature_cols': feature_cols,
            'discipline_encoder': encoders['discipline'],
            'sexe_encoder': encoders['sexe'],
            'avis_encoder': encoders['avis_entraineur'],
            'oeilleres_encoder': encoders['oeilleres'],
            'deferre_encoder': encoders['deferre'],
        }, f)

    print(f"\n✅ Models saved to {model_path}")

    # Save feature importance
    importance_top1.to_csv(f"{OUTPUT_DIR}/feature_importance_top1.csv", index=False)
    importance_top3.to_csv(f"{OUTPUT_DIR}/feature_importance_top3.csv", index=False)
    print(f"✅ Feature importance saved")

    return {
        'cv_auc_top1': cv_auc_top1,
        'cv_auc_top3': cv_auc_top3,
        'baseline_top1': baseline_top1,
        'baseline_top3': baseline_top3,
    }


if __name__ == "__main__":
    results = main()