Files
turf_saas/train_xgboost.py
2026-04-25 17:18:43 +02:00

300 lines
10 KiB
Python

#!/usr/bin/env python3
"""
XGBoost Training for Turf Predictions
- Predict top1 (winner) and top3 (placed)
- Cross-validation for robust evaluation
- Feature importance analysis
"""
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import xgboost as xgb
import os
import json
from datetime import datetime
# SQLite database read by load_data(); the DB_PATH environment variable,
# when set, takes precedence over the built-in default.
DB_PATH = os.getenv("DB_PATH", "/home/h3r7/turf_scraper/turf.db")

# Directory into which main() writes the pickled models and the
# feature-importance CSV files.
OUTPUT_DIR = "/home/h3r7/turf_scraper"
def load_data(db_path=None):
    """Load historical rows with a recorded finishing order from SQLite.

    Args:
        db_path: Path of the SQLite database to read. When omitted, the
            module-level ``DB_PATH`` is used.

    Returns:
        pandas.DataFrame holding the columns listed in the query for every
        row of ``historical_data`` whose ``ordre_arrivee`` is greater than 0.

    Raises:
        Whatever ``sqlite3`` / ``pandas`` raise when the database or the
        table cannot be read; the connection is closed in that case too.
    """
    query = """
        SELECT
            date, hippodrome, distance, discipline, allocation, nb_partants,
            horse_name, horse_number, age, sexe, musique,
            nb_courses, nb_victoires, nb_places, nb_places_2, nb_places_3,
            gains_carriere, gains_annee, gains_victoires,
            reduction_km, avis_entraineur, oeilleres, deferre,
            cote_directe, cote_reference, indicateur_tendance, est_favori,
            tx_victoire, tx_place, forme_recente, tendance_forme,
            nb_disq, rang_cote, ratio_cote_field,
            ordre_arrivee, top1, top3, top5
        FROM historical_data
        WHERE ordre_arrivee > 0
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Release the handle even when the query fails (e.g. missing table).
        conn.close()
    print(f"✅ Loaded {len(df)} rows from historical_data")
    return df
def create_features(df):
    """Add the engineered model features to a copy of *df*.

    The input frame is not modified. The returned copy gains:

    * ``discipline_enc``, ``sexe_enc``, ``avis_enc``, ``oeilleres_enc``,
      ``deferre_enc`` - integer codes for the categorical columns;
    * ``form_1`` .. ``form_5`` and ``form_avg`` - numbers extracted from the
      ``musique`` string and their mean;
    * ``win_rate_adj``, ``place_rate_adj``, ``implied_prob``,
      ``victories_per_race``, ``places_per_race``, ``earnings_per_race``,
      ``age_win_interact``, ``distance_cat``, ``is_favorite``.

    Args:
        df: DataFrame containing at least the raw columns read below
            (``discipline``, ``sexe``, ``avis_entraineur``, ``oeilleres``,
            ``deferre``, ``musique``, ``tx_victoire``, ``tx_place``,
            ``nb_courses``, ``cote_directe``, ``nb_victoires``,
            ``nb_places``, ``gains_annee``, ``age``, ``distance``).

    Returns:
        A new DataFrame with the columns above appended.
    """
    import re  # the module-level imports do not provide ``re``

    out = df.copy()

    # Integer-encode the categoricals. A code is the rank of the value among
    # the sorted distinct values present in *this* frame, so the mapping is
    # data-dependent and is not stored anywhere.
    categorical = (
        ('discipline', 'discipline_enc', 'UNKNOWN'),
        ('sexe', 'sexe_enc', 'U'),
        ('avis_entraineur', 'avis_enc', 'NEUTRE'),
        ('oeilleres', 'oeilleres_enc', 'SANS'),
        ('deferre', 'deferre_enc', 'NON'),
    )
    for source, target, missing in categorical:
        codes, _ = pd.factorize(out[source].fillna(missing), sort=True)
        out[target] = codes

    # Parse musique (recent form), e.g. "1a2a3a4a5a" -> [1, 2, 3, 4, 5].
    # NOTE(review): every run of digits is taken as a result, so any other
    # number embedded in the string (e.g. a parenthesised year) is counted
    # as a finishing position - confirm against the real data.
    digit_runs = re.compile(r'\d+')
    form_cols = ['form_1', 'form_2', 'form_3', 'form_4', 'form_5']
    n_forms = len(form_cols)

    def parse_music(music):
        """Return the first five numbers found in *music*, zero-padded."""
        if not music or pd.isna(music):
            return [0] * n_forms
        try:
            found = [int(tok) for tok in digit_runs.findall(str(music))[:n_forms]]
        except ValueError:
            # Unconvertible digit run: keep the best-effort all-zero fallback.
            return [0] * n_forms
        return found + [0] * (n_forms - len(found))

    parsed = [parse_music(value) for value in out['musique']]
    for position, column in enumerate(form_cols):
        out[column] = [row[position] for row in parsed]

    # Average form (lower is better in turf).
    # NOTE(review): padding zeros take part in the mean, so short or missing
    # histories pull the average down - confirm this is intended.
    out['form_avg'] = out[form_cols].mean(axis=1)

    # Win / place rates weighted by (log) experience.
    experience = np.log1p(out['nb_courses'])
    out['win_rate_adj'] = out['tx_victoire'] * experience
    out['place_rate_adj'] = out['tx_place'] * experience

    # Odds-implied probability; odds of 0 become NaN instead of dividing by 0.
    out['implied_prob'] = 1 / out['cote_directe'].replace(0, np.nan)

    # Per-race ratios; a race count of 0 is treated as 1 to avoid dividing by 0.
    races = out['nb_courses'].replace(0, 1)
    out['victories_per_race'] = out['nb_victoires'] / races
    out['places_per_race'] = out['nb_places'] / races
    out['earnings_per_race'] = out['gains_annee'] / races

    # Age-performance interaction.
    out['age_win_interact'] = out['age'] * out['tx_victoire']

    # Distance category 1-4; values outside (0, 4000] get NaN.
    out['distance_cat'] = pd.cut(
        out['distance'],
        bins=[0, 1500, 2000, 2500, 4000],
        labels=[1, 2, 3, 4],
    ).astype(float)

    # Favoritism indicator: odds strictly below 5.
    out['is_favorite'] = (out['cote_directe'] < 5).astype(int)

    return out
def prepare_ml_data(df, target_col):
    """Build the feature matrix and target vector for one prediction task.

    Args:
        df: DataFrame holding raw and engineered columns.
        target_col: Name of the column to use as the target.

    Returns:
        Tuple ``(X, y, feature_cols)``: ``X`` is ``df`` restricted to the
        known feature columns that are actually present, with missing values
        replaced by 0; ``y`` is the target with missing values replaced by 0
        and cast to int; ``feature_cols`` is the list of column names in
        ``X``, in order.
    """
    wanted = (
        # raw runner / race attributes
        'age', 'sexe_enc', 'nb_courses', 'nb_victoires', 'nb_places',
        'tx_victoire', 'tx_place', 'forme_recente', 'reduction_km',
        'gains_annee', 'cote_directe', 'distance', 'nb_partants',
        # encoded categoricals
        'discipline_enc', 'avis_enc', 'oeilleres_enc', 'deferre_enc',
        # recent form
        'form_1', 'form_2', 'form_3', 'form_4', 'form_5', 'form_avg',
        # derived ratios and interactions
        'win_rate_adj', 'place_rate_adj', 'implied_prob',
        'victories_per_race', 'places_per_race', 'earnings_per_race',
        'age_win_interact', 'distance_cat', 'is_favorite',
        # odds rank within the field
        'rang_cote', 'ratio_cote_field',
    )

    # Columns missing from the frame are skipped rather than raising.
    selected = []
    for name in wanted:
        if name in df.columns:
            selected.append(name)

    features = df[selected].fillna(0)
    target = df[target_col].fillna(0).astype(int)
    return features, target, selected
def train_xgboost_model(X, y, target_name):
    """Fit an XGBoost binary classifier and report hold-out and CV metrics.

    The data is split 80/20 (stratified on ``y``); the model is fitted on the
    training part and scored on the test part (accuracy, ROC AUC,
    classification report). Independently, a 5-fold stratified
    cross-validation over the full ``X``/``y`` reports the mean ROC AUC.

    Args:
        X: Feature matrix.
        y: Binary 0/1 target exposing ``sum()``, ``mean()`` and ``len()``.
        target_name: Label used in the printed report.

    Returns:
        Tuple ``(model, X_test, y_test, mean_cv_auc)``.

    Raises:
        ValueError: If ``y`` does not contain both classes; the class weight
            and the AUC metrics are undefined in that case.
    """
    positives = int(y.sum())
    negatives = len(y) - positives
    if positives == 0 or negatives == 0:
        raise ValueError(
            f"Cannot train {target_name}: target needs both classes "
            f"(positives={positives}, negatives={negatives})"
        )

    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Training XGBoost for {target_name}")
    print(rule)

    # NOTE(review): the split is per row; if several rows belong to one race
    # they can land on both sides of the split - confirm that is acceptable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    print(f"Positive class: {y.sum()} ({y.mean()*100:.1f}%)")

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': 6,
        'learning_rate': 0.1,
        'n_estimators': 100,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        # Negative/positive ratio, to counter class imbalance.
        'scale_pos_weight': negatives / positives,
        'random_state': 42,
        'verbosity': 0,
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Cross-validation over the full data set with the same estimator settings.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
    print(f"\nCross-validation AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std()*2:.3f})")

    # Hold-out evaluation.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    print(f"\nTest Accuracy: {accuracy:.3f}")
    print(f"Test AUC: {auc:.3f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Not ' + target_name, target_name]))

    return model, X_test, y_test, cv_scores.mean()
def analyze_feature_importance(model, feature_cols, target_name):
    """Rank features by importance and print the 15 highest.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_cols: Feature names, in the order matching the importances.
        target_name: Label used in the printed heading.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        descending importance.
    """
    ranking = pd.DataFrame(
        {'feature': feature_cols, 'importance': model.feature_importances_}
    )
    ranking = ranking.sort_values('importance', ascending=False)

    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Top 15 Features for {target_name}:")
    print(rule)

    leaders = ranking.head(15)
    for name, score in zip(leaders['feature'], leaders['importance']):
        print(f" {name:25s} {score:.4f}")

    return ranking
def compare_with_baseline(y):
    """Print and return the positive-class rate of *y* as a chance baseline.

    Args:
        y: Binary 0/1 target exposing ``mean()``.

    Returns:
        Tuple ``(rate, rate)``. Both slots carry the same positive rate of
        the target that was passed in; the two-element shape is kept so that
        callers can unpack either position.
    """
    rate = y.mean()

    rule = '=' * 60
    print(f"\n{rule}")
    print("Baseline Comparison:")
    print(rule)
    # Only one target is known here, so report a single line instead of
    # labelling the same number as both "top1" and "top3".
    print(f" Random baseline (positive rate of target): {rate*100:.1f}%")

    return rate, rate
def main():
    """Train the top1 and top3 models end to end and persist the results.

    Steps: load the historical rows, engineer features, train and evaluate
    one classifier per target, print a summary, then write the pickled
    models and one feature-importance CSV per target into ``OUTPUT_DIR``.

    Returns:
        dict with keys ``cv_auc_top1``, ``cv_auc_top3``, ``baseline_top1``
        and ``baseline_top3``.
    """
    import pickle

    rule = "=" * 60
    print(f"\n{rule}")
    print("XGBoost Training for Turf Predictions")
    print(rule)

    # Load data and create features.
    df = load_data()
    df = create_features(df)

    # Model 1: winner.
    print("\n" + rule)
    print("MODEL 1: Predicting TOP 1 (Winner)")
    print(rule)
    X, y_top1, feature_cols = prepare_ml_data(df, 'top1')
    model_top1, X_test_top1, y_test_top1, cv_auc_top1 = train_xgboost_model(X, y_top1, 'top1')
    importance_top1 = analyze_feature_importance(model_top1, feature_cols, 'top1')
    baseline_top1, _ = compare_with_baseline(y_top1)

    # Model 2: placed. The feature matrix is the same; only the target changes.
    print("\n" + rule)
    print("MODEL 2: Predicting TOP 3 (Placed)")
    print(rule)
    _, y_top3, _ = prepare_ml_data(df, 'top3')
    model_top3, X_test_top3, y_test_top3, cv_auc_top3 = train_xgboost_model(X, y_top3, 'top3')
    importance_top3 = analyze_feature_importance(model_top3, feature_cols, 'top3')
    _, baseline_top3 = compare_with_baseline(y_top3)

    # Summary. The ``+`` format flag prints the sign itself, so a CV AUC
    # below 0.5 shows as "-x.x%" rather than "+-x.x%".
    print("\n" + rule)
    print("SUMMARY")
    print(rule)
    print("\nTop 1 (Winner) Prediction:")
    print(f" - CV AUC: {cv_auc_top1:.3f}")
    print(f" - Improvement over random: {(cv_auc_top1 - 0.5)*100:+.1f}%")
    print("\nTop 3 (Placed) Prediction:")
    print(f" - CV AUC: {cv_auc_top3:.3f}")
    print(f" - Improvement over random: {(cv_auc_top3 - 0.5)*100:+.1f}%")

    # Make sure the destination exists before writing any artefact.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Save models.
    # NOTE(review): both encoder slots are stored as None, so the
    # category-to-code mapping used for training is not persisted.
    model_path = os.path.join(OUTPUT_DIR, "xgboost_models.pkl")
    with open(model_path, 'wb') as f:
        pickle.dump({
            'model_top1': model_top1,
            'model_top3': model_top3,
            'feature_cols': feature_cols,
            'discipline_encoder': None,
            'sexe_encoder': None,
        }, f)
    print(f"\n✅ Models saved to {model_path}")

    # Save feature importance.
    importance_top1.to_csv(os.path.join(OUTPUT_DIR, "feature_importance_top1.csv"), index=False)
    importance_top3.to_csv(os.path.join(OUTPUT_DIR, "feature_importance_top3.csv"), index=False)
    print("✅ Feature importance saved")

    return {
        'cv_auc_top1': cv_auc_top1,
        'cv_auc_top3': cv_auc_top3,
        'baseline_top1': baseline_top1,
        'baseline_top3': baseline_top3,
    }
# Script entry point: run the training pipeline and keep the returned
# metrics dict available at module level as ``results``.
if __name__ == "__main__":
    results = main()