300 lines
10 KiB
Python
300 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
XGBoost Training for Turf Predictions

- Predict top1 (winner) and top3 (placed)
- Cross-validation for robust evaluation
- Feature importance analysis
|
|
"""
|
|
|
|
import sqlite3
|
|
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
|
|
from sklearn.preprocessing import LabelEncoder
|
|
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
|
|
import xgboost as xgb
|
|
import os
|
|
import json
|
|
from datetime import datetime
|
|
|
|
# SQLite database read by the training script; the DB_PATH environment
# variable, when set, overrides the default location.
DB_PATH = os.getenv("DB_PATH", "/home/h3r7/turf_scraper/turf.db")

# Directory that receives the pickled models and feature-importance CSVs.
OUTPUT_DIR = "/home/h3r7/turf_scraper"
|
|
|
|
|
|
def load_data(db_path=None):
    """Load rows with a recorded finishing order from ``historical_data``.

    Args:
        db_path: Path of the SQLite database to read. When ``None`` (the
            default) the module-level ``DB_PATH`` is used, which keeps
            existing ``load_data()`` callers working unchanged.

    Returns:
        pandas.DataFrame containing the columns selected by the query below,
        restricted to rows where ``ordre_arrivee > 0``.

    Raises:
        Whatever ``sqlite3.connect`` or ``pandas.read_sql_query`` raise
        (e.g. when the table is missing); the connection is closed first.
    """
    query = """
        SELECT
            date, hippodrome, distance, discipline, allocation, nb_partants,
            horse_name, horse_number, age, sexe, musique,
            nb_courses, nb_victoires, nb_places, nb_places_2, nb_places_3,
            gains_carriere, gains_annee, gains_victoires,
            reduction_km, avis_entraineur, oeilleres, deferre,
            cote_directe, cote_reference, indicateur_tendance, est_favori,
            tx_victoire, tx_place, forme_recente, tendance_forme,
            nb_disq, rang_cote, ratio_cote_field,
            ordre_arrivee, top1, top3, top5
        FROM historical_data
        WHERE ordre_arrivee > 0
    """

    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Release the database handle even when the query fails; previously a
        # failing read left the connection open.
        conn.close()

    print(f"✅ Loaded {len(df)} rows from historical_data")
    return df
|
|
|
|
|
|
def create_features(df):
    """Return a copy of ``df`` extended with the engineered model features.

    Added columns:
        * ``discipline_enc``, ``sexe_enc``, ``avis_enc``, ``oeilleres_enc``,
          ``deferre_enc`` - label-encoded categoricals (missing values are
          replaced by a per-column placeholder before encoding).
        * ``form_1`` .. ``form_5`` and ``form_avg`` - the first five digit
          runs found in ``musique`` and their mean.
        * ``win_rate_adj``, ``place_rate_adj``, ``implied_prob``,
          ``victories_per_race``, ``places_per_race``, ``earnings_per_race``,
          ``age_win_interact``, ``distance_cat``, ``is_favorite``.

    Args:
        df: DataFrame holding the raw columns referenced below.

    Returns:
        A new DataFrame; the input is not modified.
    """
    # Function-scope import: the regex module is only needed here.
    import re

    df = df.copy()

    # Encode categorical variables. Each column gets its own freshly fitted
    # encoder; the encoders themselves are not kept.
    categorical_specs = (
        ('discipline', 'discipline_enc', 'UNKNOWN'),
        ('sexe', 'sexe_enc', 'U'),
        ('avis_entraineur', 'avis_enc', 'NEUTRE'),
        ('oeilleres', 'oeilleres_enc', 'SANS'),
        ('deferre', 'deferre_enc', 'NON'),
    )
    for source_col, encoded_col, missing_label in categorical_specs:
        df[encoded_col] = LabelEncoder().fit_transform(
            df[source_col].fillna(missing_label)
        )

    # Parse musique (last 5 races form)
    n_form = 5

    def parse_music(music):
        """Return exactly ``n_form`` ints taken from a string like "1a2a3a4a5a"."""
        if not music or pd.isna(music):
            return [0] * n_form
        # NOTE(review): every digit run counts as a finishing position, so any
        # other digits embedded in the string (e.g. a year marker) would be
        # picked up too - confirm against the actual musique format.
        tokens = re.findall(r'\d+', str(music))[:n_form]
        try:
            positions = [int(tok) for tok in tokens]
        except ValueError:
            # Only reachable for absurdly long digit runs; fall back to "no form".
            return [0] * n_form
        # Pad with zeros so every row yields the same number of slots.
        return positions + [0] * (n_form - len(positions))

    music_parsed = df['musique'].apply(parse_music)
    form_cols = [f'form_{slot}' for slot in range(1, n_form + 1)]
    for idx, col in enumerate(form_cols):
        df[col] = music_parsed.apply(lambda parsed, idx=idx: parsed[idx])

    # Average form (lower is better in turf)
    # NOTE(review): missing races are counted as 0 in this mean - confirm that
    # is intended given that lower is treated as better.
    df['form_avg'] = df[form_cols].mean(axis=1)

    # Win rate adjusted by number of races
    df['win_rate_adj'] = df['tx_victoire'] * np.log1p(df['nb_courses'])

    # Place rate adjusted
    df['place_rate_adj'] = df['tx_place'] * np.log1p(df['nb_courses'])

    # Odds implied probability (odds of 0 become NaN instead of dividing by 0)
    df['implied_prob'] = 1 / df['cote_directe'].replace(0, np.nan)

    # Performance metrics; a race count of 0 is replaced by 1 to avoid
    # division by zero.
    races = df['nb_courses'].replace(0, 1)
    df['victories_per_race'] = df['nb_victoires'] / races
    df['places_per_race'] = df['nb_places'] / races

    # Earnings per race
    df['earnings_per_race'] = df['gains_annee'] / races

    # Age-performance interaction
    df['age_win_interact'] = df['age'] * df['tx_victoire']

    # Distance category; values outside (0, 4000] fall in no bin and become NaN.
    df['distance_cat'] = pd.cut(
        df['distance'],
        bins=[0, 1500, 2000, 2500, 4000],
        labels=[1, 2, 3, 4],
    ).astype(float)

    # Favoritism indicator
    df['is_favorite'] = (df['cote_directe'] < 5).astype(int)

    return df
|
|
|
|
|
|
def prepare_ml_data(df, target_col):
    """Build the feature matrix and integer target vector for one model.

    Args:
        df: DataFrame containing (some of) the candidate feature columns and
            the target column.
        target_col: Name of the column to use as the target.

    Returns:
        Tuple ``(X, y, feature_cols)``: ``X`` holds the candidate features
        that exist in ``df`` with missing values set to 0, ``y`` is the
        target with missing values set to 0 and cast to ``int``, and
        ``feature_cols`` lists the columns of ``X`` in order.
    """
    candidate_features = (
        # raw runner / race attributes
        'age', 'sexe_enc', 'nb_courses', 'nb_victoires', 'nb_places',
        'tx_victoire', 'tx_place', 'forme_recente', 'reduction_km',
        'gains_annee', 'cote_directe', 'distance', 'nb_partants',
        # encoded categoricals
        'discipline_enc', 'avis_enc', 'oeilleres_enc', 'deferre_enc',
        # recent form
        'form_1', 'form_2', 'form_3', 'form_4', 'form_5', 'form_avg',
        # derived ratios and interactions
        'win_rate_adj', 'place_rate_adj', 'implied_prob',
        'victories_per_race', 'places_per_race', 'earnings_per_race',
        'age_win_interact', 'distance_cat', 'is_favorite',
        # odds-based ranks
        'rang_cote', 'ratio_cote_field',
    )

    # Keep only the candidates actually present, preserving the order above.
    available = df.columns
    feature_cols = [name for name in candidate_features if name in available]

    features = df[feature_cols].fillna(0)
    target = df[target_col].fillna(0).astype(int)
    return features, target, feature_cols
|
|
|
|
|
|
def train_xgboost_model(X, y, target_name):
    """Fit an XGBoost binary classifier and print hold-out and CV metrics.

    The data is split 80/20 (stratified on ``y``), a classifier is fitted on
    the training part, a 5-fold stratified cross-validation AUC is computed
    on the full data, and accuracy / AUC / a classification report are
    printed for the held-out part.

    NOTE(review): the split is row-wise. If several rows belong to the same
    race they can land on both sides of the split - confirm whether a grouped
    split is needed.

    Args:
        X: Feature matrix whose rows are aligned with ``y``.
        y: Binary (0/1) target vector.
        target_name: Label used in the printed output.

    Returns:
        Tuple ``(model, X_test, y_test, mean_cv_auc)``.

    Raises:
        ValueError: If ``y`` (or its training split) lacks one of the two
            classes, since the class weight and AUC are undefined then.
    """
    total_positive = int(y.sum())
    if total_positive == 0 or total_positive == len(y):
        raise ValueError(
            f"Cannot train {target_name}: target needs both classes "
            f"({total_positive} positives out of {len(y)} rows)"
        )

    print(f"\n{'='*60}")
    print(f"Training XGBoost for {target_name}")
    print(f"{'='*60}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    print(f"Positive class: {y.sum()} ({y.mean()*100:.1f}%)")

    # Class weight derived from the training labels only, so the held-out
    # labels do not influence any training hyper-parameter.
    train_positive = int(y_train.sum())
    if train_positive == 0:
        raise ValueError(
            f"Cannot train {target_name}: no positive rows in the training split"
        )
    scale_pos_weight = (len(y_train) - train_positive) / train_positive

    # XGBoost parameters
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': 6,
        'learning_rate': 0.1,
        'n_estimators': 100,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'scale_pos_weight': scale_pos_weight,  # Handle imbalance
        'random_state': 42,
        'verbosity': 0,
    }

    # Train model
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Cross-validation (cross_val_score fits fresh clones; `model` itself
    # stays fitted on the training split only)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')

    print(f"\nCross-validation AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std()*2:.3f})")

    # Test predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"\nTest Accuracy: {accuracy:.3f}")
    print(f"Test AUC: {auc:.3f}")

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Not ' + target_name, target_name]))

    return model, X_test, y_test, cv_scores.mean()
|
|
|
|
|
|
def analyze_feature_importance(model, feature_cols, target_name):
    """Rank the model's features by importance and print the top 15.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_cols: Feature names, in the same order as the importances.
        target_name: Label used in the printed heading.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        importance in descending order.
    """
    ranking = pd.DataFrame(
        {'feature': feature_cols, 'importance': model.feature_importances_}
    ).sort_values('importance', ascending=False)

    divider = '=' * 60
    print(f"\n{divider}")
    print(f"Top 15 Features for {target_name}:")
    print(divider)

    leaders = ranking.head(15)
    for name, score in zip(leaders['feature'], leaders['importance']):
        print(f" {name:25s} {score:.4f}")

    return ranking
|
|
|
|
|
|
def compare_with_baseline(y):
    """Print and return the positive rate of ``y`` as a random baseline.

    Only one target vector is supplied, so the same rate is printed under
    both the "top1" and "top3" labels and returned twice; callers pick the
    tuple slot matching the target they passed in.

    Args:
        y: Binary target vector exposing ``mean()``.

    Returns:
        Tuple ``(rate, rate)`` where ``rate`` is ``y.mean()``.
    """
    positive_rate = y.mean()

    divider = '=' * 60
    print(f"\n{divider}")
    print("Baseline Comparison:")
    print(divider)
    for label in ('top1', 'top3'):
        print(f" Random baseline ({label}): {positive_rate*100:.1f}%")

    return positive_rate, positive_rate
|
|
|
|
|
|
def main():
    """Run the full pipeline: load, featurize, train both models, save.

    Trains one classifier for ``top1`` and one for ``top3`` on the same
    feature matrix, prints a summary, pickles both models to
    ``OUTPUT_DIR/xgboost_models.pkl`` and writes one feature-importance CSV
    per target into ``OUTPUT_DIR``.

    Returns:
        Dict with keys ``cv_auc_top1``, ``cv_auc_top3``, ``baseline_top1``
        and ``baseline_top3``.
    """
    import pickle

    banner = "=" * 60

    print(f"\n{banner}")
    print("XGBoost Training for Turf Predictions")
    print(banner)

    # Load data
    df = load_data()

    # Create features
    df = create_features(df)

    # Train model for top1 (winner)
    print("\n" + banner)
    print("MODEL 1: Predicting TOP 1 (Winner)")
    print(banner)

    X, y_top1, feature_cols = prepare_ml_data(df, 'top1')
    model_top1, _, _, cv_auc_top1 = train_xgboost_model(X, y_top1, 'top1')
    importance_top1 = analyze_feature_importance(model_top1, feature_cols, 'top1')
    baseline_top1, _ = compare_with_baseline(y_top1)

    # Train model for top3 (placed); the feature matrix is the same, only the
    # target changes.
    print("\n" + banner)
    print("MODEL 2: Predicting TOP 3 (Placed)")
    print(banner)

    _, y_top3, _ = prepare_ml_data(df, 'top3')
    model_top3, _, _, cv_auc_top3 = train_xgboost_model(X, y_top3, 'top3')
    importance_top3 = analyze_feature_importance(model_top3, feature_cols, 'top3')
    _, baseline_top3 = compare_with_baseline(y_top3)

    # Summary. The "+" sign flag prints the real sign, so a model below 0.5
    # AUC shows "-x.x%" instead of the malformed "+-x.x%".
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print("\nTop 1 (Winner) Prediction:")
    print(f" - CV AUC: {cv_auc_top1:.3f}")
    print(f" - Improvement over random: {(cv_auc_top1 - 0.5)*100:+.1f}%")
    print("\nTop 3 (Placed) Prediction:")
    print(f" - CV AUC: {cv_auc_top3:.3f}")
    print(f" - Improvement over random: {(cv_auc_top3 - 0.5)*100:+.1f}%")

    # Make sure the destination exists before any file is written.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Save models
    model_path = f"{OUTPUT_DIR}/xgboost_models.pkl"
    with open(model_path, 'wb') as f:
        pickle.dump({
            'model_top1': model_top1,
            'model_top3': model_top3,
            'feature_cols': feature_cols,
            # NOTE(review): no fitted encoder is stored, so whoever loads this
            # file cannot reproduce the categorical encoding used in training
            # from the pickle alone - confirm how inference encodes categories.
            'discipline_encoder': None,
            'sexe_encoder': None,
        }, f)

    print(f"\n✅ Models saved to {model_path}")

    # Save feature importance
    importance_top1.to_csv(f"{OUTPUT_DIR}/feature_importance_top1.csv", index=False)
    importance_top3.to_csv(f"{OUTPUT_DIR}/feature_importance_top3.csv", index=False)
    print("✅ Feature importance saved")

    return {
        'cv_auc_top1': cv_auc_top1,
        'cv_auc_top3': cv_auc_top3,
        'baseline_top1': baseline_top1,
        'baseline_top3': baseline_top3,
    }
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the whole training pipeline and keep the
    # returned metrics dict in a module-level name.
    results = main()
|