525 lines
17 KiB
Python
Executable File
525 lines
17 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Analyse REX - Corrélations et calibration des poids du scoring
|
|
Lit historical_data + performance pour améliorer le modèle de prédiction
|
|
"""
|
|
|
|
import sqlite3
|
|
import json
|
|
import os
|
|
import math
|
|
from datetime import datetime, timedelta
|
|
|
|
# Location of the SQLite database read by this script. The DB_PATH
# environment variable, when set, overrides the hard-coded default.
DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db")
|
|
|
|
# ============================================================
|
|
# UTILITAIRES STATISTIQUES
|
|
# ============================================================
|
|
|
|
def moyenne(valeurs):
    """Return the arithmetic mean of the non-None entries of ``valeurs``.

    ``None`` entries are skipped. When nothing is left to average (empty
    input, or only ``None`` values) the result is ``0``.
    """
    connues = [val for val in valeurs if val is not None]
    if not connues:
        return 0
    return sum(connues) / len(connues)
|
|
|
|
def ecart_type(valeurs):
    """Return the population standard deviation of the non-None entries.

    ``None`` entries are skipped. With fewer than two usable values the
    result is ``0``. The squared deviations are divided by the number of
    values (population formula), not by ``n - 1``.
    """
    connues = [val for val in valeurs if val is not None]
    if len(connues) >= 2:
        centre = moyenne(connues)
        somme_carres = sum((val - centre) ** 2 for val in connues)
        return math.sqrt(somme_carres / len(connues))
    return 0
|
|
|
|
def correlation_pearson(x_list, y_list, min_paires=5):
    """Return the Pearson correlation between two lists, rounded to 4 decimals.

    The lists are paired positionally with ``zip`` (extra items of the longer
    list are ignored) and any pair where either side is ``None`` is dropped.

    Args:
        x_list: first series of numbers (``None`` allowed).
        y_list: second series of numbers (``None`` allowed).
        min_paires: minimum number of complete pairs required to compute a
            correlation. Defaults to 5, the threshold previously hard-coded.

    Returns:
        The correlation coefficient rounded to 4 decimals, or ``0`` when
        there are fewer than ``min_paires`` complete pairs, no pair at all,
        or when one of the series has zero variance.
    """
    paires = [(x, y) for x, y in zip(x_list, y_list)
              if x is not None and y is not None]
    n = len(paires)
    # The explicit n == 0 test keeps the mean computation safe even when a
    # caller lowers min_paires to 0.
    if n == 0 or n < min_paires:
        return 0

    mx = sum(x for x, _ in paires) / n
    my = sum(y for _, y in paires) / n

    covariance = sum((x - mx) * (y - my) for x, y in paires)
    var_x = sum((x - mx) ** 2 for x, _ in paires)
    var_y = sum((y - my) ** 2 for _, y in paires)

    den = math.sqrt(var_x * var_y)
    # A constant series has zero variance: the correlation is undefined,
    # reported as 0 rather than raising ZeroDivisionError.
    return round(covariance / den, 4) if den else 0
|
|
|
|
def taux_top5_par_segment(conn, feature, nb_segments=5):
    """
    Split a feature into equal-sized segments and compute each one's top5 rate.

    Rows of ``historical_data`` whose ``feature`` value is non-NULL and > 0
    (and whose ``top5`` is non-NULL) are sorted by ascending feature value
    and cut into ``nb_segments`` consecutive groups; the last group absorbs
    the remainder. This shows whether the feature is discriminating.

    Args:
        conn: open sqlite3 connection.
        feature: column name of ``historical_data``. It has to be inserted
            into the SQL text (column names cannot be bound as parameters),
            so it must be a plain ASCII identifier.
        nb_segments: number of segments to produce.

    Returns:
        A list of dicts with keys ``segment`` (1-based), ``min``, ``max``,
        ``nb`` and ``taux_top5`` (percentage, 1 decimal). The list is empty
        when ``nb_segments`` is below 1 or there are fewer rows than segments.

    Raises:
        ValueError: if ``feature`` is not a plain identifier.
    """
    import re

    # Reject anything that is not a bare column name before it reaches the
    # SQL text: this is the only protection against injection here.
    if not isinstance(feature, str) or not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", feature):
        raise ValueError(f"Invalid feature column name: {feature!r}")

    if nb_segments < 1:
        return []

    cur = conn.cursor()
    # top5 IS NOT NULL: a NULL would come back as None and break the sum below.
    cur.execute(f"""
        SELECT {feature}, top5
        FROM historical_data
        WHERE {feature} IS NOT NULL AND {feature} > 0
          AND top5 IS NOT NULL
        ORDER BY {feature} ASC
    """)
    rows = cur.fetchall()
    if len(rows) < nb_segments:
        return []

    segment_size = len(rows) // nb_segments
    segments = []
    for i in range(nb_segments):
        debut = i * segment_size
        # The last segment takes whatever is left after integer division.
        fin = debut + segment_size if i < nb_segments - 1 else len(rows)
        seg = rows[debut:fin]
        vals = [r[0] for r in seg]
        top5s = [r[1] for r in seg]
        segments.append({
            'segment': i + 1,
            'min': round(min(vals), 2),
            'max': round(max(vals), 2),
            'nb': len(seg),
            'taux_top5': round(sum(top5s) / len(top5s) * 100, 1),
        })
    return segments
|
|
|
|
# ============================================================
|
|
# ANALYSES PRINCIPALES
|
|
# ============================================================
|
|
|
|
def analyse_volume(conn):
    """Return general statistics about the rows stored in historical_data.

    The result is a dict with the number of distinct dates, the row count,
    the number of distinct race names, the first and last date, the average
    top1 / top5 rates as percentages (1 decimal) and, per discipline, the
    number of distinct dates (most frequent first).
    """
    cur = conn.cursor()

    # Global aggregates over the whole table, all in one pass.
    cur.execute("""
        SELECT COUNT(DISTINCT date) as jours,
               COUNT(*) as lignes,
               MIN(date) as debut,
               MAX(date) as fin,
               AVG(top1) as taux_gagnant,
               AVG(top5) as taux_top5
        FROM historical_data
    """)
    jours, lignes, premiere_date, derniere_date, moy_top1, moy_top5 = cur.fetchone()

    cur.execute("SELECT COUNT(DISTINCT race_name) FROM historical_data")
    (total_courses,) = cur.fetchone()

    cur.execute("""
        SELECT discipline, COUNT(DISTINCT date) as nb
        FROM historical_data
        GROUP BY discipline
        ORDER BY nb DESC
    """)
    par_discipline = cur.fetchall()

    # AVG() is NULL on an empty table, hence the "or 0" fallbacks.
    pct_gagnant = round((moy_top1 or 0) * 100, 1)
    pct_top5 = round((moy_top5 or 0) * 100, 1)

    synthese = {}
    synthese['nb_jours'] = jours
    synthese['nb_lignes'] = lignes
    synthese['nb_courses'] = total_courses
    synthese['debut'] = premiere_date
    synthese['fin'] = derniere_date
    synthese['taux_gagnant'] = pct_gagnant
    synthese['taux_top5'] = pct_top5
    synthese['disciplines'] = par_discipline
    return synthese
|
|
|
|
|
|
def analyse_correlations(conn):
    """
    Compute the correlation of every feature with top5 and top1.

    Rows are read from historical_data where cote_directe > 0. The result is
    a list of dicts (``feature``, ``corr_top1``, ``corr_top5``, ``abs_top5``)
    sorted by decreasing absolute correlation with top5. A negative top5
    correlation means the outcome improves as the feature decreases
    (e.g. low odds -> high top5 rate).
    """
    cur = conn.cursor()
    cur.execute("""
        SELECT cote_directe, cote_reference, reduction_km,
               forme_recente, tendance_forme, tx_victoire, tx_place,
               gains_carriere, gains_annee, rang_cote, ratio_cote_field,
               nb_courses, nb_victoires, age, driver_change,
               indicateur_tendance, nb_disq,
               top1, top5
        FROM historical_data
        WHERE cote_directe > 0
    """)
    lignes = cur.fetchall()

    # Must list the feature columns in the same order as the SELECT above;
    # top1 and top5 are the two columns that follow them.
    noms_features = (
        'cote_directe', 'cote_reference', 'reduction_km',
        'forme_recente', 'tendance_forme', 'tx_victoire', 'tx_place',
        'gains_carriere', 'gains_annee', 'rang_cote', 'ratio_cote_field',
        'nb_courses', 'nb_victoires', 'age', 'driver_change',
        'indicateur_tendance', 'nb_disq',
    )
    col_top1 = len(noms_features)
    col_top5 = col_top1 + 1

    cible_top1 = [ligne[col_top1] for ligne in lignes]
    cible_top5 = [ligne[col_top5] for ligne in lignes]

    mesures = []
    for position, nom in enumerate(noms_features):
        colonne = [ligne[position] for ligne in lignes]
        r_top1 = correlation_pearson(colonne, cible_top1)
        r_top5 = correlation_pearson(colonne, cible_top5)
        mesures.append(dict(
            feature=nom,
            corr_top1=r_top1,
            corr_top5=r_top5,
            abs_top5=abs(r_top5),
        ))

    # Strongest absolute correlation with top5 first.
    return sorted(mesures, key=lambda mesure: mesure['abs_top5'], reverse=True)
|
|
|
|
|
|
def analyse_cote(conn):
    """Measure how well the odds (cote_directe) predict the result.

    For each odds bracket [min, max), rows with ordre_arrivee > 0 are
    aggregated into a dict holding the bracket label, the row count, the
    top1 and top5 rates (percent, 1 decimal) and the average finishing
    position. Brackets without any row are left out.
    """
    cur = conn.cursor()

    requete = """
        SELECT COUNT(*) as nb,
               AVG(top1) as taux_gagnant,
               AVG(top5) as taux_top5,
               AVG(ordre_arrivee) as pos_moy
        FROM historical_data
        WHERE cote_directe >= ? AND cote_directe < ? AND ordre_arrivee > 0
    """

    # (lower bound inclusive, upper bound exclusive, label)
    tranches = (
        (0, 3, "Très favori (< 3)"),
        (3, 6, "Favori (3-6)"),
        (6, 10, "Second favori (6-10)"),
        (10, 20, "Outsider (10-20)"),
        (20, 50, "Longshot (20-50)"),
        (50, 999, "Outsider extrême (50+)"),
    )

    bilan = []
    for borne_basse, borne_haute, libelle in tranches:
        cur.execute(requete, (borne_basse, borne_haute))
        effectif, moy_top1, moy_top5, moy_position = cur.fetchone()
        if effectif <= 0:
            continue
        bilan.append({
            'tranche': libelle,
            'nb': effectif,
            'taux_gagnant': round((moy_top1 or 0) * 100, 1),
            'taux_top5': round((moy_top5 or 0) * 100, 1),
            'position_moy': round(moy_position or 0, 1),
        })
    return bilan
|
|
|
|
|
|
def analyse_forme(conn):
    """Measure how well recent form (forme_recente) predicts the result.

    For each form bracket [min, max), rows with ordre_arrivee > 0 are
    aggregated into a dict holding the bracket label, the row count and the
    top1 / top5 rates (percent, 1 decimal). Brackets without any row are
    left out.
    """
    cur = conn.cursor()

    requete = """
        SELECT COUNT(*) as nb,
               AVG(top1) as taux_gagnant,
               AVG(top5) as taux_top5
        FROM historical_data
        WHERE forme_recente >= ? AND forme_recente < ? AND ordre_arrivee > 0
    """

    # (lower bound inclusive, upper bound exclusive, label)
    tranches = (
        (0, 1.5, "Excellente (< 1.5)"),
        (1.5, 3, "Bonne (1.5-3)"),
        (3, 5, "Moyenne (3-5)"),
        (5, 8, "Mauvaise (5-8)"),
        (8, 99, "Très mauvaise (8+)"),
    )

    bilan = []
    for borne_basse, borne_haute, libelle in tranches:
        cur.execute(requete, (borne_basse, borne_haute))
        effectif, moy_top1, moy_top5 = cur.fetchone()
        if effectif <= 0:
            continue
        bilan.append({
            'tranche': libelle,
            'nb': effectif,
            'taux_gagnant': round((moy_top1 or 0) * 100, 1),
            'taux_top5': round((moy_top5 or 0) * 100, 1),
        })
    return bilan
|
|
|
|
|
|
def analyse_avis_entraineur(conn):
    """Measure the impact of the trainer's opinion (avis_entraineur).

    Rows with ordre_arrivee > 0 are grouped by avis_entraineur and ordered
    by decreasing average top5. Each group becomes a dict with the opinion
    value, the row count, the top1 / top5 rates (percent, 1 decimal) and the
    average cote_directe (1 decimal).
    """
    cur = conn.cursor()
    cur.execute("""
        SELECT avis_entraineur,
               COUNT(*) as nb,
               AVG(top1) as taux_gagnant,
               AVG(top5) as taux_top5,
               AVG(cote_directe) as cote_moy
        FROM historical_data
        WHERE ordre_arrivee > 0
        GROUP BY avis_entraineur
        ORDER BY AVG(top5) DESC
    """)

    bilan = []
    for avis, effectif, moy_top1, moy_top5, moy_cote in cur.fetchall():
        bilan.append({
            'avis': avis,
            'nb': effectif,
            'taux_gagnant': round((moy_top1 or 0) * 100, 1),
            'taux_top5': round((moy_top5 or 0) * 100, 1),
            'cote_moy': round(moy_cote or 0, 1),
        })
    return bilan
|
|
|
|
|
|
def analyse_driver_change(conn):
    """Measure the impact of a driver change.

    Rows with ordre_arrivee > 0 are grouped by driver_change. Each group
    becomes a dict with a 'Oui' / 'Non' label (any falsy driver_change value
    is reported as 'Non'), the row count and the top1 / top5 rates
    (percent, 1 decimal).
    """
    cur = conn.cursor()
    cur.execute("""
        SELECT driver_change,
               COUNT(*) as nb,
               AVG(top1) as taux_gagnant,
               AVG(top5) as taux_top5
        FROM historical_data
        WHERE ordre_arrivee > 0
        GROUP BY driver_change
    """)

    bilan = []
    for indicateur, effectif, moy_top1, moy_top5 in cur.fetchall():
        if indicateur:
            libelle = 'Oui'
        else:
            libelle = 'Non'
        bilan.append({
            'driver_change': libelle,
            'nb': effectif,
            'taux_gagnant': round((moy_top1 or 0) * 100, 1),
            'taux_top5': round((moy_top5 or 0) * 100, 1),
        })
    return bilan
|
|
|
|
|
|
def analyse_top_chevaux(conn):
    """Return the best-performing horses found in historical_data.

    Only rows with ordre_arrivee > 0 are considered, and only horses with at
    least 3 such rows. Horses are ranked by average top5 (then by number of
    wins), and at most 15 are returned.

    Returns:
        A list of dicts with keys ``cheval``, ``nb_courses``, ``nb_gagnant``,
        ``nb_top5``, ``tx_top5`` (percent, 1 decimal) and ``cote_moy``
        (1 decimal).
    """
    cur = conn.cursor()
    cur.execute("""
        SELECT horse_name,
               COUNT(*) as nb_courses,
               SUM(top1) as nb_gagnant,
               SUM(top5) as nb_top5,
               AVG(cote_directe) as cote_moy
        FROM historical_data
        WHERE ordre_arrivee > 0
        GROUP BY horse_name
        HAVING COUNT(*) >= 3
        ORDER BY AVG(top5) DESC, SUM(top1) DESC
        LIMIT 15
    """)

    classement = []
    for cheval, nb_courses, nb_gagnant, nb_top5, cote_moy in cur.fetchall():
        # SUM() yields NULL when every value in the group is NULL; fall back
        # to 0 (as the other analyses do) so the ratio below cannot raise.
        nb_gagnant = nb_gagnant or 0
        nb_top5 = nb_top5 or 0
        classement.append({
            'cheval': cheval,
            'nb_courses': nb_courses,
            'nb_gagnant': nb_gagnant,
            'nb_top5': nb_top5,
            # nb_courses is >= 3 thanks to the HAVING clause, never zero.
            'tx_top5': round(nb_top5 / nb_courses * 100, 1),
            'cote_moy': round(cote_moy or 0, 1),
        })
    return classement
|
|
|
|
|
|
# ============================================================
|
|
# CALIBRATION DES POIDS
|
|
# ============================================================
|
|
|
|
def calibrer_poids(correlations):
    """
    Recompute the scoring weights from the REX correlations.

    Each scoring criterion receives the mean absolute top5 correlation of
    the features mapped to it; the scores are then normalised so they add up
    to 100%. Criteria with stronger absolute correlation get more weight.

    Returns a dict with the current weights (``poids_actuels``), the
    calibrated weights (``poids_calibres``) and their difference (``delta``).
    """
    # Feature -> criterion used by the current scoring.
    critere_par_feature = {
        'cote_directe': 'cote',
        'forme_recente': 'forme',
        'tx_victoire': 'tx_victoire',
        'tx_place': 'tx_place',
        'reduction_km': 'reduction_km',
        'tendance_forme': 'tendance',
        'rang_cote': 'cote',  # correlated with the odds
    }

    # Absolute correlations collected per criterion.
    echantillons = {
        'cote': [],
        'forme': [],
        'tx_victoire': [],
        'tx_place': [],
        'reduction_km': [],
        'tendance': [],
        'avis': [0.05],  # fixed value (trainer opinion is hard to correlate)
    }

    for mesure in correlations:
        cible = critere_par_feature.get(mesure['feature'])
        if not cible or cible not in echantillons:
            continue
        echantillons[cible].append(mesure['abs_top5'])

    # One score per criterion; a criterion without any sample gets 0.01.
    scores = {
        nom: (moyenne(valeurs) if valeurs else 0.01)
        for nom, valeurs in echantillons.items()
    }

    # Normalise so that the weights sum to 100%.
    somme_scores = sum(scores.values())
    poids_calibres = {
        nom: round(score / somme_scores * 100, 1)
        for nom, score in scores.items()
    }

    # Current weights, used as the reference.
    poids_actuels = {
        'cote': 20.0,
        'forme': 25.0,
        'tx_victoire': 15.0,
        'tx_place': 15.0,
        'reduction_km': 10.0,
        'tendance': 10.0,
        'avis': 5.0,
    }

    ecarts = {}
    for nom in poids_actuels:
        ecarts[nom] = round(poids_calibres.get(nom, 0) - poids_actuels.get(nom, 0), 1)

    return {
        'poids_actuels': poids_actuels,
        'poids_calibres': poids_calibres,
        'delta': ecarts,
    }
|
|
|
|
|
|
# ============================================================
|
|
# RAPPORT
|
|
# ============================================================
|
|
|
|
def print_rapport(volume, correlations, cote_analyse, forme_analyse,
                  avis_analyse, driver_analyse, top_chevaux, poids):
    """Print the full REX report on standard output.

    Args:
        volume: dict produced by analyse_volume.
        correlations: list produced by analyse_correlations (first 10 shown).
        cote_analyse: list produced by analyse_cote.
        forme_analyse: list produced by analyse_forme.
        avis_analyse: list produced by analyse_avis_entraineur.
        driver_analyse: list produced by analyse_driver_change.
        top_chevaux: list produced by analyse_top_chevaux (first 10 shown).
        poids: dict produced by calibrer_poids.

    Values that come from SQL NULLs (a NULL discipline, trainer opinion or
    horse name, a NULL sum) are shown as "N/A" / 0 instead of crashing.
    """

    def _texte(valeur):
        # None does not accept alignment specs such as '<20' (TypeError),
        # so NULL labels are replaced by a printable placeholder.
        return "N/A" if valeur is None else str(valeur)

    print(f"\n{'='*65}")
    print(f"📊 ANALYSE REX — {datetime.now().strftime('%d/%m/%Y %H:%M')}")
    print(f"{'='*65}")

    # Volume
    print(f"\n📦 DONNÉES DISPONIBLES")
    print(f" Jours : {volume['nb_jours']}")
    print(f" Courses : {volume['nb_courses']}")
    print(f" Partants : {volume['nb_lignes']}")
    print(f" Période : {volume['debut']} → {volume['fin']}")
    print(f" Taux top5 moyen : {volume['taux_top5']}%")
    # NOTE(review): analyse_volume fills this count with COUNT(DISTINCT date),
    # i.e. days rather than races — confirm the "courses" label is intended.
    for disc, nb in volume['disciplines']:
        print(f" {_texte(disc):<20} : {nb} courses")

    # Correlations
    print(f"\n🔗 CORRÉLATIONS FEATURES → TOP5 (triées par force)")
    print(f" {'FEATURE':<22} {'CORR TOP5':>10} {'CORR TOP1':>10} {'FORCE'}")
    print(f" {'─'*55}")
    for c in correlations[:10]:
        force = '●●●' if c['abs_top5'] > 0.15 else '●●' if c['abs_top5'] > 0.08 else '●'
        direction = '▼ (inverse)' if c['corr_top5'] < 0 else '▲ (direct) '
        print(f" {c['feature']:<22} {c['corr_top5']:>10.4f} {c['corr_top1']:>10.4f} {force} {direction}")

    # Odds analysis
    print(f"\n🎰 TAUX DE RÉUSSITE PAR TRANCHE DE COTE")
    print(f" {'TRANCHE':<28} {'NB':>5} {'GAGNANT':>8} {'TOP5':>6} {'POS MOY':>8}")
    print(f" {'─'*60}")
    for t in cote_analyse:
        print(f" {t['tranche']:<28} {t['nb']:>5} {t['taux_gagnant']:>7.1f}% {t['taux_top5']:>5.1f}% {t['position_moy']:>8.1f}")

    # Form analysis
    print(f"\n🏃 TAUX DE RÉUSSITE PAR FORME RÉCENTE")
    print(f" {'FORME':<28} {'NB':>5} {'GAGNANT':>8} {'TOP5':>6}")
    print(f" {'─'*50}")
    for t in forme_analyse:
        print(f" {t['tranche']:<28} {t['nb']:>5} {t['taux_gagnant']:>7.1f}% {t['taux_top5']:>5.1f}%")

    # Trainer opinion (the NULL group is a normal outcome of the GROUP BY)
    print(f"\n👨🏫 IMPACT AVIS ENTRAÎNEUR")
    print(f" {'AVIS':<20} {'NB':>5} {'GAGNANT':>8} {'TOP5':>6} {'COTE MOY':>9}")
    print(f" {'─'*52}")
    for a in avis_analyse:
        print(f" {_texte(a['avis']):<20} {a['nb']:>5} {a['taux_gagnant']:>7.1f}% {a['taux_top5']:>5.1f}% {a['cote_moy']:>9.1f}")

    # Driver change
    print(f"\n🔄 IMPACT CHANGEMENT DE DRIVER")
    for d in driver_analyse:
        print(f" Changement {d['driver_change']:<4} : {d['nb']} courses · "
              f"gagnant {d['taux_gagnant']}% · top5 {d['taux_top5']}%")

    # Top horses
    if top_chevaux:
        print(f"\n🏆 TOP CHEVAUX (≥3 courses)")
        print(f" {'CHEVAL':<28} {'COURSES':>8} {'GAGNANT':>8} {'TOP5':>6} {'TX TOP5':>8} {'COTE MOY':>9}")
        print(f" {'─'*70}")
        for h in top_chevaux[:10]:
            # NULL sums are displayed as 0 so the numeric alignment works.
            print(f" {_texte(h['cheval']):<28} {h['nb_courses']:>8} {(h['nb_gagnant'] or 0):>8} "
                  f"{(h['nb_top5'] or 0):>6} {h['tx_top5']:>7.1f}% {h['cote_moy']:>9.1f}")

    # Weight calibration
    print(f"\n⚖️ CALIBRATION DES POIDS DU SCORING")
    print(f" {'CRITÈRE':<16} {'ACTUEL':>8} {'CALIBRÉ':>8} {'DELTA':>8} RECOMMANDATION")
    print(f" {'─'*60}")
    for critere in poids['poids_actuels']:
        actuel = poids['poids_actuels'][critere]
        calibre = poids['poids_calibres'].get(critere, actuel)
        delta = poids['delta'].get(critere, 0)
        # Only differences larger than 2 points trigger a recommendation.
        if delta > 2:
            reco = "↑ AUGMENTER"
        elif delta < -2:
            reco = "↓ RÉDUIRE"
        else:
            reco = "→ OK"
        print(f" {critere:<16} {actuel:>7.1f}% {calibre:>7.1f}% {delta:>+7.1f}% {reco}")

    # Final advice depends on how many days of history are available.
    nb_jours = volume['nb_jours']
    if nb_jours < 30:
        print(f"\n⚠️ Données insuffisantes ({nb_jours} jours) — attendez 30+ jours avant d'appliquer la calibration")
    elif nb_jours < 100:
        print(f"\n✅ Données suffisantes pour calibration Phase 2 ({nb_jours} jours)")
        print(f" Recommandation : appliquer les nouveaux poids dans scoring.py")
    else:
        print(f"\n🚀 Données suffisantes pour ML Phase 3 ({nb_jours} jours)")
        print(f" Recommandation : entraîner un modèle XGBoost")

    print(f"\n{'='*65}\n")
|
|
|
|
|
|
# ============================================================
|
|
# MAIN
|
|
# ============================================================
|
|
|
|
def main():
    """Run every REX analysis, print the report and save it as JSON.

    Steps: open the SQLite database at DB_PATH, stop early with a hint when
    historical_data is empty, run the analyses, print the report, then write
    ``rex_analyse_YYYYMMDD.json`` into the directory given by the TURF_DIR
    environment variable (default ``/home/h3r7/turf_scraper``).
    """
    print(f"\n{'='*65}")
    print(f"🧠 ANALYSE REX — {datetime.now().strftime('%d/%m/%Y %H:%M')}")
    print(f"{'='*65}\n")

    conn = sqlite3.connect(DB_PATH)
    # try/finally guarantees the connection is released even if one of the
    # analyses raises (and on the early return below).
    try:
        # Check that there is data to analyse
        c = conn.cursor()
        c.execute("SELECT COUNT(*) FROM historical_data")
        nb = c.fetchone()[0]

        if nb == 0:
            print("❌ Aucune donnée dans historical_data.")
            print(" Lancez d'abord : python3 historical_loader.py --days 365")
            return

        print(f"✅ {nb} lignes trouvées dans historical_data\n")

        # Analyses
        print("📊 Calcul des statistiques...")
        volume = analyse_volume(conn)
        correlations = analyse_correlations(conn)
        cote_analyse = analyse_cote(conn)
        forme_analyse = analyse_forme(conn)
        avis_analyse = analyse_avis_entraineur(conn)
        driver_analyse = analyse_driver_change(conn)
        top_chevaux = analyse_top_chevaux(conn)
        poids = calibrer_poids(correlations)
    finally:
        conn.close()

    # Display the report
    print_rapport(volume, correlations, cote_analyse, forme_analyse,
                  avis_analyse, driver_analyse, top_chevaux, poids)

    # Save the report as JSON
    rapport = {
        'date': datetime.now().isoformat(),
        'volume': volume,
        'correlations': correlations,
        'cote_analyse': cote_analyse,
        'forme_analyse': forme_analyse,
        'avis_analyse': avis_analyse,
        'driver_analyse': driver_analyse,
        'top_chevaux': top_chevaux,
        'poids_calibres': poids,
    }

    turf_dir = os.environ.get('TURF_DIR', '/home/h3r7/turf_scraper')
    path = os.path.join(turf_dir, f"rex_analyse_{datetime.now().strftime('%Y%m%d')}.json")
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(rapport, f, indent=2, ensure_ascii=False)
    print(f"📁 Rapport sauvegardé : {path}")
|
|
|
|
|
|
# Run the analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|