# turf_saas/analyse_rex.py

#!/usr/bin/env python3
"""
Analyse REX - Corrélations et calibration des poids du scoring
Lit historical_data + performance pour améliorer le modèle de prédiction
"""

import sqlite3
import json
import os
import math
from datetime import datetime, timedelta

# Path of the SQLite database opened by main(); taken from the DB_PATH
# environment variable when it is set, otherwise the hard-coded default.
DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db")

# ============================================================
# UTILITAIRES STATISTIQUES
# ============================================================

def moyenne(valeurs):
    """Return the arithmetic mean of the non-None items of ``valeurs``.

    None entries are skipped. When no usable value remains, 0 is returned
    instead of raising.
    """
    total = 0
    compte = 0
    for element in valeurs:
        if element is None:
            continue
        total += element
        compte += 1
    if compte == 0:
        return 0
    return total / compte

def ecart_type(valeurs):
    """Return the population standard deviation of the non-None values.

    The sum of squared deviations is divided by the number of kept values
    (not by n - 1). Returns 0 when fewer than two values remain.
    """
    retenus = list(filter(lambda x: x is not None, valeurs))
    effectif = len(retenus)
    if effectif < 2:
        return 0
    centre = moyenne(retenus)
    somme_carres = sum((x - centre) ** 2 for x in retenus)
    return math.sqrt(somme_carres / effectif)

def correlation_pearson(x_list, y_list):
    """Return the Pearson correlation coefficient of two paired lists.

    Pairs in which either member is None are dropped. With fewer than five
    usable pairs, or when the denominator is zero, the result is 0;
    otherwise the coefficient is rounded to 4 decimals.
    """
    xs, ys = [], []
    for a, b in zip(x_list, y_list):
        if a is None or b is None:
            continue
        xs.append(a)
        ys.append(b)
    if len(xs) < 5:
        return 0

    mx = moyenne(xs)
    my = moyenne(ys)
    covariation = sum((a - mx) * (b - my) for a, b in zip(xs, ys))
    dispersion_x = sum((a - mx)**2 for a in xs)
    dispersion_y = sum((b - my)**2 for b in ys)
    den = math.sqrt(dispersion_x * dispersion_y)
    if not den:
        return 0
    return round(covariation / den, 4)

def taux_top5_par_segment(conn, feature, nb_segments=5):
    """Split a feature into ordered segments and compute each one's top5 rate.

    Rows of ``historical_data`` with a non-NULL, strictly positive
    ``feature`` are sorted by that feature and cut into ``nb_segments``
    consecutive slices of equal size; the last slice also takes the
    remainder rows. Comparing the rates across slices shows whether the
    feature discriminates.

    Args:
        conn: open DB-API connection exposing ``cursor()``.
        feature: column of ``historical_data`` to segment on. It is
            interpolated into the SQL text because column names cannot be
            bound as parameters, so only trusted column names must be
            passed here.
        nb_segments: number of slices to produce.

    Returns:
        A list of dicts with keys ``segment`` (1-based index), ``min``,
        ``max``, ``nb`` and ``taux_top5`` (percentage, 1 decimal). The list
        is empty when ``nb_segments`` is below 1 or when there are fewer
        rows than segments.
    """
    # A zero segment count used to reach the integer division below and
    # raise ZeroDivisionError; no segment can be produced anyway.
    if nb_segments < 1:
        return []

    c = conn.cursor()
    # Rows without a top5 value are excluded: they cannot contribute to a
    # rate and would make the sum() below fail on None.
    c.execute(f"""
        SELECT {feature}, top5
        FROM historical_data
        WHERE {feature} IS NOT NULL AND {feature} > 0
          AND top5 IS NOT NULL
        ORDER BY {feature} ASC
    """)
    rows = c.fetchall()
    if len(rows) < nb_segments:
        return []

    segment_size = len(rows) // nb_segments
    segments = []
    for i in range(nb_segments):
        debut = i * segment_size
        # The last segment absorbs the rows left over by the division.
        fin = len(rows) if i == nb_segments - 1 else debut + segment_size
        seg = rows[debut:fin]
        vals = [r[0] for r in seg]
        top5s = [r[1] for r in seg]
        segments.append({
            'segment': i + 1,
            'min': round(min(vals), 2),
            'max': round(max(vals), 2),
            'nb': len(seg),
            'taux_top5': round(sum(top5s) / len(top5s) * 100, 1)
        })
    return segments

# ============================================================
# ANALYSES PRINCIPALES
# ============================================================

def analyse_volume(conn):
    """Return overall volume statistics about the historical_data table.

    The result dict holds: the number of distinct dates, the row count,
    the number of distinct race names, the first and last date, the mean
    of top1 and top5 as percentages (1 decimal, 0 when the average is
    NULL), and the list of (discipline, distinct-date count) rows sorted
    by that count in descending order.
    """
    cur = conn.cursor()

    # Global aggregates over the whole table.
    cur.execute("""
        SELECT COUNT(DISTINCT date) as jours,
               COUNT(*) as lignes,
               MIN(date) as debut,
               MAX(date) as fin,
               AVG(top1) as taux_gagnant,
               AVG(top5) as taux_top5
        FROM historical_data
    """)
    jours, lignes, debut, fin, moy_top1, moy_top5 = cur.fetchone()

    cur.execute("SELECT COUNT(DISTINCT race_name) FROM historical_data")
    (nb_courses,) = cur.fetchone()

    # Per-discipline breakdown, counted in distinct dates.
    cur.execute("""
        SELECT discipline, COUNT(DISTINCT date) as nb
        FROM historical_data
        GROUP BY discipline
        ORDER BY nb DESC
    """)
    par_discipline = cur.fetchall()

    bilan = {}
    bilan['nb_jours'] = jours
    bilan['nb_lignes'] = lignes
    bilan['nb_courses'] = nb_courses
    bilan['debut'] = debut
    bilan['fin'] = fin
    bilan['taux_gagnant'] = round((moy_top1 or 0) * 100, 1)
    bilan['taux_top5'] = round((moy_top5 or 0) * 100, 1)
    bilan['disciplines'] = par_discipline
    return bilan


def analyse_correlations(conn):
    """Correlate every numeric feature with the top1 and top5 outcomes.

    Only rows whose cote_directe is strictly positive are read. For each
    feature the Pearson correlation with top1 and with top5 is computed,
    and the entries are returned sorted by decreasing absolute top5
    correlation, so a strongly negative feature (e.g. low odds going with
    a high top5 rate) ranks as high as a strongly positive one.
    """
    # Same order as the first 17 columns of the SELECT below; top1 and
    # top5 come right after them.
    noms_features = (
        'cote_directe', 'cote_reference', 'reduction_km',
        'forme_recente', 'tendance_forme', 'tx_victoire', 'tx_place',
        'gains_carriere', 'gains_annee', 'rang_cote', 'ratio_cote_field',
        'nb_courses', 'nb_victoires', 'age', 'driver_change',
        'indicateur_tendance', 'nb_disq',
    )

    cur = conn.cursor()
    cur.execute("""
        SELECT cote_directe, cote_reference, reduction_km,
               forme_recente, tendance_forme, tx_victoire, tx_place,
               gains_carriere, gains_annee, rang_cote, ratio_cote_field,
               nb_courses, nb_victoires, age, driver_change,
               indicateur_tendance, nb_disq,
               top1, top5
        FROM historical_data
        WHERE cote_directe > 0
    """)
    lignes = cur.fetchall()

    position_top1 = len(noms_features)
    position_top5 = position_top1 + 1
    cible_top1 = [ligne[position_top1] for ligne in lignes]
    cible_top5 = [ligne[position_top5] for ligne in lignes]

    def _entree(index, nom):
        # One report entry for the feature stored in column ``index``.
        colonne = [ligne[index] for ligne in lignes]
        r_top1 = correlation_pearson(colonne, cible_top1)
        r_top5 = correlation_pearson(colonne, cible_top5)
        return {
            'feature':    nom,
            'corr_top1':  r_top1,
            'corr_top5':  r_top5,
            'abs_top5':   abs(r_top5),
        }

    entrees = [_entree(index, nom) for index, nom in enumerate(noms_features)]
    # Strongest absolute link with top5 first.
    return sorted(entrees, key=lambda e: e['abs_top5'], reverse=True)


def analyse_cote(conn):
    """Return success rates per odds bracket (cote_directe).

    For each bracket [min, max) the rows with ordre_arrivee > 0 are
    aggregated into: row count, top1 rate and top5 rate (percentages,
    1 decimal) and the mean finishing position. Brackets without any row
    are left out of the returned list.
    """
    # NOTE(review): the last bracket is labelled "50+" but its upper bound
    # excludes odds >= 999 — confirm whether 999 is a deliberate cut-off.
    bornes = (
        (0, 3,    "Très favori (< 3)"),
        (3, 6,    "Favori (3-6)"),
        (6, 10,   "Second favori (6-10)"),
        (10, 20,  "Outsider (10-20)"),
        (20, 50,  "Longshot (20-50)"),
        (50, 999, "Outsider extrême (50+)"),
    )
    requete = """
            SELECT COUNT(*) as nb,
                   AVG(top1) as taux_gagnant,
                   AVG(top5) as taux_top5,
                   AVG(ordre_arrivee) as pos_moy
            FROM historical_data
            WHERE cote_directe >= ? AND cote_directe < ? AND ordre_arrivee > 0
        """

    cur = conn.cursor()
    bilan = []
    for borne_inf, borne_sup, libelle in bornes:
        cur.execute(requete, (borne_inf, borne_sup))
        nb, moy_top1, moy_top5, pos_moy = cur.fetchone()
        if not nb > 0:
            continue
        bilan.append({
            'tranche':       libelle,
            'nb':            nb,
            'taux_gagnant':  round((moy_top1 or 0) * 100, 1),
            'taux_top5':     round((moy_top5 or 0) * 100, 1),
            'position_moy':  round(pos_moy or 0, 1),
        })
    return bilan


def analyse_forme(conn):
    """Return success rates per recent-form bracket (forme_recente).

    For each bracket [min, max) the rows with ordre_arrivee > 0 are
    aggregated into a row count plus top1 and top5 rates (percentages,
    1 decimal). Brackets without any row are left out of the result.
    """
    # NOTE(review): the last bracket is labelled "8+" but its upper bound
    # excludes values >= 99 — confirm whether 99 is a deliberate cut-off.
    bornes = (
        (0, 1.5,  "Excellente (< 1.5)"),
        (1.5, 3,  "Bonne (1.5-3)"),
        (3, 5,    "Moyenne (3-5)"),
        (5, 8,    "Mauvaise (5-8)"),
        (8, 99,   "Très mauvaise (8+)"),
    )
    requete = """
            SELECT COUNT(*) as nb,
                   AVG(top1) as taux_gagnant,
                   AVG(top5) as taux_top5
            FROM historical_data
            WHERE forme_recente >= ? AND forme_recente < ? AND ordre_arrivee > 0
        """

    cur = conn.cursor()
    bilan = []
    for borne_inf, borne_sup, libelle in bornes:
        cur.execute(requete, (borne_inf, borne_sup))
        nb, moy_top1, moy_top5 = cur.fetchone()
        if not nb > 0:
            continue
        bilan.append({
            'tranche':      libelle,
            'nb':           nb,
            'taux_gagnant': round((moy_top1 or 0) * 100, 1),
            'taux_top5':    round((moy_top5 or 0) * 100, 1),
        })
    return bilan


def analyse_avis_entraineur(conn):
    """Return outcome statistics grouped by trainer opinion.

    Rows with ordre_arrivee > 0 are grouped by avis_entraineur. Each group
    yields its row count, top1 and top5 rates (percentages, 1 decimal) and
    mean cote_directe (1 decimal); groups are ordered by decreasing mean
    top5. The 'avis' value is the raw group key from the database.
    """
    cur = conn.cursor()
    cur.execute("""
        SELECT avis_entraineur,
               COUNT(*) as nb,
               AVG(top1) as taux_gagnant,
               AVG(top5) as taux_top5,
               AVG(cote_directe) as cote_moy
        FROM historical_data
        WHERE ordre_arrivee > 0
        GROUP BY avis_entraineur
        ORDER BY AVG(top5) DESC
    """)

    bilan = []
    for avis, nb, moy_top1, moy_top5, cote_moy in cur.fetchall():
        bilan.append({
            'avis':          avis,
            'nb':            nb,
            'taux_gagnant':  round((moy_top1 or 0) * 100, 1),
            'taux_top5':     round((moy_top5 or 0) * 100, 1),
            'cote_moy':      round(cote_moy or 0, 1),
        })
    return bilan


def analyse_driver_change(conn):
    """Return outcome statistics grouped by the driver_change flag.

    Rows with ordre_arrivee > 0 are grouped by driver_change. Each group
    yields a label ('Oui' for a truthy flag, 'Non' otherwise), its row
    count and its top1 / top5 rates as percentages with 1 decimal.
    """
    cur = conn.cursor()
    cur.execute("""
        SELECT driver_change,
               COUNT(*) as nb,
               AVG(top1) as taux_gagnant,
               AVG(top5) as taux_top5
        FROM historical_data
        WHERE ordre_arrivee > 0
        GROUP BY driver_change
    """)

    bilan = []
    for indicateur, nb, moy_top1, moy_top5 in cur.fetchall():
        # Any falsy flag (0, and also a NULL group key) is labelled 'Non'.
        if indicateur:
            libelle = 'Oui'
        else:
            libelle = 'Non'
        bilan.append({
            'driver_change': libelle,
            'nb':            nb,
            'taux_gagnant':  round((moy_top1 or 0) * 100, 1),
            'taux_top5':     round((moy_top5 or 0) * 100, 1),
        })
    return bilan


def analyse_top_chevaux(conn):
    """Return the best-performing horses found in the history.

    Rows with ordre_arrivee > 0 are grouped by horse_name; only horses
    with at least 3 rows are kept, ordered by decreasing mean top5 then by
    decreasing number of wins, and limited to 15 entries.

    Returns:
        A list of dicts with keys ``cheval``, ``nb_courses``,
        ``nb_gagnant``, ``nb_top5``, ``tx_top5`` (percentage of the
        horse's rows that are top5, 1 decimal) and ``cote_moy``
        (1 decimal).
    """
    c = conn.cursor()
    c.execute("""
        SELECT horse_name,
               COUNT(*) as nb_courses,
               SUM(top1) as nb_gagnant,
               SUM(top5) as nb_top5,
               AVG(cote_directe) as cote_moy
        FROM historical_data
        WHERE ordre_arrivee > 0
        GROUP BY horse_name
        HAVING COUNT(*) >= 3
        ORDER BY AVG(top5) DESC, SUM(top1) DESC
        LIMIT 15
    """)

    chevaux = []
    for cheval, nb_courses, nb_gagnant, nb_top5, cote_moy in c.fetchall():
        # SUM() yields NULL when every value of the group is NULL; count
        # that as zero instead of failing on None arithmetic, matching the
        # "or 0" guards used for the averages elsewhere in this file.
        nb_gagnant = nb_gagnant or 0
        nb_top5 = nb_top5 or 0
        chevaux.append({
            'cheval':       cheval,
            'nb_courses':   nb_courses,
            'nb_gagnant':   nb_gagnant,
            'nb_top5':      nb_top5,
            # nb_courses is at least 3 (HAVING clause), never zero.
            'tx_top5':      round(nb_top5 / nb_courses * 100, 1),
            'cote_moy':     round(cote_moy or 0, 1),
        })
    return chevaux


# ============================================================
# CALIBRATION DES POIDS
# ============================================================

def calibrer_poids(correlations):
    """Derive scoring weights from the REX correlations.

    Each mapped feature contributes its absolute top5 correlation to a
    scoring criterion; the per-criterion averages are then scaled so that
    they sum to 100%, which gives more weight to the criteria with the
    strongest absolute correlation.

    Returns a dict with the current reference weights ('poids_actuels'),
    the recomputed ones ('poids_calibres') and their difference ('delta').
    """
    # Feature -> scoring criterion it feeds (rang_cote is folded into the
    # odds criterion alongside cote_directe).
    feature_to_critere = {
        'cote_directe':    'cote',
        'forme_recente':   'forme',
        'tx_victoire':     'tx_victoire',
        'tx_place':        'tx_place',
        'reduction_km':    'reduction_km',
        'tendance_forme':  'tendance',
        'rang_cote':       'cote',
    }

    # One bucket of absolute correlations per criterion.
    collecte = {
        nom: []
        for nom in ('cote', 'forme', 'tx_victoire', 'tx_place',
                    'reduction_km', 'tendance')
    }
    # The trainer opinion has no numeric feature here: fixed value.
    collecte['avis'] = [0.05]

    for entree in correlations:
        cible = feature_to_critere.get(entree['feature'])
        if not cible or cible not in collecte:
            continue
        collecte[cible].append(entree['abs_top5'])

    # Average per criterion; a criterion without any value gets 0.01.
    scores = {}
    for nom, valeurs in collecte.items():
        if valeurs:
            scores[nom] = moyenne(valeurs)
        else:
            scores[nom] = 0.01

    # Scale so that the weights add up to 100%.
    total = sum(scores.values())
    poids_calibres = {}
    for nom, score in scores.items():
        poids_calibres[nom] = round(score / total * 100, 1)

    # Weights currently in use, kept as the comparison baseline.
    poids_actuels = {
        'cote':         20.0,
        'forme':        25.0,
        'tx_victoire':  15.0,
        'tx_place':     15.0,
        'reduction_km': 10.0,
        'tendance':     10.0,
        'avis':          5.0,
    }

    delta = {}
    for nom in poids_actuels:
        ecart = poids_calibres.get(nom, 0) - poids_actuels.get(nom, 0)
        delta[nom] = round(ecart, 1)

    return {
        'poids_actuels':  poids_actuels,
        'poids_calibres': poids_calibres,
        'delta': delta
    }


# ============================================================
# RAPPORT
# ============================================================

def print_rapport(volume, correlations, cote_analyse, forme_analyse,
                  avis_analyse, driver_analyse, top_chevaux, poids):
    """Print the complete REX report on standard output.

    Args:
        volume: dict of volume statistics (keys nb_jours, nb_courses,
            nb_lignes, debut, fin, taux_top5, disciplines).
        correlations: list of dicts with feature, corr_top1, corr_top5 and
            abs_top5; only the first 10 entries are shown.
        cote_analyse: per odds-bracket statistics.
        forme_analyse: per form-bracket statistics.
        avis_analyse: per trainer-opinion statistics.
        driver_analyse: per driver-change statistics.
        top_chevaux: best horses; only the first 10 are shown and the
            section is skipped when the list is empty.
        poids: dict with 'poids_actuels', 'poids_calibres' and 'delta'.

    Labels that are None (a NULL group key in the database) are printed as
    "N/A", and a None win/top5 count is printed as 0, instead of raising
    on the column alignment.
    """

    def _txt(valeur):
        # None does not accept alignment specs such as "<20"; any other
        # value is converted to text so that it pads like a string.
        return 'N/A' if valeur is None else str(valeur)

    print(f"\n{'='*65}")
    print(f"📊 ANALYSE REX — {datetime.now().strftime('%d/%m/%Y %H:%M')}")
    print(f"{'='*65}")

    # Volume
    print("\n📦 DONNÉES DISPONIBLES")
    print(f"   Jours    : {volume['nb_jours']}")
    print(f"   Courses  : {volume['nb_courses']}")
    print(f"   Partants : {volume['nb_lignes']}")
    print(f"   Période  : {volume['debut']} → {volume['fin']}")
    print(f"   Taux top5 moyen : {volume['taux_top5']}%")
    # NOTE(review): analyse_volume fills nb with COUNT(DISTINCT date), i.e.
    # days, while the label below says "courses" — confirm the intent.
    for disc, nb in volume['disciplines']:
        print(f"   {_txt(disc):<20} : {nb} courses")

    # Correlations
    print("\n🔗 CORRÉLATIONS FEATURES → TOP5 (triées par force)")
    print(f"   {'FEATURE':<22} {'CORR TOP5':>10} {'CORR TOP1':>10} {'FORCE'}")
    print(f"   {'─'*55}")
    for c in correlations[:10]:
        force = '●●●' if c['abs_top5'] > 0.15 else '●●' if c['abs_top5'] > 0.08 else '●'
        direction = '▼ (inverse)' if c['corr_top5'] < 0 else '▲ (direct) '
        print(f"   {c['feature']:<22} {c['corr_top5']:>10.4f} {c['corr_top1']:>10.4f}  {force} {direction}")

    # Odds brackets
    print("\n🎰 TAUX DE RÉUSSITE PAR TRANCHE DE COTE")
    print(f"   {'TRANCHE':<28} {'NB':>5} {'GAGNANT':>8} {'TOP5':>6} {'POS MOY':>8}")
    print(f"   {'─'*60}")
    for t in cote_analyse:
        print(f"   {t['tranche']:<28} {t['nb']:>5} {t['taux_gagnant']:>7.1f}% {t['taux_top5']:>5.1f}% {t['position_moy']:>8.1f}")

    # Recent form brackets
    print("\n🏃 TAUX DE RÉUSSITE PAR FORME RÉCENTE")
    print(f"   {'FORME':<28} {'NB':>5} {'GAGNANT':>8} {'TOP5':>6}")
    print(f"   {'─'*50}")
    for t in forme_analyse:
        print(f"   {t['tranche']:<28} {t['nb']:>5} {t['taux_gagnant']:>7.1f}% {t['taux_top5']:>5.1f}%")

    # Trainer opinion
    print("\n👨‍🏫 IMPACT AVIS ENTRAÎNEUR")
    print(f"   {'AVIS':<20} {'NB':>5} {'GAGNANT':>8} {'TOP5':>6} {'COTE MOY':>9}")
    print(f"   {'─'*52}")
    for a in avis_analyse:
        print(f"   {_txt(a['avis']):<20} {a['nb']:>5} {a['taux_gagnant']:>7.1f}% {a['taux_top5']:>5.1f}% {a['cote_moy']:>9.1f}")

    # Driver change
    print("\n🔄 IMPACT CHANGEMENT DE DRIVER")
    for d in driver_analyse:
        print(f"   Changement {d['driver_change']:<4} : {d['nb']} courses · "
              f"gagnant {d['taux_gagnant']}% · top5 {d['taux_top5']}%")

    # Top horses
    if top_chevaux:
        print("\n🏆 TOP CHEVAUX (≥3 courses)")
        print(f"   {'CHEVAL':<28} {'COURSES':>8} {'GAGNANT':>8} {'TOP5':>6} {'TX TOP5':>8} {'COTE MOY':>9}")
        print(f"   {'─'*70}")
        for h in top_chevaux[:10]:
            # A SUM() over only-NULL values comes back as None: show 0.
            nb_gagnant = h['nb_gagnant'] or 0
            nb_top5 = h['nb_top5'] or 0
            print(f"   {_txt(h['cheval']):<28} {h['nb_courses']:>8} {nb_gagnant:>8} "
                  f"{nb_top5:>6} {h['tx_top5']:>7.1f}% {h['cote_moy']:>9.1f}")

    # Weight calibration
    print("\n⚖️  CALIBRATION DES POIDS DU SCORING")
    print(f"   {'CRITÈRE':<16} {'ACTUEL':>8} {'CALIBRÉ':>8} {'DELTA':>8} RECOMMANDATION")
    print(f"   {'─'*60}")
    for critere in poids['poids_actuels']:
        actuel  = poids['poids_actuels'][critere]
        calibre = poids['poids_calibres'].get(critere, actuel)
        delta   = poids['delta'].get(critere, 0)
        # A change is only recommended beyond a 2-point difference.
        if delta > 2:
            reco = "↑ AUGMENTER"
        elif delta < -2:
            reco = "↓ RÉDUIRE"
        else:
            reco = "→ OK"
        print(f"   {critere:<16} {actuel:>7.1f}% {calibre:>7.1f}% {delta:>+7.1f}%  {reco}")

    # Closing advice depends on how many days of data are available.
    nb_jours = volume['nb_jours']
    if nb_jours < 30:
        print(f"\n⚠️  Données insuffisantes ({nb_jours} jours) — attendez 30+ jours avant d'appliquer la calibration")
    elif nb_jours < 100:
        print(f"\n✅ Données suffisantes pour calibration Phase 2 ({nb_jours} jours)")
        print("   Recommandation : appliquer les nouveaux poids dans scoring.py")
    else:
        print(f"\n🚀 Données suffisantes pour ML Phase 3 ({nb_jours} jours)")
        print("   Recommandation : entraîner un modèle XGBoost")

    print(f"\n{'='*65}\n")


# ============================================================
# MAIN
# ============================================================

def main():
    """Run every REX analysis, print the report and save it as JSON.

    The database at DB_PATH is opened, the analyses are computed, and the
    connection is closed even when one of them raises. When
    historical_data holds no row, a hint is printed and nothing else is
    done. The JSON report is written to
    ``<TURF_DIR>/rex_analyse_<YYYYMMDD>.json``, TURF_DIR coming from the
    environment with a hard-coded default.
    """
    print(f"\n{'='*65}")
    print(f"🧠 ANALYSE REX — {datetime.now().strftime('%d/%m/%Y %H:%M')}")
    print(f"{'='*65}\n")

    conn = sqlite3.connect(DB_PATH)
    # try/finally guarantees the connection is released on the early
    # return and on any exception raised by an analysis.
    try:
        # Check that there is something to analyse.
        c = conn.cursor()
        c.execute("SELECT COUNT(*) FROM historical_data")
        nb = c.fetchone()[0]

        if nb == 0:
            print("❌ Aucune donnée dans historical_data.")
            print("   Lancez d'abord : python3 historical_loader.py --days 365")
            return

        print(f"✅ {nb} lignes trouvées dans historical_data\n")

        # Analyses
        print("📊 Calcul des statistiques...")
        volume = analyse_volume(conn)
        correlations = analyse_correlations(conn)
        cote_analyse = analyse_cote(conn)
        forme_analyse = analyse_forme(conn)
        avis_analyse = analyse_avis_entraineur(conn)
        driver_analyse = analyse_driver_change(conn)
        top_chevaux = analyse_top_chevaux(conn)
        poids = calibrer_poids(correlations)
    finally:
        conn.close()

    # Print the report
    print_rapport(volume, correlations, cote_analyse, forme_analyse,
                  avis_analyse, driver_analyse, top_chevaux, poids)

    # Save the report as JSON
    rapport = {
        'date':           datetime.now().isoformat(),
        'volume':         volume,
        'correlations':   correlations,
        'cote_analyse':   cote_analyse,
        'forme_analyse':  forme_analyse,
        'avis_analyse':   avis_analyse,
        'driver_analyse': driver_analyse,
        'top_chevaux':    top_chevaux,
        'poids_calibres': poids,
    }

    turf_dir = os.environ.get('TURF_DIR', '/home/h3r7/turf_scraper')
    nom_fichier = f"rex_analyse_{datetime.now().strftime('%Y%m%d')}.json"
    path = os.path.join(turf_dir, nom_fichier)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(rapport, f, indent=2, ensure_ascii=False)
    print(f"📁 Rapport sauvegardé : {path}")

# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()