#!/usr/bin/env python3
"""
Scoring-model optimisation helpers for the turf scraper.

Each function reads the SQLite database, compares the stored v2 scoring
against the recorded race results and prints a plain-text report.
"""

import sqlite3
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict
from contextlib import closing
from typing import Optional

DB_PATH = "/home/h3r7/turf_scraper/turf.db"


def _resolve_db_path(db_path: Optional[str]) -> str:
    """Return *db_path*, falling back to the module-level ``DB_PATH``."""
    # Resolved at call time so that reassigning DB_PATH still takes effect.
    return DB_PATH if db_path is None else db_path


def _within(value, limit) -> bool:
    """Return True when *value* is not NULL and is at most *limit*."""
    # SQL NULL arrives as None; comparing None with an int raises TypeError.
    return value is not None and value <= limit


def _rate_text(hits: int, total: int) -> str:
    """Format ``hits/total = pct%``, printing ``n/a`` rather than dividing by 0."""
    if not total:
        return f"{hits}/{total} = n/a"
    return f"{hits}/{total} = {hits * 100 / total:.1f}%"


def _fmt(value, spec: str) -> str:
    """Format *value* with *spec*; SQL NULL (None) is rendered as ``n/a``."""
    return "n/a" if value is None else format(value, spec)


def _count_hits(races) -> tuple:
    """Return ``(top1, top3, top5)`` hit counts over the grouped races.

    A race counts as a hit at level N when at least one horse ranked within
    the first N by the scoring also finished within the first N. Horses with
    a NULL position or rank are ignored for that comparison.
    """
    top1 = top3 = top5 = 0
    for horses in races.values():
        # Sets make the overlap test a single intersection and also handle
        # dead heats (several horses recorded at position 1).
        winners = {h['horse'] for h in horses if h['position'] == 1}
        picks = {h['scored_horse'] for h in horses if h['rank'] == 1}
        if winners & picks:
            top1 += 1

        placed3 = {h['horse'] for h in horses if _within(h['position'], 3)}
        picked3 = {h['scored_horse'] for h in horses if _within(h['rank'], 3)}
        if placed3 & picked3:
            top3 += 1

        placed5 = {h['horse'] for h in horses if _within(h['position'], 5)}
        picked5 = {h['scored_horse'] for h in horses if _within(h['rank'], 5)}
        if placed5 & picked5:
            top5 += 1
    return top1, top3, top5


def analyze_scoring_accuracy(db_path: Optional[str] = None):
    """Print the top-1 / top-3 / top-5 hit rates of the v2 scoring.

    Args:
        db_path: SQLite database to read; defaults to ``DB_PATH``.

    Returns:
        A mapping ``(date, race_name) -> list of dicts`` with the keys
        ``horse``, ``position``, ``scored_horse``, ``score`` and ``rank``,
        one dict per horse present in both ``results`` and ``scoring``.
    """
    # sqlite3's own context manager only commits/rolls back; closing()
    # guarantees the connection is released even when the query raises.
    with closing(sqlite3.connect(_resolve_db_path(db_path))) as conn:
        conn.row_factory = sqlite3.Row
        c = conn.cursor()

        # Every result row that also has a v2 scoring row.
        c.execute("""
            SELECT
                r.date,
                r.race_name,
                r.horse_name as actual_horse,
                r.position as actual_position,
                s.horse_name as scored_horse,
                s.score as scoring_score,
                s.rang_scoring as scoring_rank
            FROM results r
            JOIN scoring s ON r.date = s.date
                AND r.race_name = s.race_name
                AND r.horse_name = s.horse_name
            WHERE s.scoring_version = 'v2'
            ORDER BY r.date, r.race_name
        """)
        rows = c.fetchall()

    # Group the horses by race.
    races = defaultdict(list)
    for row in rows:
        races[(row['date'], row['race_name'])].append({
            'horse': row['actual_horse'],
            'position': row['actual_position'],
            'scored_horse': row['scored_horse'],
            'score': row['scoring_score'],
            'rank': row['scoring_rank'],
        })

    total_races = len(races)
    top1_hits, top3_hits, top5_hits = _count_hits(races)

    print("=" * 60)
    print("ANALYSE PRÉCISION SCORING V2")
    print("=" * 60)
    print(f"Total courses analysées: {total_races}")
    print(f"Top 1 hit rate: {_rate_text(top1_hits, total_races)}")
    print(f"Top 3 hit rate: {_rate_text(top3_hits, total_races)}")
    print(f"Top 5 hit rate: {_rate_text(top5_hits, total_races)}")

    return races


def analyze_feature_importance(db_path: Optional[str] = None):
    """Print the average value of each v2 score component.

    Args:
        db_path: SQLite database to read; defaults to ``DB_PATH``.
    """
    with closing(sqlite3.connect(_resolve_db_path(db_path))) as conn:
        c = conn.cursor()

        # Average of every scoring component over the v2 rows.
        c.execute("""
            SELECT
                AVG(score_cote) as avg_cote,
                AVG(score_forme) as avg_forme,
                AVG(score_victoire) as avg_victoire,
                AVG(score_place) as avg_place,
                AVG(score_avis) as avg_avis
            FROM scoring
            WHERE scoring_version = 'v2'
        """)
        row = c.fetchone()

    print("\n" + "=" * 60)
    print("IMPORTANCE DES COMPOSANTES DU SCORE")
    print("=" * 60)
    # AVG() yields NULL when no row matches, hence the None-safe formatter.
    print(f"Score Côte (odds): {_fmt(row[0], '.2f')}")
    print(f"Score Forme: {_fmt(row[1], '.2f')}")
    print(f"Score Victoire: {_fmt(row[2], '.2f')}")
    print(f"Score Place: {_fmt(row[3], '.2f')}")
    print(f"Score Avis: {_fmt(row[4], '.2f')}")


def identify_patterns(db_path: Optional[str] = None):
    """Print three breakdowns relating scoring inputs to finishing position.

    1. Average v2 score per finishing position.
    2. Top-3 rate per odds bracket (``canalturf_partants`` predictions).
    3. Top-3 rate per ``forme_recente`` value (more than 5 samples, best 10).

    Args:
        db_path: SQLite database to read; defaults to ``DB_PATH``.
    """
    with closing(sqlite3.connect(_resolve_db_path(db_path))) as conn:
        conn.row_factory = sqlite3.Row
        c = conn.cursor()

        print("\n" + "=" * 60)
        print("PATTERNS IDENTIFIÉS")
        print("=" * 60)

        # Position vs score correlation.
        c.execute("""
            SELECT
                AVG(s.score) as avg_score,
                r.position
            FROM scoring s
            JOIN results r ON s.date = r.date
                AND s.race_name = r.race_name
                AND s.horse_name = r.horse_name
            WHERE s.scoring_version = 'v2'
            GROUP BY r.position
            ORDER BY r.position
        """)

        print("\n1. Score moyen par position:")
        for row in c.fetchall():
            print(f" Position {row[1]}: Score avg = {_fmt(row[0], '.1f')}")

        # Odds vs position: do low odds place more often?
        # Rows with NULL odds are excluded; otherwise they would fall into
        # the ELSE branch and be miscounted as outsiders.
        c.execute("""
            SELECT
                CASE
                    WHEN p.odds < 5 THEN 'Favori (<5)'
                    WHEN p.odds < 10 THEN 'Coef 5-10'
                    WHEN p.odds < 20 THEN 'Coef 10-20'
                    ELSE 'Outsider (>20)'
                END as category,
                COUNT(*) as total,
                SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) as top3,
                ROUND(CAST(SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) * 100, 1) as pct
            FROM predictions p
            JOIN results r ON p.date = r.date
                AND p.race_name = r.race_name
                AND p.horse_name = r.horse_name
            WHERE p.source = 'canalturf_partants'
                AND p.odds IS NOT NULL
            GROUP BY category
            ORDER BY pct DESC
        """)

        print("\n2. Taux de Top3 par catégorie de cote:")
        for row in c.fetchall():
            print(f" {row[0]}: {row[3]}% ({row[2]}/{row[1]})")

        # Recent form ("musique"): do recent good finishes matter?
        c.execute("""
            SELECT
                s.forme_recente,
                COUNT(*) as total,
                SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) as top3,
                ROUND(CAST(SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) * 100, 1) as pct
            FROM scoring s
            JOIN results r ON s.date = r.date
                AND s.race_name = r.race_name
                AND s.horse_name = r.horse_name
            WHERE s.scoring_version = 'v2' AND s.forme_recente IS NOT NULL
            GROUP BY s.forme_recente
            HAVING total > 5
            ORDER BY pct DESC
            LIMIT 10
        """)

        print("\n3. Forme récente (musique) vs réussite:")
        for row in c.fetchall():
            print(f" {row[0]}: {row[3]}% ({row[2]}/{row[1]})")


def suggest_improvements():
    """Print a static list of suggested improvements to the scoring model."""
    print("\n" + "=" * 60)
    print("AMÉLORATIONS SUGGÉRÉES")
    print("=" * 60)

    suggestions = """
1. FEATURE ENGINEERING
   - Ajouter: forme sur 5 dernières courses (poids + élevé)
   - Ajouter: performance sur même distance/hippodrome
   - Ajouter: historique jockey/cheval ensemble

2. POIDS DES COMPOSANTES
   - Réduire le poids du score Côte (peu prédictif)
   - Augmenter le poids de la forme récente
   - Ajouter un component "tendance" (évolution cotes)

3. MACHINE LEARNING
   - Passer de scoring linéaire à XGBoost/LogisticRegression
   - Utiliser cross-validation pour éviter overfitting
   - Ajouter regularization

4. ENSEMBLE
   - Combiner scoring V2 + CanalTurf + un modèle ML
   - Pondérer selon historique de précision

5. BACKTESTING
   - Tester sur 30+ jours de données
   - Calculer ROI théorique
   - Ajuster seuils de sélection
"""
    print(suggestions)


def run_full_analysis(db_path: Optional[str] = None):
    """Run every report in sequence.

    Args:
        db_path: SQLite database to read; defaults to ``DB_PATH``.
    """
    analyze_scoring_accuracy(db_path)
    analyze_feature_importance(db_path)
    identify_patterns(db_path)
    suggest_improvements()


if __name__ == "__main__":
    run_full_analysis()