#!/usr/bin/env python3
"""
Optimisation du modèle de scoring turf
Méthodes pour affiner le modèle avec précision
"""
|
|
import sqlite3
|
|
import numpy as np
|
|
from datetime import datetime, timedelta
|
|
from collections import defaultdict
|
|
|
|
# Path of the SQLite database file opened (via sqlite3.connect) by the
# analysis functions in this file.
DB_PATH = "/home/h3r7/turf_scraper/turf.db"
|
|
|
|
def analyze_scoring_accuracy(db_path=None):
    """Print and return how well the v2 scoring ranks match actual results.

    Joins ``results`` with ``scoring`` (``scoring_version = 'v2'``) on date,
    race name and horse name, groups the joined rows per race and prints the
    top-1, top-3 and top-5 hit rates.

    A race counts as a hit at level N when at least one horse ranked in the
    scoring's top N actually finished in the top N.

    Args:
        db_path: Path of the SQLite database to read. When omitted, the
            module-level ``DB_PATH`` is used.

    Returns:
        A mapping ``(date, race_name) -> list of dict``; each dict has the
        keys ``horse``, ``position``, ``scored_horse``, ``score`` and
        ``rank``.
    """
    if db_path is None:
        db_path = DB_PATH

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute("""
            SELECT
                r.date,
                r.race_name,
                r.horse_name as actual_horse,
                r.position as actual_position,
                s.horse_name as scored_horse,
                s.score as scoring_score,
                s.rang_scoring as scoring_rank
            FROM results r
            JOIN scoring s ON r.date = s.date AND r.race_name = s.race_name AND r.horse_name = s.horse_name
            WHERE s.scoring_version = 'v2'
            ORDER BY r.date, r.race_name
        """).fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # Group the joined rows by race.
    races = defaultdict(list)
    for row in rows:
        races[(row['date'], row['race_name'])].append({
            'horse': row['actual_horse'],
            'position': row['actual_position'],
            'scored_horse': row['scored_horse'],
            'score': row['scoring_score'],
            'rank': row['scoring_rank'],
        })

    def _in_top(value, cutoff):
        # SQL NULL arrives as None and cannot be compared with an int;
        # places and ranks are 1-based, so anything below 1 is not a place.
        return value is not None and 1 <= value <= cutoff

    def _is_hit(horses, cutoff):
        # A set of actual finishers also covers dead-heats (several winners).
        actual = {h['horse'] for h in horses if _in_top(h['position'], cutoff)}
        return any(
            h['scored_horse'] in actual
            for h in horses
            if _in_top(h['rank'], cutoff)
        )

    total_races = len(races)
    top1_hits = sum(1 for horses in races.values() if _is_hit(horses, 1))
    top3_hits = sum(1 for horses in races.values() if _is_hit(horses, 3))
    top5_hits = sum(1 for horses in races.values() if _is_hit(horses, 5))

    def _rate(hits):
        # Avoid ZeroDivisionError when no race matched the query.
        return hits * 100 / total_races if total_races else 0.0

    print("="*60)
    print("ANALYSE PRÉCISION SCORING V2")
    print("="*60)
    print(f"Total courses analysées: {total_races}")
    print(f"Top 1 hit rate: {top1_hits}/{total_races} = {_rate(top1_hits):.1f}%")
    print(f"Top 3 hit rate: {top3_hits}/{total_races} = {_rate(top3_hits):.1f}%")
    print(f"Top 5 hit rate: {top5_hits}/{total_races} = {_rate(top5_hits):.1f}%")

    return races
|
|
|
|
def analyze_feature_importance(db_path=None):
    """Print the average value of each v2 score component.

    Reads the mean of ``score_cote``, ``score_forme``, ``score_victoire``,
    ``score_place`` and ``score_avis`` over all ``scoring`` rows whose
    ``scoring_version`` is ``'v2'``. Only averages are reported; no
    relationship with race outcomes is computed here.

    Args:
        db_path: Path of the SQLite database to read. When omitted, the
            module-level ``DB_PATH`` is used.

    Returns:
        None. The report is written to standard output.
    """
    if db_path is None:
        db_path = DB_PATH

    conn = sqlite3.connect(db_path)
    try:
        averages = conn.execute("""
            SELECT
                AVG(score_cote) as avg_cote,
                AVG(score_forme) as avg_forme,
                AVG(score_victoire) as avg_victoire,
                AVG(score_place) as avg_place,
                AVG(score_avis) as avg_avis
            FROM scoring
            WHERE scoring_version = 'v2'
        """).fetchone()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    def _fmt(value):
        # AVG() yields NULL (None) when there is no non-NULL input, and
        # None cannot be rendered with a float format spec.
        return "n/a" if value is None else f"{value:.2f}"

    print("\n" + "="*60)
    print("IMPORTANCE DES COMPOSANTES DU SCORE")
    print("="*60)
    print(f"Score Côte (odds): {_fmt(averages[0])}")
    print(f"Score Forme: {_fmt(averages[1])}")
    print(f"Score Victoire: {_fmt(averages[2])}")
    print(f"Score Place: {_fmt(averages[3])}")
    print(f"Score Avis: {_fmt(averages[4])}")
|
|
|
|
def identify_patterns(db_path=None):
    """Print three aggregate reports relating scores, odds and form to results.

    1. Average v2 score per finishing position.
    2. Top-3 rate per odds bucket, for ``predictions`` rows whose source is
       ``'canalturf_partants'``. Rows with NULL odds are excluded; otherwise
       they fail every ``odds < x`` test and land in the outsider bucket.
    3. Top-3 rate per ``forme_recente`` value (v2 scoring), limited to values
       seen more than 5 times and to the 10 best rates.

    Args:
        db_path: Path of the SQLite database to read. When omitted, the
            module-level ``DB_PATH`` is used.

    Returns:
        None. The reports are written to standard output.
    """
    if db_path is None:
        db_path = DB_PATH

    conn = sqlite3.connect(db_path)
    try:
        print("\n" + "="*60)
        print("PATTERNS IDENTIFIÉS")
        print("="*60)

        # Report 1: average score by finishing position.
        score_by_position = conn.execute("""
            SELECT
                AVG(s.score) as avg_score,
                r.position
            FROM scoring s
            JOIN results r ON s.date = r.date AND s.race_name = r.race_name AND s.horse_name = r.horse_name
            WHERE s.scoring_version = 'v2'
            GROUP BY r.position
            ORDER BY r.position
        """).fetchall()

        print("\n1. Score moyen par position:")
        for avg_score, position in score_by_position:
            # AVG() is NULL when every score in the group is NULL.
            shown = "n/a" if avg_score is None else f"{avg_score:.1f}"
            print(f" Position {position}: Score avg = {shown}")

        # Report 2: do short-priced horses finish in the top 3 more often?
        top3_by_odds = conn.execute("""
            SELECT
                CASE
                    WHEN p.odds < 5 THEN 'Favori (<5)'
                    WHEN p.odds < 10 THEN 'Coef 5-10'
                    WHEN p.odds < 20 THEN 'Coef 10-20'
                    ELSE 'Outsider (>20)'
                END as category,
                COUNT(*) as total,
                SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) as top3,
                ROUND(CAST(SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) * 100, 1) as pct
            FROM predictions p
            JOIN results r ON p.date = r.date AND p.race_name = r.race_name AND p.horse_name = r.horse_name
            WHERE p.source = 'canalturf_partants' AND p.odds IS NOT NULL
            GROUP BY category
            ORDER BY pct DESC
        """).fetchall()

        print("\n2. Taux de Top3 par catégorie de cote:")
        for category, total, top3, pct in top3_by_odds:
            print(f" {category}: {pct}% ({top3}/{total})")

        # Report 3: top-3 rate per recent-form value.
        top3_by_form = conn.execute("""
            SELECT
                s.forme_recente,
                COUNT(*) as total,
                SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) as top3,
                ROUND(CAST(SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) * 100, 1) as pct
            FROM scoring s
            JOIN results r ON s.date = r.date AND s.race_name = r.race_name AND s.horse_name = r.horse_name
            WHERE s.scoring_version = 'v2' AND s.forme_recente IS NOT NULL
            GROUP BY s.forme_recente
            HAVING total > 5
            ORDER BY pct DESC
            LIMIT 10
        """).fetchall()

        print("\n3. Forme récente (musique) vs réussite:")
        for form, total, top3, pct in top3_by_form:
            print(f" {form}: {pct}% ({top3}/{total})")
    finally:
        # Release the connection even when one of the queries fails.
        conn.close()
|
|
|
|
def suggest_improvements():
    """Print a fixed list of suggested improvements for the scoring model.

    The text is static: it does not depend on the database or on the output
    of the other analysis functions.

    Returns:
        None. The suggestions are written to standard output.
    """
    print("\n" + "="*60)
    # Heading spelling corrected ("AMÉLORATIONS" -> "AMÉLIORATIONS").
    print("AMÉLIORATIONS SUGGÉRÉES")
    print("="*60)

    # The empty first and last items reproduce the newline that opens and
    # closes the printed block.
    suggestions = "\n".join((
        "",
        "1. FEATURE ENGINEERING",
        "- Ajouter: forme sur 5 dernières courses (poids + élevé)",
        "- Ajouter: performance sur même distance/hippodrome",
        "- Ajouter: historique jockey/cheval ensemble",
        "",
        "2. POIDS DES COMPOSANTES",
        "- Réduire le poids du score Côte (peu prédictif)",
        "- Augmenter le poids de la forme récente",
        "- Ajouter un component \"tendance\" (évolution cotes)",
        "",
        "3. MACHINE LEARNING",
        "- Passer de scoring linéaire à XGBoost/LogisticRegression",
        "- Utiliser cross-validation pour éviter overfitting",
        "- Ajouter regularization",
        "",
        "4. ENSEMBLE",
        "- Combiner scoring V2 + CanalTurf + un modèle ML",
        "- Pondérer selon historique de précision",
        "",
        "5. BACKTESTING",
        "- Tester sur 30+ jours de données",
        "- Calculer ROI théorique",
        "- Ajuster seuils de sélection",
        "",
    ))
    print(suggestions)
|
|
|
|
def run_full_analysis():
    """Run every analysis step of this file, one after the other."""
    steps = (
        analyze_scoring_accuracy,
        analyze_feature_importance,
        identify_patterns,
        suggest_improvements,
    )
    for step in steps:
        step()
|
|
|
|
# Run the complete analysis only when this file is executed as a script.
if __name__ == "__main__":
    run_full_analysis()