Files
turf_saas/model_optimizer.py
2026-04-25 17:18:43 +02:00

221 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""
Optimisation of the turf (horse-racing) scoring model.

Analysis helpers for fine-tuning the model accurately.
"""
import sqlite3
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict
# SQLite database file opened by the analysis functions in this module.
# NOTE(review): hard-coded absolute path; the script only works where this file exists.
DB_PATH = "/home/h3r7/turf_scraper/turf.db"
def _names_within(horses, name_key, place_key, limit):
    """Return the set of names whose ``place_key`` value lies in ``1..limit``.

    A missing (``None``) value is treated as "not placed" instead of being
    compared with ``limit`` (which raises ``TypeError`` in Python 3). Values
    below 1 are ignored too: winners are identified by the value 1, so
    anything smaller cannot be a finishing place or a rank.
    """
    return {
        horse[name_key]
        for horse in horses
        if horse[place_key] is not None and 1 <= horse[place_key] <= limit
    }


def _count_hits(races, depths=(1, 3, 5)):
    """Count, per depth N, the races where a predicted top-N horse finished top-N."""
    hits = dict.fromkeys(depths, 0)
    for horses in races.values():
        for depth in depths:
            finished = _names_within(horses, 'horse', 'position', depth)
            predicted = _names_within(horses, 'scored_horse', 'rank', depth)
            # Set intersection also credits a dead-heat co-winner, not only
            # the first winner returned by the query.
            if finished & predicted:
                hits[depth] += 1
    return hits


def analyze_scoring_accuracy(db_path=None):
    """Analyse the current accuracy of the v2 scoring and print hit rates.

    Joins ``results`` with ``scoring`` (version ``'v2'``) on date, race name
    and horse name, groups the rows per race and prints, for N in 1, 3, 5,
    the share of races where at least one horse ranked in the scoring top N
    actually finished in the top N.

    Args:
        db_path: Optional path of the SQLite database to read. Defaults to
            the module-level ``DB_PATH``.

    Returns:
        A ``defaultdict(list)`` mapping ``(date, race_name)`` to a list of
        dicts with the keys ``horse``, ``position``, ``scored_horse``,
        ``score`` and ``rank``.

    Raises:
        sqlite3.Error: If the database cannot be queried.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        conn.row_factory = sqlite3.Row
        # Get all results with scoring
        rows = conn.execute("""
            SELECT
                r.date,
                r.race_name,
                r.horse_name as actual_horse,
                r.position as actual_position,
                s.horse_name as scored_horse,
                s.score as scoring_score,
                s.rang_scoring as scoring_rank
            FROM results r
            JOIN scoring s ON r.date = s.date AND r.race_name = s.race_name AND r.horse_name = s.horse_name
            WHERE s.scoring_version = 'v2'
            ORDER BY r.date, r.race_name
        """).fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # Group the joined rows per race.
    races = defaultdict(list)
    for row in rows:
        races[(row['date'], row['race_name'])].append({
            'horse': row['actual_horse'],
            'position': row['actual_position'],
            'scored_horse': row['scored_horse'],
            'score': row['scoring_score'],
            'rank': row['scoring_rank'],
        })

    total_races = len(races)
    hits = _count_hits(races)

    print("="*60)
    print("ANALYSE PRÉCISION SCORING V2")
    print("="*60)
    print(f"Total courses analysées: {total_races}")
    for depth in (1, 3, 5):
        # Report 0.0% instead of dividing by zero when no race matched.
        rate = hits[depth] * 100 / total_races if total_races else 0.0
        print(f"Top {depth} hit rate: {hits[depth]}/{total_races} = {rate:.1f}%")
    return races
def _format_component_average(value):
    """Format an ``AVG()`` result with two decimals, or ``n/a`` when it is NULL."""
    return "n/a" if value is None else f"{value:.2f}"


def analyze_feature_importance(db_path=None):
    """Print the average value of each v2 scoring component.

    Only the mean of each component column (``score_cote``, ``score_forme``,
    ``score_victoire``, ``score_place``, ``score_avis``) is reported; no
    importance estimate beyond these averages is computed.

    Args:
        db_path: Optional path of the SQLite database to read. Defaults to
            the module-level ``DB_PATH``.

    Raises:
        sqlite3.Error: If the database cannot be queried.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        # Get scoring components. An aggregate query without GROUP BY always
        # yields exactly one row, whose values are NULL when no row matches.
        averages = conn.execute("""
            SELECT
                AVG(score_cote) as avg_cote,
                AVG(score_forme) as avg_forme,
                AVG(score_victoire) as avg_victoire,
                AVG(score_place) as avg_place,
                AVG(score_avis) as avg_avis
            FROM scoring
            WHERE scoring_version = 'v2'
        """).fetchone()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    print("\n" + "="*60)
    print("IMPORTANCE DES COMPOSANTES DU SCORE")
    print("="*60)
    labels = (
        "Score Côte (odds)",
        "Score Forme",
        "Score Victoire",
        "Score Place",
        "Score Avis",
    )
    for label, value in zip(labels, averages):
        # A NULL average (no v2 rows yet) is shown as n/a instead of crashing
        # the float format specifier.
        print(f"{label}: {_format_component_average(value)}")
def _format_avg_score(value):
    """Format an average score with one decimal, or ``n/a`` when it is NULL."""
    return "n/a" if value is None else f"{value:.1f}"


def _print_top3_rates(rows):
    """Print one ``label: pct% (top3/total)`` line per (label, total, top3, pct) row."""
    for label, total, top3, pct in rows:
        print(f" {label}: {pct}% ({top3}/{total})")


def identify_patterns(db_path=None):
    """Print aggregate patterns relating scores, odds and recent form to results.

    Three reports are printed:
      1. the average v2 score per finishing position;
      2. the top-3 rate per odds bracket, for predictions whose source is
         ``'canalturf_partants'``;
      3. the ten ``forme_recente`` values with the best top-3 rate, among
         those seen more than five times.

    Args:
        db_path: Optional path of the SQLite database to read. Defaults to
            the module-level ``DB_PATH``.

    Raises:
        sqlite3.Error: If the database cannot be queried.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        conn.row_factory = sqlite3.Row
        c = conn.cursor()
        print("\n" + "="*60)
        print("PATTERNS IDENTIFIÉS")
        print("="*60)

        # Position vs score correlation.
        c.execute("""
            SELECT
                AVG(s.score) as avg_score,
                r.position
            FROM scoring s
            JOIN results r ON s.date = r.date AND s.race_name = r.race_name AND s.horse_name = r.horse_name
            WHERE s.scoring_version = 'v2'
            GROUP BY r.position
            ORDER BY r.position
        """)
        print("\n1. Score moyen par position:")
        for avg_score, position in c.fetchall():
            # AVG() is NULL when every score of the group is NULL.
            print(f" Position {position}: Score avg = {_format_avg_score(avg_score)}")

        # Odds vs position: do low odds finish in the top 3 more often?
        # Rows without odds are excluded: a NULL fails every WHEN comparison
        # and would otherwise be counted in the 'Outsider (>20)' bucket.
        c.execute("""
            SELECT
                CASE
                    WHEN p.odds < 5 THEN 'Favori (<5)'
                    WHEN p.odds < 10 THEN 'Coef 5-10'
                    WHEN p.odds < 20 THEN 'Coef 10-20'
                    ELSE 'Outsider (>20)'
                END as category,
                COUNT(*) as total,
                SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) as top3,
                ROUND(CAST(SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) * 100, 1) as pct
            FROM predictions p
            JOIN results r ON p.date = r.date AND p.race_name = r.race_name AND p.horse_name = r.horse_name
            WHERE p.source = 'canalturf_partants' AND p.odds IS NOT NULL
            GROUP BY category
            ORDER BY pct DESC
        """)
        print("\n2. Taux de Top3 par catégorie de cote:")
        _print_top3_rates(c.fetchall())

        # Recent form ("musique"): do recent good results matter?
        c.execute("""
            SELECT
                s.forme_recente,
                COUNT(*) as total,
                SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) as top3,
                ROUND(CAST(SUM(CASE WHEN r.position <= 3 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) * 100, 1) as pct
            FROM scoring s
            JOIN results r ON s.date = r.date AND s.race_name = r.race_name AND s.horse_name = r.horse_name
            WHERE s.scoring_version = 'v2' AND s.forme_recente IS NOT NULL
            GROUP BY s.forme_recente
            HAVING total > 5
            ORDER BY pct DESC
            LIMIT 10
        """)
        print("\n3. Forme récente (musique) vs réussite:")
        _print_top3_rates(c.fetchall())
    finally:
        # Release the connection even when one of the queries fails.
        conn.close()
def suggest_improvements():
    """Print a static list of suggested improvements for the scoring model.

    The text is fixed: nothing is computed from the database. The banner
    spelling is corrected to "AMÉLIORATIONS".
    """
    print("\n" + "="*60)
    print("AMÉLIORATIONS SUGGÉRÉES")
    print("="*60)
    # The literal is kept flush-left so the printed text carries no extra
    # leading whitespace.
    suggestions = """
1. FEATURE ENGINEERING
- Ajouter: forme sur 5 dernières courses (poids + élevé)
- Ajouter: performance sur même distance/hippodrome
- Ajouter: historique jockey/cheval ensemble
2. POIDS DES COMPOSANTES
- Réduire le poids du score Côte (peu prédictif)
- Augmenter le poids de la forme récente
- Ajouter un component "tendance" (évolution cotes)
3. MACHINE LEARNING
- Passer de scoring linéaire à XGBoost/LogisticRegression
- Utiliser cross-validation pour éviter overfitting
- Ajouter regularization
4. ENSEMBLE
- Combiner scoring V2 + CanalTurf + un modèle ML
- Pondérer selon historique de précision
5. BACKTESTING
- Tester sur 30+ jours de données
- Calculer ROI théorique
- Ajuster seuils de sélection
"""
    print(suggestions)
def run_full_analysis():
    """Run every analysis step of this module, one after the other."""
    pipeline = (
        analyze_scoring_accuracy,
        analyze_feature_importance,
        identify_patterns,
        suggest_improvements,
    )
    # Steps run in the listed order; their return values are ignored.
    for step in pipeline:
        step()
# Script entry point: run the complete analysis when the file is executed directly.
if __name__ == "__main__":
    run_full_analysis()