Files
turf_saas/historical_loader.py
2026-04-25 17:18:43 +02:00

502 lines
18 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Historical Loader - Récupère 1 an d'historique Quinté+ via API PMU
Sauvegarde toutes les features + résultats en BDD pour entraîner le modèle ML
"""
import requests
import sqlite3
import json
import time
import os
import re
from datetime import datetime, timedelta
DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db")
HEADERS = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
BASE_URL = "https://turfinfo.api.pmu.fr/rest/client/1/programme"
# ============================================================
# BASE DE DONNÉES
# ============================================================
def init_historical_table():
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()
c.execute('''
CREATE TABLE IF NOT EXISTS historical_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
-- Identifiants course
date TEXT NOT NULL,
race_name TEXT,
hippodrome TEXT,
distance INTEGER,
discipline TEXT,
allocation REAL,
nb_partants INTEGER,
heure TEXT,
-- Identifiants cheval
horse_name TEXT NOT NULL,
horse_number INTEGER,
driver TEXT,
driver_change INTEGER DEFAULT 0,
-- Features cheval
age INTEGER,
sexe TEXT,
musique TEXT,
nb_courses INTEGER,
nb_victoires INTEGER,
nb_places INTEGER,
nb_places_2 INTEGER,
nb_places_3 INTEGER,
gains_carriere REAL,
gains_annee REAL,
gains_victoires REAL,
reduction_km INTEGER,
avis_entraineur TEXT,
oeilleres TEXT,
deferre TEXT,
-- Features marché
cote_directe REAL,
cote_reference REAL,
indicateur_tendance REAL,
est_favori INTEGER DEFAULT 0,
-- Features dérivées (calculées)
tx_victoire REAL,
tx_place REAL,
forme_recente REAL,
tendance_forme REAL,
nb_disq INTEGER,
rang_cote INTEGER,
ratio_cote_field REAL,
-- Résultat officiel (variable cible)
ordre_arrivee INTEGER,
temps_obtenu INTEGER,
top1 INTEGER DEFAULT 0,
top3 INTEGER DEFAULT 0,
top5 INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(date, horse_name, horse_number)
)
''')
# Index pour accélérer les requêtes ML
c.execute('CREATE INDEX IF NOT EXISTS idx_hist_date ON historical_data(date)')
c.execute('CREATE INDEX IF NOT EXISTS idx_hist_top5 ON historical_data(top5)')
c.execute('CREATE INDEX IF NOT EXISTS idx_hist_horse ON historical_data(horse_name)')
conn.commit()
conn.close()
print(f"✅ Table historical_data initialisée : {DB_PATH}")
# ============================================================
# DÉCODEUR MUSIQUE
# ============================================================
def parse_musique(musique):
if not musique:
return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0}
clean = re.sub(r'\(\d+\)', '', musique)
resultats = re.findall(r'(\d+|D|0)([amphsc]?)', clean)
positions = []
for pos, disc in resultats[:10]:
if pos == 'D':
positions.append(99)
else:
positions.append(int(pos))
if not positions:
return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0}
nb_disq = positions.count(99)
positions_clean = [p for p in positions if p != 99]
recentes = positions_clean[:3]
forme_recente = sum(recentes) / len(recentes) if recentes else 99
if len(positions_clean) >= 4:
debut = sum(positions_clean[-4:]) / 4
fin = sum(positions_clean[:4]) / 4
tendance = round(debut - fin, 2)
else:
tendance = 0
return {
'forme_recente': round(forme_recente, 2),
'tendance': tendance,
'nb_disq': nb_disq
}
# ============================================================
# API PMU
# ============================================================
def get_programme(date_pmu):
url = f"{BASE_URL}/{date_pmu}/reunions"
r = requests.get(url, headers=HEADERS, timeout=15)
if r.status_code != 200:
return None
return r.json().get('programme', {}).get('reunions', [])
def find_quinte(reunions):
for reunion in reunions:
for course in reunion.get('courses', []):
paris = [p['typePari'] for p in course.get('paris', [])]
libelle = course.get('libelle', '')
if any('QUINTE' in p for p in paris) or 'PARIS-TURF' in libelle:
heure_ts = course.get('heureDepart', 0)
heure = datetime.fromtimestamp(heure_ts/1000).strftime('%H:%M') if heure_ts else '13:55'
return {
'num_reunion': reunion['numOfficiel'],
'num_course': course['numOrdre'],
'libelle': libelle,
'hippodrome': reunion['hippodrome']['libelleCourt'],
'distance': course.get('distance', 0),
'discipline': course.get('discipline', ''),
'allocation': course.get('montantPrix', 0),
'nb_partants': course.get('nombreDeclaresPartants', 0),
'heure': heure,
'arrivee_def': course.get('arriveeDefinitive', False),
}
return None
def get_participants(date_pmu, num_r, num_c):
url = f"{BASE_URL}/{date_pmu}/R{num_r}/C{num_c}/participants"
r = requests.get(url, headers=HEADERS, timeout=15)
if r.status_code != 200:
return []
return r.json().get('participants', [])
# ============================================================
# EXTRACTION FEATURES
# ============================================================
def extract_features(p, course_info, all_participants):
"""Extrait toutes les features d'un partant + résultat"""
musique_stats = parse_musique(p.get('musique', ''))
# Cote directe
rapport_direct = p.get('dernierRapportDirect', {}) or {}
cote_directe = rapport_direct.get('rapport', 0) or 0
est_favori = 1 if rapport_direct.get('favoris', False) else 0
indicateur = rapport_direct.get('nombreIndicateurTendance', 0) or 0
# Cote référence
rapport_ref = p.get('dernierRapportReference', {}) or {}
cote_reference = rapport_ref.get('rapport', 0) or 0
# Stats carrière
nb_courses = p.get('nombreCourses', 0) or 0
nb_victoires = p.get('nombreVictoires', 0) or 0
nb_places = p.get('nombrePlaces', 0) or 0
nb_p2 = p.get('nombrePlacesSecond', 0) or 0
nb_p3 = p.get('nombrePlacesTroisieme', 0) or 0
tx_vic = round(nb_victoires / nb_courses * 100, 2) if nb_courses else 0
tx_place = round(nb_places / nb_courses * 100, 2) if nb_courses else 0
# Gains
gains = p.get('gainsParticipant', {}) or {}
gains_carriere = gains.get('gainsCarriere', 0) or 0
gains_annee = gains.get('gainsAnneeEnCours', 0) or 0
gains_victoires= gains.get('gainsVictoires', 0) or 0
# Rang cote dans le field
all_cotes = sorted([
(x.get('dernierRapportDirect', {}) or {}).get('rapport', 999) or 999
for x in all_participants
])
rang_cote = all_cotes.index(cote_directe) + 1 if cote_directe in all_cotes else 99
# Ratio cote / moyenne field
cotes_valides = [c for c in all_cotes if c < 900]
moy_cote = sum(cotes_valides) / len(cotes_valides) if cotes_valides else 1
ratio_cote = round(cote_directe / moy_cote, 3) if moy_cote else 0
# Résultat
ordre = p.get('ordreArrivee', 0) or 0
top1 = 1 if ordre == 1 else 0
top3 = 1 if 1 <= ordre <= 3 else 0
top5 = 1 if 1 <= ordre <= 5 else 0
return {
# Course
'date': None, # rempli par l'appelant
'race_name': course_info['libelle'],
'hippodrome': course_info['hippodrome'],
'distance': course_info['distance'],
'discipline': course_info['discipline'],
'allocation': course_info['allocation'],
'nb_partants': course_info['nb_partants'],
'heure': course_info['heure'],
# Cheval
'horse_name': p.get('nom', ''),
'horse_number': p.get('numPmu', 0),
'driver': p.get('driver', ''),
'driver_change': 1 if p.get('driverChange', False) else 0,
# Features
'age': p.get('age', 0),
'sexe': p.get('sexe', ''),
'musique': p.get('musique', ''),
'nb_courses': nb_courses,
'nb_victoires': nb_victoires,
'nb_places': nb_places,
'nb_places_2': nb_p2,
'nb_places_3': nb_p3,
'gains_carriere': gains_carriere,
'gains_annee': gains_annee,
'gains_victoires': gains_victoires,
'reduction_km': p.get('reductionKilometrique', 0),
'avis_entraineur': p.get('avisEntraineur', 'NEUTRE'),
'oeilleres': p.get('oeilleres', ''),
'deferre': p.get('deferre', ''),
# Marché
'cote_directe': cote_directe,
'cote_reference': cote_reference,
'indicateur_tendance': indicateur,
'est_favori': est_favori,
# Dérivées
'tx_victoire': tx_vic,
'tx_place': tx_place,
'forme_recente': musique_stats['forme_recente'],
'tendance_forme': musique_stats['tendance'],
'nb_disq': musique_stats['nb_disq'],
'rang_cote': rang_cote,
'ratio_cote_field': ratio_cote,
# Résultat
'ordre_arrivee': ordre,
'temps_obtenu': p.get('tempsObtenu', 0),
'top1': top1,
'top3': top3,
'top5': top5,
}
# ============================================================
# SAUVEGARDE
# ============================================================
def save_batch(rows):
if not rows:
return 0
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()
saved = 0
for row in rows:
try:
c.execute('''
INSERT OR IGNORE INTO historical_data
(date, race_name, hippodrome, distance, discipline, allocation, nb_partants, heure,
horse_name, horse_number, driver, driver_change,
age, sexe, musique, nb_courses, nb_victoires, nb_places, nb_places_2, nb_places_3,
gains_carriere, gains_annee, gains_victoires, reduction_km, avis_entraineur,
oeilleres, deferre,
cote_directe, cote_reference, indicateur_tendance, est_favori,
tx_victoire, tx_place, forme_recente, tendance_forme, nb_disq,
rang_cote, ratio_cote_field,
ordre_arrivee, temps_obtenu, top1, top3, top5)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
''', (
row['date'], row['race_name'], row['hippodrome'], row['distance'],
row['discipline'], row['allocation'], row['nb_partants'], row['heure'],
row['horse_name'], row['horse_number'], row['driver'], row['driver_change'],
row['age'], row['sexe'], row['musique'], row['nb_courses'],
row['nb_victoires'], row['nb_places'], row['nb_places_2'], row['nb_places_3'],
row['gains_carriere'], row['gains_annee'], row['gains_victoires'],
row['reduction_km'], row['avis_entraineur'], row['oeilleres'], row['deferre'],
row['cote_directe'], row['cote_reference'], row['indicateur_tendance'], row['est_favori'],
row['tx_victoire'], row['tx_place'], row['forme_recente'],
row['tendance_forme'], row['nb_disq'], row['rang_cote'], row['ratio_cote_field'],
row['ordre_arrivee'], row['temps_obtenu'], row['top1'], row['top3'], row['top5']
))
saved += c.rowcount
except Exception as e:
print(f" ⚠️ {row.get('horse_name','?')}: {e}")
conn.commit()
conn.close()
return saved
# ============================================================
# MAIN
# ============================================================
def main():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--days', type=int, default=365, help='Nombre de jours à récupérer')
parser.add_argument('--start', type=str, default=None, help='Date de début YYYY-MM-DD')
parser.add_argument('--delay', type=float, default=0.5, help='Délai entre requêtes (s)')
args = parser.parse_args()
print(f"\n{'='*60}")
print(f"📚 HISTORICAL LOADER — {datetime.now().strftime('%d/%m/%Y %H:%M')}")
print(f"{'='*60}")
init_historical_table()
# Vérifier ce qui est déjà en BDD
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()
c.execute("SELECT COUNT(DISTINCT date) as jours, COUNT(*) as lignes FROM historical_data")
existing = c.fetchone()
c.execute("SELECT MIN(date), MAX(date) FROM historical_data")
date_range = c.fetchone()
conn.close()
print(f"\n📊 Données existantes : {existing[0]} jours · {existing[1]} lignes")
if date_range[0]:
print(f" Période : {date_range[0]}{date_range[1]}")
# Calculer la plage de dates
if args.start:
start_date = datetime.strptime(args.start, '%Y-%m-%d')
else:
start_date = datetime.now() - timedelta(days=args.days)
end_date = datetime.now() - timedelta(days=1) # Hier (résultats definitifs)
total_days = (end_date - start_date).days + 1
print(f"\n📅 Période à charger : {start_date.strftime('%d/%m/%Y')}{end_date.strftime('%d/%m/%Y')} ({total_days} jours)")
print(f"⏱️ Délai entre requêtes : {args.delay}s")
print(f"⏳ Durée estimée : ~{round(total_days * args.delay * 2 / 60, 1)} minutes\n")
# Stats
total_courses = 0
total_partants = 0
total_saved = 0
errors = 0
skipped = 0
current = start_date
day_num = 0
while current <= end_date:
day_num += 1
date_str = current.strftime('%Y-%m-%d')
date_pmu = current.strftime('%d%m%Y')
# Vérifier si déjà chargé
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()
c.execute("SELECT COUNT(*) FROM historical_data WHERE date=?", (date_str,))
already = c.fetchone()[0]
conn.close()
if already > 0:
print(f" [{day_num:3}/{total_days}] {date_str} — déjà chargé ({already} partants) ⏭️")
skipped += 1
current += timedelta(days=1)
continue
try:
# Récupérer le programme
reunions = get_programme(date_pmu)
if not reunions:
print(f" [{day_num:3}/{total_days}] {date_str} — pas de programme")
current += timedelta(days=1)
time.sleep(args.delay)
continue
# Trouver le Quinté+
quinte = find_quinte(reunions)
if not quinte:
print(f" [{day_num:3}/{total_days}] {date_str} — pas de Quinté+")
current += timedelta(days=1)
time.sleep(args.delay)
continue
if not quinte['arrivee_def']:
print(f" [{day_num:3}/{total_days}] {date_str} — arrivée non définitive")
current += timedelta(days=1)
time.sleep(args.delay)
continue
time.sleep(args.delay)
# Récupérer les participants
participants = get_participants(date_pmu, quinte['num_reunion'], quinte['num_course'])
if not participants:
print(f" [{day_num:3}/{total_days}] {date_str} — pas de participants")
current += timedelta(days=1)
time.sleep(args.delay)
continue
# Extraire les features
rows = []
for p in participants:
if p.get('statut') not in ('PARTANT', None):
continue
features = extract_features(p, quinte, participants)
features['date'] = date_str
rows.append(features)
# Sauvegarder
saved = save_batch(rows)
total_courses += 1
total_partants += len(rows)
total_saved += saved
winner = next((r['horse_name'] for r in rows if r['top1']), '?')
print(f" [{day_num:3}/{total_days}] {date_str}{quinte['libelle'][:30]:<30} {len(rows)} partants · gagnant: {winner}")
except requests.exceptions.Timeout:
print(f" [{day_num:3}/{total_days}] {date_str} — timeout ⏱️")
errors += 1
time.sleep(2)
except Exception as e:
print(f" [{day_num:3}/{total_days}] {date_str} — erreur: {e}")
errors += 1
time.sleep(1)
current += timedelta(days=1)
time.sleep(args.delay)
# Résumé final
print(f"\n{'='*60}")
print(f"✅ CHARGEMENT TERMINÉ")
print(f"{'='*60}")
print(f" Courses chargées : {total_courses}")
print(f" Partants total : {total_partants}")
print(f" Lignes sauvegardées : {total_saved}")
print(f" Déjà présents : {skipped} jours")
print(f" Erreurs : {errors}")
# Statistiques BDD finale
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()
c.execute("SELECT COUNT(DISTINCT date), COUNT(*), AVG(top5) FROM historical_data")
stats = c.fetchone()
conn.close()
print(f"\n📊 BDD HISTORIQUE FINALE :")
print(f" Jours total : {stats[0]}")
print(f" Lignes total : {stats[1]}")
print(f" Taux top5 moy : {round((stats[2] or 0)*100, 1)}%")
print(f"{'='*60}\n")
if __name__ == "__main__":
main()