#!/usr/bin/env python3 """ Historical Loader - Récupère 1 an d'historique Quinté+ via API PMU Sauvegarde toutes les features + résultats en BDD pour entraîner le modèle ML """ import requests import sqlite3 import json import time import os import re from datetime import datetime, timedelta DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db") HEADERS = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'} BASE_URL = "https://turfinfo.api.pmu.fr/rest/client/1/programme" # ============================================================ # BASE DE DONNÉES # ============================================================ def init_historical_table(): conn = sqlite3.connect(DB_PATH) c = conn.cursor() c.execute(''' CREATE TABLE IF NOT EXISTS historical_data ( id INTEGER PRIMARY KEY AUTOINCREMENT, -- Identifiants course date TEXT NOT NULL, race_name TEXT, hippodrome TEXT, distance INTEGER, discipline TEXT, allocation REAL, nb_partants INTEGER, heure TEXT, -- Identifiants cheval horse_name TEXT NOT NULL, horse_number INTEGER, driver TEXT, driver_change INTEGER DEFAULT 0, -- Features cheval age INTEGER, sexe TEXT, musique TEXT, nb_courses INTEGER, nb_victoires INTEGER, nb_places INTEGER, nb_places_2 INTEGER, nb_places_3 INTEGER, gains_carriere REAL, gains_annee REAL, gains_victoires REAL, reduction_km INTEGER, avis_entraineur TEXT, oeilleres TEXT, deferre TEXT, -- Features marché cote_directe REAL, cote_reference REAL, indicateur_tendance REAL, est_favori INTEGER DEFAULT 0, -- Features dérivées (calculées) tx_victoire REAL, tx_place REAL, forme_recente REAL, tendance_forme REAL, nb_disq INTEGER, rang_cote INTEGER, ratio_cote_field REAL, -- Résultat officiel (variable cible) ordre_arrivee INTEGER, temps_obtenu INTEGER, top1 INTEGER DEFAULT 0, top3 INTEGER DEFAULT 0, top5 INTEGER DEFAULT 0, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, UNIQUE(date, horse_name, horse_number) ) ''') # Index pour accélérer les requêtes ML c.execute('CREATE INDEX IF NOT EXISTS idx_hist_date ON historical_data(date)') c.execute('CREATE INDEX IF NOT EXISTS idx_hist_top5 ON historical_data(top5)') c.execute('CREATE INDEX IF NOT EXISTS idx_hist_horse ON historical_data(horse_name)') conn.commit() conn.close() print(f"✅ Table historical_data initialisée : {DB_PATH}") # ============================================================ # DÉCODEUR MUSIQUE # ============================================================ def parse_musique(musique): if not musique: return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0} clean = re.sub(r'\(\d+\)', '', musique) resultats = re.findall(r'(\d+|D|0)([amphsc]?)', clean) positions = [] for pos, disc in resultats[:10]: if pos == 'D': positions.append(99) else: positions.append(int(pos)) if not positions: return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0} nb_disq = positions.count(99) positions_clean = [p for p in positions if p != 99] recentes = positions_clean[:3] forme_recente = sum(recentes) / len(recentes) if recentes else 99 if len(positions_clean) >= 4: debut = sum(positions_clean[-4:]) / 4 fin = sum(positions_clean[:4]) / 4 tendance = round(debut - fin, 2) else: tendance = 0 return { 'forme_recente': round(forme_recente, 2), 'tendance': tendance, 'nb_disq': nb_disq } # ============================================================ # API PMU # ============================================================ def get_programme(date_pmu): url = f"{BASE_URL}/{date_pmu}/reunions" r = requests.get(url, headers=HEADERS, timeout=15) if r.status_code != 200: return None return r.json().get('programme', {}).get('reunions', []) def find_quinte(reunions): for reunion in reunions: for course in reunion.get('courses', []): paris = [p['typePari'] for p in course.get('paris', [])] libelle = course.get('libelle', '') if any('QUINTE' in p for p in paris) or 'PARIS-TURF' in libelle: heure_ts = course.get('heureDepart', 0) heure = datetime.fromtimestamp(heure_ts/1000).strftime('%H:%M') if heure_ts else '13:55' return { 'num_reunion': reunion['numOfficiel'], 'num_course': course['numOrdre'], 'libelle': libelle, 'hippodrome': reunion['hippodrome']['libelleCourt'], 'distance': course.get('distance', 0), 'discipline': course.get('discipline', ''), 'allocation': course.get('montantPrix', 0), 'nb_partants': course.get('nombreDeclaresPartants', 0), 'heure': heure, 'arrivee_def': course.get('arriveeDefinitive', False), } return None def get_participants(date_pmu, num_r, num_c): url = f"{BASE_URL}/{date_pmu}/R{num_r}/C{num_c}/participants" r = requests.get(url, headers=HEADERS, timeout=15) if r.status_code != 200: return [] return r.json().get('participants', []) # ============================================================ # EXTRACTION FEATURES # ============================================================ def extract_features(p, course_info, all_participants): """Extrait toutes les features d'un partant + résultat""" musique_stats = parse_musique(p.get('musique', '')) # Cote directe rapport_direct = p.get('dernierRapportDirect', {}) or {} cote_directe = rapport_direct.get('rapport', 0) or 0 est_favori = 1 if rapport_direct.get('favoris', False) else 0 indicateur = rapport_direct.get('nombreIndicateurTendance', 0) or 0 # Cote référence rapport_ref = p.get('dernierRapportReference', {}) or {} cote_reference = rapport_ref.get('rapport', 0) or 0 # Stats carrière nb_courses = p.get('nombreCourses', 0) or 0 nb_victoires = p.get('nombreVictoires', 0) or 0 nb_places = p.get('nombrePlaces', 0) or 0 nb_p2 = p.get('nombrePlacesSecond', 0) or 0 nb_p3 = p.get('nombrePlacesTroisieme', 0) or 0 tx_vic = round(nb_victoires / nb_courses * 100, 2) if nb_courses else 0 tx_place = round(nb_places / nb_courses * 100, 2) if nb_courses else 0 # Gains gains = p.get('gainsParticipant', {}) or {} gains_carriere = gains.get('gainsCarriere', 0) or 0 gains_annee = gains.get('gainsAnneeEnCours', 0) or 0 gains_victoires= gains.get('gainsVictoires', 0) or 0 # Rang cote dans le field all_cotes = sorted([ (x.get('dernierRapportDirect', {}) or {}).get('rapport', 999) or 999 for x in all_participants ]) rang_cote = all_cotes.index(cote_directe) + 1 if cote_directe in all_cotes else 99 # Ratio cote / moyenne field cotes_valides = [c for c in all_cotes if c < 900] moy_cote = sum(cotes_valides) / len(cotes_valides) if cotes_valides else 1 ratio_cote = round(cote_directe / moy_cote, 3) if moy_cote else 0 # Résultat ordre = p.get('ordreArrivee', 0) or 0 top1 = 1 if ordre == 1 else 0 top3 = 1 if 1 <= ordre <= 3 else 0 top5 = 1 if 1 <= ordre <= 5 else 0 return { # Course 'date': None, # rempli par l'appelant 'race_name': course_info['libelle'], 'hippodrome': course_info['hippodrome'], 'distance': course_info['distance'], 'discipline': course_info['discipline'], 'allocation': course_info['allocation'], 'nb_partants': course_info['nb_partants'], 'heure': course_info['heure'], # Cheval 'horse_name': p.get('nom', ''), 'horse_number': p.get('numPmu', 0), 'driver': p.get('driver', ''), 'driver_change': 1 if p.get('driverChange', False) else 0, # Features 'age': p.get('age', 0), 'sexe': p.get('sexe', ''), 'musique': p.get('musique', ''), 'nb_courses': nb_courses, 'nb_victoires': nb_victoires, 'nb_places': nb_places, 'nb_places_2': nb_p2, 'nb_places_3': nb_p3, 'gains_carriere': gains_carriere, 'gains_annee': gains_annee, 'gains_victoires': gains_victoires, 'reduction_km': p.get('reductionKilometrique', 0), 'avis_entraineur': p.get('avisEntraineur', 'NEUTRE'), 'oeilleres': p.get('oeilleres', ''), 'deferre': p.get('deferre', ''), # Marché 'cote_directe': cote_directe, 'cote_reference': cote_reference, 'indicateur_tendance': indicateur, 'est_favori': est_favori, # Dérivées 'tx_victoire': tx_vic, 'tx_place': tx_place, 'forme_recente': musique_stats['forme_recente'], 'tendance_forme': musique_stats['tendance'], 'nb_disq': musique_stats['nb_disq'], 'rang_cote': rang_cote, 'ratio_cote_field': ratio_cote, # Résultat 'ordre_arrivee': ordre, 'temps_obtenu': p.get('tempsObtenu', 0), 'top1': top1, 'top3': top3, 'top5': top5, } # ============================================================ # SAUVEGARDE # ============================================================ def save_batch(rows): if not rows: return 0 conn = sqlite3.connect(DB_PATH) c = conn.cursor() saved = 0 for row in rows: try: c.execute(''' INSERT OR IGNORE INTO historical_data (date, race_name, hippodrome, distance, discipline, allocation, nb_partants, heure, horse_name, horse_number, driver, driver_change, age, sexe, musique, nb_courses, nb_victoires, nb_places, nb_places_2, nb_places_3, gains_carriere, gains_annee, gains_victoires, reduction_km, avis_entraineur, oeilleres, deferre, cote_directe, cote_reference, indicateur_tendance, est_favori, tx_victoire, tx_place, forme_recente, tendance_forme, nb_disq, rang_cote, ratio_cote_field, ordre_arrivee, temps_obtenu, top1, top3, top5) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) ''', ( row['date'], row['race_name'], row['hippodrome'], row['distance'], row['discipline'], row['allocation'], row['nb_partants'], row['heure'], row['horse_name'], row['horse_number'], row['driver'], row['driver_change'], row['age'], row['sexe'], row['musique'], row['nb_courses'], row['nb_victoires'], row['nb_places'], row['nb_places_2'], row['nb_places_3'], row['gains_carriere'], row['gains_annee'], row['gains_victoires'], row['reduction_km'], row['avis_entraineur'], row['oeilleres'], row['deferre'], row['cote_directe'], row['cote_reference'], row['indicateur_tendance'], row['est_favori'], row['tx_victoire'], row['tx_place'], row['forme_recente'], row['tendance_forme'], row['nb_disq'], row['rang_cote'], row['ratio_cote_field'], row['ordre_arrivee'], row['temps_obtenu'], row['top1'], row['top3'], row['top5'] )) saved += c.rowcount except Exception as e: print(f" ⚠️ {row.get('horse_name','?')}: {e}") conn.commit() conn.close() return saved # ============================================================ # MAIN # ============================================================ def main(): import argparse parser = argparse.ArgumentParser() parser.add_argument('--days', type=int, default=365, help='Nombre de jours à récupérer') parser.add_argument('--start', type=str, default=None, help='Date de début YYYY-MM-DD') parser.add_argument('--delay', type=float, default=0.5, help='Délai entre requêtes (s)') args = parser.parse_args() print(f"\n{'='*60}") print(f"📚 HISTORICAL LOADER — {datetime.now().strftime('%d/%m/%Y %H:%M')}") print(f"{'='*60}") init_historical_table() # Vérifier ce qui est déjà en BDD conn = sqlite3.connect(DB_PATH) c = conn.cursor() c.execute("SELECT COUNT(DISTINCT date) as jours, COUNT(*) as lignes FROM historical_data") existing = c.fetchone() c.execute("SELECT MIN(date), MAX(date) FROM historical_data") date_range = c.fetchone() conn.close() print(f"\n📊 Données existantes : {existing[0]} jours · {existing[1]} lignes") if date_range[0]: print(f" Période : {date_range[0]} → {date_range[1]}") # Calculer la plage de dates if args.start: start_date = datetime.strptime(args.start, '%Y-%m-%d') else: start_date = datetime.now() - timedelta(days=args.days) end_date = datetime.now() - timedelta(days=1) # Hier (résultats definitifs) total_days = (end_date - start_date).days + 1 print(f"\n📅 Période à charger : {start_date.strftime('%d/%m/%Y')} → {end_date.strftime('%d/%m/%Y')} ({total_days} jours)") print(f"⏱️ Délai entre requêtes : {args.delay}s") print(f"⏳ Durée estimée : ~{round(total_days * args.delay * 2 / 60, 1)} minutes\n") # Stats total_courses = 0 total_partants = 0 total_saved = 0 errors = 0 skipped = 0 current = start_date day_num = 0 while current <= end_date: day_num += 1 date_str = current.strftime('%Y-%m-%d') date_pmu = current.strftime('%d%m%Y') # Vérifier si déjà chargé conn = sqlite3.connect(DB_PATH) c = conn.cursor() c.execute("SELECT COUNT(*) FROM historical_data WHERE date=?", (date_str,)) already = c.fetchone()[0] conn.close() if already > 0: print(f" [{day_num:3}/{total_days}] {date_str} — déjà chargé ({already} partants) ⏭️") skipped += 1 current += timedelta(days=1) continue try: # Récupérer le programme reunions = get_programme(date_pmu) if not reunions: print(f" [{day_num:3}/{total_days}] {date_str} — pas de programme") current += timedelta(days=1) time.sleep(args.delay) continue # Trouver le Quinté+ quinte = find_quinte(reunions) if not quinte: print(f" [{day_num:3}/{total_days}] {date_str} — pas de Quinté+") current += timedelta(days=1) time.sleep(args.delay) continue if not quinte['arrivee_def']: print(f" [{day_num:3}/{total_days}] {date_str} — arrivée non définitive") current += timedelta(days=1) time.sleep(args.delay) continue time.sleep(args.delay) # Récupérer les participants participants = get_participants(date_pmu, quinte['num_reunion'], quinte['num_course']) if not participants: print(f" [{day_num:3}/{total_days}] {date_str} — pas de participants") current += timedelta(days=1) time.sleep(args.delay) continue # Extraire les features rows = [] for p in participants: if p.get('statut') not in ('PARTANT', None): continue features = extract_features(p, quinte, participants) features['date'] = date_str rows.append(features) # Sauvegarder saved = save_batch(rows) total_courses += 1 total_partants += len(rows) total_saved += saved winner = next((r['horse_name'] for r in rows if r['top1']), '?') print(f" [{day_num:3}/{total_days}] {date_str} — {quinte['libelle'][:30]:<30} {len(rows)} partants · gagnant: {winner} ✅") except requests.exceptions.Timeout: print(f" [{day_num:3}/{total_days}] {date_str} — timeout ⏱️") errors += 1 time.sleep(2) except Exception as e: print(f" [{day_num:3}/{total_days}] {date_str} — erreur: {e} ❌") errors += 1 time.sleep(1) current += timedelta(days=1) time.sleep(args.delay) # Résumé final print(f"\n{'='*60}") print(f"✅ CHARGEMENT TERMINÉ") print(f"{'='*60}") print(f" Courses chargées : {total_courses}") print(f" Partants total : {total_partants}") print(f" Lignes sauvegardées : {total_saved}") print(f" Déjà présents : {skipped} jours") print(f" Erreurs : {errors}") # Statistiques BDD finale conn = sqlite3.connect(DB_PATH) c = conn.cursor() c.execute("SELECT COUNT(DISTINCT date), COUNT(*), AVG(top5) FROM historical_data") stats = c.fetchone() conn.close() print(f"\n📊 BDD HISTORIQUE FINALE :") print(f" Jours total : {stats[0]}") print(f" Lignes total : {stats[1]}") print(f" Taux top5 moy : {round((stats[2] or 0)*100, 1)}%") print(f"{'='*60}\n") if __name__ == "__main__": main()