502 lines
18 KiB
Python
Executable File
502 lines
18 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Historical Loader - Récupère 1 an d'historique Quinté+ via API PMU
|
|
Sauvegarde toutes les features + résultats en BDD pour entraîner le modèle ML
|
|
"""
|
|
|
|
import requests
|
|
import sqlite3
|
|
import json
|
|
import time
|
|
import os
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
|
|
DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db")
|
|
HEADERS = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
|
|
BASE_URL = "https://turfinfo.api.pmu.fr/rest/client/1/programme"
|
|
|
|
# ============================================================
|
|
# BASE DE DONNÉES
|
|
# ============================================================
|
|
|
|
def init_historical_table():
|
|
conn = sqlite3.connect(DB_PATH)
|
|
c = conn.cursor()
|
|
c.execute('''
|
|
CREATE TABLE IF NOT EXISTS historical_data (
|
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
|
|
-- Identifiants course
|
|
date TEXT NOT NULL,
|
|
race_name TEXT,
|
|
hippodrome TEXT,
|
|
distance INTEGER,
|
|
discipline TEXT,
|
|
allocation REAL,
|
|
nb_partants INTEGER,
|
|
heure TEXT,
|
|
|
|
-- Identifiants cheval
|
|
horse_name TEXT NOT NULL,
|
|
horse_number INTEGER,
|
|
driver TEXT,
|
|
driver_change INTEGER DEFAULT 0,
|
|
|
|
-- Features cheval
|
|
age INTEGER,
|
|
sexe TEXT,
|
|
musique TEXT,
|
|
nb_courses INTEGER,
|
|
nb_victoires INTEGER,
|
|
nb_places INTEGER,
|
|
nb_places_2 INTEGER,
|
|
nb_places_3 INTEGER,
|
|
gains_carriere REAL,
|
|
gains_annee REAL,
|
|
gains_victoires REAL,
|
|
reduction_km INTEGER,
|
|
avis_entraineur TEXT,
|
|
oeilleres TEXT,
|
|
deferre TEXT,
|
|
|
|
-- Features marché
|
|
cote_directe REAL,
|
|
cote_reference REAL,
|
|
indicateur_tendance REAL,
|
|
est_favori INTEGER DEFAULT 0,
|
|
|
|
-- Features dérivées (calculées)
|
|
tx_victoire REAL,
|
|
tx_place REAL,
|
|
forme_recente REAL,
|
|
tendance_forme REAL,
|
|
nb_disq INTEGER,
|
|
rang_cote INTEGER,
|
|
ratio_cote_field REAL,
|
|
|
|
-- Résultat officiel (variable cible)
|
|
ordre_arrivee INTEGER,
|
|
temps_obtenu INTEGER,
|
|
top1 INTEGER DEFAULT 0,
|
|
top3 INTEGER DEFAULT 0,
|
|
top5 INTEGER DEFAULT 0,
|
|
|
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
|
|
UNIQUE(date, horse_name, horse_number)
|
|
)
|
|
''')
|
|
|
|
# Index pour accélérer les requêtes ML
|
|
c.execute('CREATE INDEX IF NOT EXISTS idx_hist_date ON historical_data(date)')
|
|
c.execute('CREATE INDEX IF NOT EXISTS idx_hist_top5 ON historical_data(top5)')
|
|
c.execute('CREATE INDEX IF NOT EXISTS idx_hist_horse ON historical_data(horse_name)')
|
|
|
|
conn.commit()
|
|
conn.close()
|
|
print(f"✅ Table historical_data initialisée : {DB_PATH}")
|
|
|
|
|
|
# ============================================================
|
|
# DÉCODEUR MUSIQUE
|
|
# ============================================================
|
|
|
|
def parse_musique(musique):
|
|
if not musique:
|
|
return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0}
|
|
|
|
clean = re.sub(r'\(\d+\)', '', musique)
|
|
resultats = re.findall(r'(\d+|D|0)([amphsc]?)', clean)
|
|
|
|
positions = []
|
|
for pos, disc in resultats[:10]:
|
|
if pos == 'D':
|
|
positions.append(99)
|
|
else:
|
|
positions.append(int(pos))
|
|
|
|
if not positions:
|
|
return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0}
|
|
|
|
nb_disq = positions.count(99)
|
|
positions_clean = [p for p in positions if p != 99]
|
|
|
|
recentes = positions_clean[:3]
|
|
forme_recente = sum(recentes) / len(recentes) if recentes else 99
|
|
|
|
if len(positions_clean) >= 4:
|
|
debut = sum(positions_clean[-4:]) / 4
|
|
fin = sum(positions_clean[:4]) / 4
|
|
tendance = round(debut - fin, 2)
|
|
else:
|
|
tendance = 0
|
|
|
|
return {
|
|
'forme_recente': round(forme_recente, 2),
|
|
'tendance': tendance,
|
|
'nb_disq': nb_disq
|
|
}
|
|
|
|
|
|
# ============================================================
|
|
# API PMU
|
|
# ============================================================
|
|
|
|
def get_programme(date_pmu):
|
|
url = f"{BASE_URL}/{date_pmu}/reunions"
|
|
r = requests.get(url, headers=HEADERS, timeout=15)
|
|
if r.status_code != 200:
|
|
return None
|
|
return r.json().get('programme', {}).get('reunions', [])
|
|
|
|
|
|
def find_quinte(reunions):
|
|
for reunion in reunions:
|
|
for course in reunion.get('courses', []):
|
|
paris = [p['typePari'] for p in course.get('paris', [])]
|
|
libelle = course.get('libelle', '')
|
|
if any('QUINTE' in p for p in paris) or 'PARIS-TURF' in libelle:
|
|
heure_ts = course.get('heureDepart', 0)
|
|
heure = datetime.fromtimestamp(heure_ts/1000).strftime('%H:%M') if heure_ts else '13:55'
|
|
return {
|
|
'num_reunion': reunion['numOfficiel'],
|
|
'num_course': course['numOrdre'],
|
|
'libelle': libelle,
|
|
'hippodrome': reunion['hippodrome']['libelleCourt'],
|
|
'distance': course.get('distance', 0),
|
|
'discipline': course.get('discipline', ''),
|
|
'allocation': course.get('montantPrix', 0),
|
|
'nb_partants': course.get('nombreDeclaresPartants', 0),
|
|
'heure': heure,
|
|
'arrivee_def': course.get('arriveeDefinitive', False),
|
|
}
|
|
return None
|
|
|
|
|
|
def get_participants(date_pmu, num_r, num_c):
|
|
url = f"{BASE_URL}/{date_pmu}/R{num_r}/C{num_c}/participants"
|
|
r = requests.get(url, headers=HEADERS, timeout=15)
|
|
if r.status_code != 200:
|
|
return []
|
|
return r.json().get('participants', [])
|
|
|
|
|
|
# ============================================================
|
|
# EXTRACTION FEATURES
|
|
# ============================================================
|
|
|
|
def extract_features(p, course_info, all_participants):
|
|
"""Extrait toutes les features d'un partant + résultat"""
|
|
|
|
musique_stats = parse_musique(p.get('musique', ''))
|
|
|
|
# Cote directe
|
|
rapport_direct = p.get('dernierRapportDirect', {}) or {}
|
|
cote_directe = rapport_direct.get('rapport', 0) or 0
|
|
est_favori = 1 if rapport_direct.get('favoris', False) else 0
|
|
indicateur = rapport_direct.get('nombreIndicateurTendance', 0) or 0
|
|
|
|
# Cote référence
|
|
rapport_ref = p.get('dernierRapportReference', {}) or {}
|
|
cote_reference = rapport_ref.get('rapport', 0) or 0
|
|
|
|
# Stats carrière
|
|
nb_courses = p.get('nombreCourses', 0) or 0
|
|
nb_victoires = p.get('nombreVictoires', 0) or 0
|
|
nb_places = p.get('nombrePlaces', 0) or 0
|
|
nb_p2 = p.get('nombrePlacesSecond', 0) or 0
|
|
nb_p3 = p.get('nombrePlacesTroisieme', 0) or 0
|
|
|
|
tx_vic = round(nb_victoires / nb_courses * 100, 2) if nb_courses else 0
|
|
tx_place = round(nb_places / nb_courses * 100, 2) if nb_courses else 0
|
|
|
|
# Gains
|
|
gains = p.get('gainsParticipant', {}) or {}
|
|
gains_carriere = gains.get('gainsCarriere', 0) or 0
|
|
gains_annee = gains.get('gainsAnneeEnCours', 0) or 0
|
|
gains_victoires= gains.get('gainsVictoires', 0) or 0
|
|
|
|
# Rang cote dans le field
|
|
all_cotes = sorted([
|
|
(x.get('dernierRapportDirect', {}) or {}).get('rapport', 999) or 999
|
|
for x in all_participants
|
|
])
|
|
rang_cote = all_cotes.index(cote_directe) + 1 if cote_directe in all_cotes else 99
|
|
|
|
# Ratio cote / moyenne field
|
|
cotes_valides = [c for c in all_cotes if c < 900]
|
|
moy_cote = sum(cotes_valides) / len(cotes_valides) if cotes_valides else 1
|
|
ratio_cote = round(cote_directe / moy_cote, 3) if moy_cote else 0
|
|
|
|
# Résultat
|
|
ordre = p.get('ordreArrivee', 0) or 0
|
|
top1 = 1 if ordre == 1 else 0
|
|
top3 = 1 if 1 <= ordre <= 3 else 0
|
|
top5 = 1 if 1 <= ordre <= 5 else 0
|
|
|
|
return {
|
|
# Course
|
|
'date': None, # rempli par l'appelant
|
|
'race_name': course_info['libelle'],
|
|
'hippodrome': course_info['hippodrome'],
|
|
'distance': course_info['distance'],
|
|
'discipline': course_info['discipline'],
|
|
'allocation': course_info['allocation'],
|
|
'nb_partants': course_info['nb_partants'],
|
|
'heure': course_info['heure'],
|
|
|
|
# Cheval
|
|
'horse_name': p.get('nom', ''),
|
|
'horse_number': p.get('numPmu', 0),
|
|
'driver': p.get('driver', ''),
|
|
'driver_change': 1 if p.get('driverChange', False) else 0,
|
|
|
|
# Features
|
|
'age': p.get('age', 0),
|
|
'sexe': p.get('sexe', ''),
|
|
'musique': p.get('musique', ''),
|
|
'nb_courses': nb_courses,
|
|
'nb_victoires': nb_victoires,
|
|
'nb_places': nb_places,
|
|
'nb_places_2': nb_p2,
|
|
'nb_places_3': nb_p3,
|
|
'gains_carriere': gains_carriere,
|
|
'gains_annee': gains_annee,
|
|
'gains_victoires': gains_victoires,
|
|
'reduction_km': p.get('reductionKilometrique', 0),
|
|
'avis_entraineur': p.get('avisEntraineur', 'NEUTRE'),
|
|
'oeilleres': p.get('oeilleres', ''),
|
|
'deferre': p.get('deferre', ''),
|
|
|
|
# Marché
|
|
'cote_directe': cote_directe,
|
|
'cote_reference': cote_reference,
|
|
'indicateur_tendance': indicateur,
|
|
'est_favori': est_favori,
|
|
|
|
# Dérivées
|
|
'tx_victoire': tx_vic,
|
|
'tx_place': tx_place,
|
|
'forme_recente': musique_stats['forme_recente'],
|
|
'tendance_forme': musique_stats['tendance'],
|
|
'nb_disq': musique_stats['nb_disq'],
|
|
'rang_cote': rang_cote,
|
|
'ratio_cote_field': ratio_cote,
|
|
|
|
# Résultat
|
|
'ordre_arrivee': ordre,
|
|
'temps_obtenu': p.get('tempsObtenu', 0),
|
|
'top1': top1,
|
|
'top3': top3,
|
|
'top5': top5,
|
|
}
|
|
|
|
|
|
# ============================================================
|
|
# SAUVEGARDE
|
|
# ============================================================
|
|
|
|
def save_batch(rows):
|
|
if not rows:
|
|
return 0
|
|
conn = sqlite3.connect(DB_PATH)
|
|
c = conn.cursor()
|
|
saved = 0
|
|
for row in rows:
|
|
try:
|
|
c.execute('''
|
|
INSERT OR IGNORE INTO historical_data
|
|
(date, race_name, hippodrome, distance, discipline, allocation, nb_partants, heure,
|
|
horse_name, horse_number, driver, driver_change,
|
|
age, sexe, musique, nb_courses, nb_victoires, nb_places, nb_places_2, nb_places_3,
|
|
gains_carriere, gains_annee, gains_victoires, reduction_km, avis_entraineur,
|
|
oeilleres, deferre,
|
|
cote_directe, cote_reference, indicateur_tendance, est_favori,
|
|
tx_victoire, tx_place, forme_recente, tendance_forme, nb_disq,
|
|
rang_cote, ratio_cote_field,
|
|
ordre_arrivee, temps_obtenu, top1, top3, top5)
|
|
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
|
''', (
|
|
row['date'], row['race_name'], row['hippodrome'], row['distance'],
|
|
row['discipline'], row['allocation'], row['nb_partants'], row['heure'],
|
|
row['horse_name'], row['horse_number'], row['driver'], row['driver_change'],
|
|
row['age'], row['sexe'], row['musique'], row['nb_courses'],
|
|
row['nb_victoires'], row['nb_places'], row['nb_places_2'], row['nb_places_3'],
|
|
row['gains_carriere'], row['gains_annee'], row['gains_victoires'],
|
|
row['reduction_km'], row['avis_entraineur'], row['oeilleres'], row['deferre'],
|
|
row['cote_directe'], row['cote_reference'], row['indicateur_tendance'], row['est_favori'],
|
|
row['tx_victoire'], row['tx_place'], row['forme_recente'],
|
|
row['tendance_forme'], row['nb_disq'], row['rang_cote'], row['ratio_cote_field'],
|
|
row['ordre_arrivee'], row['temps_obtenu'], row['top1'], row['top3'], row['top5']
|
|
))
|
|
saved += c.rowcount
|
|
except Exception as e:
|
|
print(f" ⚠️ {row.get('horse_name','?')}: {e}")
|
|
conn.commit()
|
|
conn.close()
|
|
return saved
|
|
|
|
|
|
# ============================================================
|
|
# MAIN
|
|
# ============================================================
|
|
|
|
def main():
|
|
import argparse
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument('--days', type=int, default=365, help='Nombre de jours à récupérer')
|
|
parser.add_argument('--start', type=str, default=None, help='Date de début YYYY-MM-DD')
|
|
parser.add_argument('--delay', type=float, default=0.5, help='Délai entre requêtes (s)')
|
|
args = parser.parse_args()
|
|
|
|
print(f"\n{'='*60}")
|
|
print(f"📚 HISTORICAL LOADER — {datetime.now().strftime('%d/%m/%Y %H:%M')}")
|
|
print(f"{'='*60}")
|
|
|
|
init_historical_table()
|
|
|
|
# Vérifier ce qui est déjà en BDD
|
|
conn = sqlite3.connect(DB_PATH)
|
|
c = conn.cursor()
|
|
c.execute("SELECT COUNT(DISTINCT date) as jours, COUNT(*) as lignes FROM historical_data")
|
|
existing = c.fetchone()
|
|
c.execute("SELECT MIN(date), MAX(date) FROM historical_data")
|
|
date_range = c.fetchone()
|
|
conn.close()
|
|
|
|
print(f"\n📊 Données existantes : {existing[0]} jours · {existing[1]} lignes")
|
|
if date_range[0]:
|
|
print(f" Période : {date_range[0]} → {date_range[1]}")
|
|
|
|
# Calculer la plage de dates
|
|
if args.start:
|
|
start_date = datetime.strptime(args.start, '%Y-%m-%d')
|
|
else:
|
|
start_date = datetime.now() - timedelta(days=args.days)
|
|
|
|
end_date = datetime.now() - timedelta(days=1) # Hier (résultats definitifs)
|
|
|
|
total_days = (end_date - start_date).days + 1
|
|
print(f"\n📅 Période à charger : {start_date.strftime('%d/%m/%Y')} → {end_date.strftime('%d/%m/%Y')} ({total_days} jours)")
|
|
print(f"⏱️ Délai entre requêtes : {args.delay}s")
|
|
print(f"⏳ Durée estimée : ~{round(total_days * args.delay * 2 / 60, 1)} minutes\n")
|
|
|
|
# Stats
|
|
total_courses = 0
|
|
total_partants = 0
|
|
total_saved = 0
|
|
errors = 0
|
|
skipped = 0
|
|
|
|
current = start_date
|
|
day_num = 0
|
|
|
|
while current <= end_date:
|
|
day_num += 1
|
|
date_str = current.strftime('%Y-%m-%d')
|
|
date_pmu = current.strftime('%d%m%Y')
|
|
|
|
# Vérifier si déjà chargé
|
|
conn = sqlite3.connect(DB_PATH)
|
|
c = conn.cursor()
|
|
c.execute("SELECT COUNT(*) FROM historical_data WHERE date=?", (date_str,))
|
|
already = c.fetchone()[0]
|
|
conn.close()
|
|
|
|
if already > 0:
|
|
print(f" [{day_num:3}/{total_days}] {date_str} — déjà chargé ({already} partants) ⏭️")
|
|
skipped += 1
|
|
current += timedelta(days=1)
|
|
continue
|
|
|
|
try:
|
|
# Récupérer le programme
|
|
reunions = get_programme(date_pmu)
|
|
if not reunions:
|
|
print(f" [{day_num:3}/{total_days}] {date_str} — pas de programme")
|
|
current += timedelta(days=1)
|
|
time.sleep(args.delay)
|
|
continue
|
|
|
|
# Trouver le Quinté+
|
|
quinte = find_quinte(reunions)
|
|
if not quinte:
|
|
print(f" [{day_num:3}/{total_days}] {date_str} — pas de Quinté+")
|
|
current += timedelta(days=1)
|
|
time.sleep(args.delay)
|
|
continue
|
|
|
|
if not quinte['arrivee_def']:
|
|
print(f" [{day_num:3}/{total_days}] {date_str} — arrivée non définitive")
|
|
current += timedelta(days=1)
|
|
time.sleep(args.delay)
|
|
continue
|
|
|
|
time.sleep(args.delay)
|
|
|
|
# Récupérer les participants
|
|
participants = get_participants(date_pmu, quinte['num_reunion'], quinte['num_course'])
|
|
if not participants:
|
|
print(f" [{day_num:3}/{total_days}] {date_str} — pas de participants")
|
|
current += timedelta(days=1)
|
|
time.sleep(args.delay)
|
|
continue
|
|
|
|
# Extraire les features
|
|
rows = []
|
|
for p in participants:
|
|
if p.get('statut') not in ('PARTANT', None):
|
|
continue
|
|
features = extract_features(p, quinte, participants)
|
|
features['date'] = date_str
|
|
rows.append(features)
|
|
|
|
# Sauvegarder
|
|
saved = save_batch(rows)
|
|
total_courses += 1
|
|
total_partants += len(rows)
|
|
total_saved += saved
|
|
|
|
winner = next((r['horse_name'] for r in rows if r['top1']), '?')
|
|
print(f" [{day_num:3}/{total_days}] {date_str} — {quinte['libelle'][:30]:<30} {len(rows)} partants · gagnant: {winner} ✅")
|
|
|
|
except requests.exceptions.Timeout:
|
|
print(f" [{day_num:3}/{total_days}] {date_str} — timeout ⏱️")
|
|
errors += 1
|
|
time.sleep(2)
|
|
except Exception as e:
|
|
print(f" [{day_num:3}/{total_days}] {date_str} — erreur: {e} ❌")
|
|
errors += 1
|
|
time.sleep(1)
|
|
|
|
current += timedelta(days=1)
|
|
time.sleep(args.delay)
|
|
|
|
# Résumé final
|
|
print(f"\n{'='*60}")
|
|
print(f"✅ CHARGEMENT TERMINÉ")
|
|
print(f"{'='*60}")
|
|
print(f" Courses chargées : {total_courses}")
|
|
print(f" Partants total : {total_partants}")
|
|
print(f" Lignes sauvegardées : {total_saved}")
|
|
print(f" Déjà présents : {skipped} jours")
|
|
print(f" Erreurs : {errors}")
|
|
|
|
# Statistiques BDD finale
|
|
conn = sqlite3.connect(DB_PATH)
|
|
c = conn.cursor()
|
|
c.execute("SELECT COUNT(DISTINCT date), COUNT(*), AVG(top5) FROM historical_data")
|
|
stats = c.fetchone()
|
|
conn.close()
|
|
|
|
print(f"\n📊 BDD HISTORIQUE FINALE :")
|
|
print(f" Jours total : {stats[0]}")
|
|
print(f" Lignes total : {stats[1]}")
|
|
print(f" Taux top5 moy : {round((stats[2] or 0)*100, 1)}%")
|
|
print(f"{'='*60}\n")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|