#!/usr/bin/env python3
"""
Improved Historical Data Loader
- Fills gaps in historical data
- Adds more features for better ML
- Supports bulk loading for more training data
"""
|
|
|
|
import requests
|
|
import sqlite3
|
|
import json
|
|
import time
|
|
import os
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
import sys
|
|
|
|
# Location of the SQLite database; the DB_PATH environment variable, when
# set, takes precedence over the built-in default path.
DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db")

# HTTP headers sent with every request issued against BASE_URL.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept': 'application/json',
}

# Root of the programme endpoints; date and race segments are appended to it.
BASE_URL = "https://turfinfo.api.pmu.fr/rest/client/1/programme"
|
|
|
|
|
|
def get_db_connection():
    """Open and return a new SQLite connection to the file at DB_PATH.

    The caller owns the returned connection and is responsible for closing it.
    """
    connection = sqlite3.connect(DB_PATH)
    return connection
|
|
|
|
|
|
def get_missing_dates(start_date=None, end_date=None):
    """Return the days in a range that have no row in historical_data.

    Args:
        start_date: First day to check (datetime). Defaults to 2025-01-01.
        end_date: Last day to check, inclusive (datetime). Defaults to
            2026-12-31.

    Returns:
        Ascending list of 'YYYY-MM-DD' strings for every day in the range
        whose date string is absent from the `date` column of historical_data.
        Empty when end_date is before start_date.

    Raises:
        sqlite3.Error: If the query fails (e.g. the table does not exist).
    """
    if start_date is None:
        start_date = datetime(2025, 1, 1)
    if end_date is None:
        # NOTE(review): the default range is fixed, so days later than the
        # day the script runs are also reported as missing; pass end_date to
        # restrict the scan.
        end_date = datetime(2026, 12, 31)

    conn = get_db_connection()
    try:
        rows = conn.execute("SELECT DISTINCT date FROM historical_data").fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    existing = {row[0] for row in rows}

    missing = []
    current = start_date
    while current <= end_date:
        date_str = current.strftime('%Y-%m-%d')
        if date_str not in existing:
            missing.append(date_str)
        current += timedelta(days=1)
    return missing
|
|
|
|
|
|
def get_programme(date_pmu):
    """Fetch the list of meetings ("reunions") published for one day.

    Args:
        date_pmu: Date segment inserted verbatim into the URL (load_date
            passes it formatted as DDMMYYYY).

    Returns:
        The list found under programme -> reunions in the JSON response, or
        an empty list when the status is not 200 or any exception occurs
        (the exception message is printed).
    """
    try:
        endpoint = f"{BASE_URL}/{date_pmu}/reunions"
        response = requests.get(endpoint, headers=HEADERS, timeout=15)
        if response.status_code == 200:
            payload = response.json()
            return payload.get('programme', {}).get('reunions', [])
        return []
    except Exception as e:
        # Best effort: a failed day is reported and treated as "no data".
        print(f"Error fetching {date_pmu}: {e}")
        return []
|
|
|
|
|
|
def find_all_quintes(reunions, min_partants=10):
    """Collect every course with enough declared runners, across all meetings.

    Despite the name, no filtering on bet type is performed: every course is
    kept as long as it has at least `min_partants` declared runners.

    Args:
        reunions: Iterable of meeting mappings; each must provide
            'numOfficiel' and 'hippodrome' -> 'libelleCourt', and may provide
            a 'courses' list.
        min_partants: Minimum value of 'nombreDeclaresPartants' for a course
            to be kept. Defaults to 10.

    Returns:
        List of dicts with keys num_reunion, num_course, libelle, hippodrome,
        distance, discipline, allocation, nb_partants, heure, arrivee_def.

    Raises:
        KeyError: If a meeting lacks 'numOfficiel' / 'hippodrome' or a kept
            course lacks 'numOrdre'.
    """
    courses = []
    for reunion in reunions:
        # `or []` tolerates an explicit null as well as a missing key.
        for course in reunion.get('courses') or []:
            # Skip small fields; a null runner count is treated as 0.
            nb_partants = course.get('nombreDeclaresPartants', 0) or 0
            if nb_partants < min_partants:
                continue

            # heureDepart is treated as epoch milliseconds and rendered in
            # the machine's local time; '13:55' is the fallback when absent.
            heure_ts = course.get('heureDepart', 0)
            if heure_ts:
                heure = datetime.fromtimestamp(heure_ts / 1000).strftime('%H:%M')
            else:
                heure = '13:55'

            courses.append({
                'num_reunion': reunion['numOfficiel'],
                'num_course': course['numOrdre'],
                'libelle': course.get('libelle', ''),
                'hippodrome': reunion['hippodrome']['libelleCourt'],
                'distance': course.get('distance', 0),
                'discipline': course.get('discipline', ''),
                'allocation': course.get('montantPrix', 0),
                'nb_partants': nb_partants,
                'heure': heure,
                'arrivee_def': course.get('arriveeDefinitive', False),
            })
    return courses
|
|
|
|
|
|
def get_participants(date_pmu, num_r, num_c):
    """Fetch the participants of one course.

    Args:
        date_pmu: Date segment inserted verbatim into the URL.
        num_r: Meeting number, placed after the 'R' prefix in the URL.
        num_c: Course number, placed after the 'C' prefix in the URL.

    Returns:
        The list stored under 'participants' in the JSON response, or an
        empty list when the status is not 200 or any exception occurs (the
        exception message is printed).
    """
    try:
        endpoint = f"{BASE_URL}/{date_pmu}/R{num_r}/C{num_c}/participants"
        response = requests.get(endpoint, headers=HEADERS, timeout=15)
        if response.status_code == 200:
            payload = response.json()
            return payload.get('participants', [])
        return []
    except Exception as e:
        # Best effort: a failed course is reported and treated as empty.
        print(f"Error fetching participants: {e}")
        return []
|
|
|
|
|
|
def parse_musique(musique):
    """Summarise a "musique" (form) string into numeric indicators.

    Parenthesised digit groups are stripped first; then each run of digits or
    a 'D' (optionally followed by one of the letters a/m/p/h/s/c) counts as
    one result. Only the first ten results are considered, and 'D' entries
    are counted as disqualifications and excluded from the averages.

    Returns:
        dict with:
          forme_recente - mean of the first three numeric results (99 if none)
          tendance      - mean of the last four numeric results minus the mean
                          of the first four (0 when fewer than four exist)
          nb_disq       - number of 'D' entries
          best_pos      - smallest numeric result (99 if none)
    """
    if not musique:
        return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0, 'best_pos': 99}

    stripped = re.sub(r'\(\d+\)', '', musique)
    tokens = re.findall(r'(\d+|D|0)([amphsc]?)', stripped)

    # 99 is the sentinel for a 'D' entry.
    # NOTE(review): a literal '0' is kept as numeric position 0, so it lowers
    # forme_recente and best_pos - confirm this is the intended meaning.
    positions = [99 if result == 'D' else int(result) for result, _ in tokens[:10]]
    if not positions:
        return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0, 'best_pos': 99}

    finishes = [value for value in positions if value != 99]
    nb_disq = len(positions) - len(finishes)

    if finishes:
        head = finishes[:3]
        forme_recente = sum(head) / len(head)
        best_pos = min(finishes)
    else:
        forme_recente = 99
        best_pos = 99

    tendance = 0
    if len(finishes) >= 4:
        tail_mean = sum(finishes[-4:]) / 4
        head_mean = sum(finishes[:4]) / 4
        tendance = round(tail_mean - head_mean, 2)

    summary = {}
    summary['forme_recente'] = round(forme_recente, 2)
    summary['tendance'] = tendance
    summary['nb_disq'] = nb_disq
    summary['best_pos'] = best_pos
    return summary
|
|
|
|
|
|
def _person_name(value):
    """Return a name from either a plain string or a {'nom': ...} mapping."""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        return value.get('nom', '') or ''
    return ''


def extract_features(p, course_info, all_participants):
    """Build the flat feature/label row for one participant.

    Args:
        p: Participant mapping as returned by the participants endpoint.
        course_info: Course mapping as built by find_all_quintes; the keys
            libelle, hippodrome, distance, discipline, allocation,
            nb_partants and heure are read.
        all_participants: Every participant mapping of the same course; used
            to rank and normalise this runner's direct odds.

    Returns:
        dict whose keys match the columns written by load_date. 'date' is
        left as None and is expected to be filled in by the caller.
    """
    musique_stats = parse_musique(p.get('musique', ''))

    # Odds: null or missing sub-objects and values collapse to {} / 0.
    rapport_direct = p.get('dernierRapportDirect') or {}
    rapport_ref = p.get('dernierRapportReference') or {}
    cote_directe = rapport_direct.get('rapport') or 0
    cote_reference = rapport_ref.get('rapport') or 0
    est_favori = 1 if rapport_direct.get('favoris') else 0

    # Career stats
    nb_courses = p.get('nombreCourses') or 0
    nb_victoires = p.get('nombreVictoires') or 0
    nb_places = p.get('nombrePlaces') or 0
    nb_p2 = p.get('nombrePlacesSecond') or 0
    nb_p3 = p.get('nombrePlacesTroisieme') or 0

    if nb_courses:
        tx_vic = round(nb_victoires / nb_courses * 100, 2)
        tx_place = round(nb_places / nb_courses * 100, 2)
    else:
        # No recorded races: avoid dividing by zero.
        tx_vic = 0
        tx_place = 0

    # Earnings
    gains = p.get('gainsParticipant') or {}
    gains_carriere = gains.get('gainsCarriere') or 0
    gains_annee = gains.get('gainsAnneeEnCours') or 0
    gains_victoires = gains.get('gainsVictoires') or 0

    # Odds rank: ascending direct odds of the whole field, with 999 standing
    # in for runners without odds. A runner without odds (cote_directe == 0)
    # is never found in the list and gets rank 99.
    all_cotes = sorted(
        (x.get('dernierRapportDirect') or {}).get('rapport') or 999
        for x in all_participants
    )
    rang_cote = all_cotes.index(cote_directe) + 1 if cote_directe in all_cotes else 99

    # Ratio of this runner's odds to the field average (placeholders excluded).
    cotes_valides = [c for c in all_cotes if c < 900]
    moy_cote = sum(cotes_valides) / len(cotes_valides) if cotes_valides else 1
    ratio_cote = round(cote_directe / moy_cote, 3) if moy_cote else 0

    # Result labels; ordre 0 means no finishing position is available.
    ordre = p.get('ordreArrivee') or 0
    top1 = 1 if ordre == 1 else 0
    top3 = 1 if 1 <= ordre <= 3 else 0
    top5 = 1 if 1 <= ordre <= 5 else 0

    # Driver/jockey: prefer 'driver', fall back to 'jockey'. Both a plain
    # string and a {'nom': ...} mapping are accepted, so a string value no
    # longer raises AttributeError (which made the caller drop the row).
    jockey_name = _person_name(p.get('driver')) or _person_name(p.get('jockey'))

    # Equipment
    oeilleres = p.get('oeilleres', '')
    deferre = p.get('deferre', '')

    return {
        'date': None,
        'race_name': course_info['libelle'],
        'hippodrome': course_info['hippodrome'],
        'distance': course_info['distance'],
        'discipline': course_info['discipline'],
        'allocation': course_info['allocation'],
        'nb_partants': course_info['nb_partants'],
        'heure': course_info['heure'],
        'horse_name': p.get('nom', ''),
        'horse_number': p.get('numero', 0),
        'driver': jockey_name,
        'age': p.get('age') or 0,
        'sexe': p.get('sexe', 'U'),
        'musique': p.get('musique', ''),
        'nb_courses': nb_courses,
        'nb_victoires': nb_victoires,
        'nb_places': nb_places,
        'nb_places_2': nb_p2,
        'nb_places_3': nb_p3,
        'gains_carriere': gains_carriere,
        'gains_annee': gains_annee,
        'gains_victoires': gains_victoires,
        'reduction_km': p.get('reductionKm') or 0,
        'avis_entraineur': p.get('avisEntraineur') or 'NEUTRE',
        'oeilleres': oeilleres or 'SANS',
        'deferre': deferre or 'NON',
        'cote_directe': cote_directe,
        'cote_reference': cote_reference,
        'indicateur_tendance': rapport_direct.get('nombreIndicateurTendance') or 0,
        'est_favori': est_favori,
        'tx_victoire': tx_vic,
        'tx_place': tx_place,
        'forme_recente': musique_stats['forme_recente'],
        'tendance_forme': musique_stats['tendance'],
        'nb_disq': musique_stats['nb_disq'],
        'rang_cote': rang_cote,
        'ratio_cote_field': ratio_cote,
        'ordre_arrivee': ordre,
        'temps_obtenu': p.get('tempsObtenu') or 0,
        'top1': top1,
        'top3': top3,
        'top5': top5,
    }
|
|
|
|
|
|
# Columns written to historical_data, in insertion order. Every name is also
# a key of the dict returned by extract_features.
_HISTORICAL_COLUMNS = (
    'date', 'race_name', 'hippodrome', 'distance', 'discipline', 'allocation',
    'nb_partants', 'heure', 'horse_name', 'horse_number', 'driver', 'age',
    'sexe', 'musique', 'nb_courses', 'nb_victoires', 'nb_places',
    'nb_places_2', 'nb_places_3', 'gains_carriere', 'gains_annee',
    'gains_victoires', 'reduction_km', 'avis_entraineur', 'oeilleres',
    'deferre', 'cote_directe', 'cote_reference', 'indicateur_tendance',
    'est_favori', 'tx_victoire', 'tx_place', 'forme_recente',
    'tendance_forme', 'nb_disq', 'rang_cote', 'ratio_cote_field',
    'ordre_arrivee', 'temps_obtenu', 'top1', 'top3', 'top5',
)

# The placeholder list is derived from the column list so that the number of
# `?` markers can never disagree with the number of columns or bound values.
_INSERT_SQL = (
    "INSERT OR IGNORE INTO historical_data ("
    + ", ".join(_HISTORICAL_COLUMNS)
    + ") VALUES ("
    + ", ".join("?" for _ in _HISTORICAL_COLUMNS)
    + ")"
)


def load_date(date_str):
    """Load every qualifying course of one day into historical_data.

    Args:
        date_str: Day to load, formatted 'YYYY-MM-DD'.

    Returns:
        Number of rows actually inserted (rows skipped by INSERT OR IGNORE
        are not counted). 0 when the day has no meetings or no qualifying
        courses.

    Raises:
        ValueError: If date_str does not match 'YYYY-MM-DD'.
    """
    date_pmu = datetime.strptime(date_str, '%Y-%m-%d').strftime('%d%m%Y')

    reunions = get_programme(date_pmu)
    if not reunions:
        return 0

    courses = find_all_quintes(reunions)
    if not courses:
        return 0

    total_loaded = 0
    conn = get_db_connection()
    try:
        c = conn.cursor()
        for course in courses:
            participants = get_participants(
                date_pmu, course['num_reunion'], course['num_course']
            )
            if not participants:
                continue

            for p in participants:
                # One bad participant must not abort the whole day: report
                # it and continue with the next one.
                try:
                    features = extract_features(p, course, participants)
                    features['date'] = date_str
                    c.execute(
                        _INSERT_SQL,
                        tuple(features[col] for col in _HISTORICAL_COLUMNS),
                    )
                    # rowcount is 0 when the row was ignored as a duplicate.
                    if c.rowcount > 0:
                        total_loaded += 1
                except Exception as e:
                    print(f" Error loading participant: {e}")

        conn.commit()
    finally:
        # Always release the connection, even if an unexpected error escapes.
        conn.close()

    return total_loaded
|
|
|
|
|
|
def main():
    """Load a batch of missing days and print a summary.

    Looks up the missing dates, loads at most the first 30 of them (pausing
    half a second between consecutive days), then prints how many rows were
    added along with the row count, day count and date range now stored in
    historical_data.
    """
    print(f"\n{'='*60}")
    print("Improved Historical Data Loader")
    print(f"{'='*60}\n")

    # Get missing dates
    missing = get_missing_dates()
    print(f"Found {len(missing)} missing dates")
    if not missing:
        print("No missing dates to load!")
        return

    # Load max 30 dates at once (limit to avoid timeout)
    dates_to_load = missing[:30]
    print(f"Loading {len(dates_to_load)} dates...\n")

    total = 0
    for i, date in enumerate(dates_to_load):
        # Rate limiting: pause between consecutive days, never before the
        # first one or after the last one.
        if i > 0:
            time.sleep(0.5)

        print(f"[{i+1}/{len(dates_to_load)}] Loading {date}...", end=" ")
        loaded = load_date(date)
        print(f"✓ {loaded} rows")
        total += loaded

    print(f"\n{'='*60}")
    print(f"Total loaded: {total} rows")

    # Show updated stats
    conn = get_db_connection()
    stats_cursor = conn.cursor()
    stats_cursor.execute("SELECT COUNT(*), COUNT(DISTINCT date) FROM historical_data")
    count, days = stats_cursor.fetchone()
    stats_cursor.execute("SELECT MIN(date), MAX(date) FROM historical_data")
    min_date, max_date = stats_cursor.fetchone()
    conn.close()

    print(f"Total in DB: {count} rows, {days} days")
    print(f"Date range: {min_date} to {max_date}")
    print(f"{'='*60}\n")
|
|
|
|
|
|
# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|