#!/usr/bin/env python3
"""
Improved Historical Data Loader
- Fills gaps in historical data
- Adds more features for better ML
- Supports bulk loading for more training data
"""

import json
import os
import re
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import closing
from datetime import date, datetime, timedelta

import requests

DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db")
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept': 'application/json',
}
BASE_URL = "https://turfinfo.api.pmu.fr/rest/client/1/programme"

REQUEST_TIMEOUT = 15          # seconds, per HTTP request
DEFAULT_START_DATE = date(2025, 1, 1)
DEFAULT_END_DATE = date(2026, 12, 31)
MIN_PARTANTS = 10             # courses with fewer declared runners are skipped
MAX_DATES_PER_RUN = 30        # batch size of one invocation of main()
RATE_LIMIT_SECONDS = 0.5      # pause between two dates
DEFAULT_HEURE = '13:55'       # used when a course carries no start timestamp

# Single source of truth for the INSERT statement: the column order here is
# also the order in which values are pulled out of the feature dict.
HISTORICAL_COLUMNS = (
    'date', 'race_name', 'hippodrome', 'distance', 'discipline', 'allocation',
    'nb_partants', 'heure', 'horse_name', 'horse_number', 'driver', 'age',
    'sexe', 'musique', 'nb_courses', 'nb_victoires', 'nb_places',
    'nb_places_2', 'nb_places_3', 'gains_carriere', 'gains_annee',
    'gains_victoires', 'reduction_km', 'avis_entraineur', 'oeilleres',
    'deferre', 'cote_directe', 'cote_reference', 'indicateur_tendance',
    'est_favori', 'tx_victoire', 'tx_place', 'forme_recente',
    'tendance_forme', 'nb_disq', 'rang_cote', 'ratio_cote_field',
    'ordre_arrivee', 'temps_obtenu', 'top1', 'top3', 'top5',
)
# Placeholders are generated from the column list so the two can never drift
# apart. Column names are constants of this module, never external input.
INSERT_SQL = (
    f"INSERT OR IGNORE INTO historical_data ({', '.join(HISTORICAL_COLUMNS)}) "
    f"VALUES ({', '.join(['?'] * len(HISTORICAL_COLUMNS))})"
)

_MUSIQUE_YEAR_RE = re.compile(r'\(\d+\)')
_MUSIQUE_RESULT_RE = re.compile(r'(\d+|D|0)([amphsc]?)')
_EMPTY_MUSIQUE_STATS = {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0, 'best_pos': 99}


def get_db_connection():
    """Open a new SQLite connection to DB_PATH. The caller must close it."""
    return sqlite3.connect(DB_PATH)


def _as_date(value):
    """Return *value* as a plain date (datetime instances are truncated)."""
    return value.date() if isinstance(value, datetime) else value


def get_missing_dates(start_date=None, end_date=None):
    """Find dates missing from historical_data.

    Args:
        start_date: First day to consider (date or datetime). Defaults to
            DEFAULT_START_DATE.
        end_date: Last day to consider (date or datetime). Defaults to
            DEFAULT_END_DATE. It is always capped at yesterday.

    Returns:
        Ascending list of 'YYYY-MM-DD' strings that have no row in
        historical_data.
    """
    start = DEFAULT_START_DATE if start_date is None else _as_date(start_date)
    end = DEFAULT_END_DATE if end_date is None else _as_date(end_date)
    # Never request today or the future: such races have no result yet, their
    # rows would be stored with ordre_arrivee 0, and since the date would then
    # exist in the table it would never be loaded again.
    end = min(end, date.today() - timedelta(days=1))

    with closing(get_db_connection()) as conn:
        existing = {
            row[0]
            for row in conn.execute("SELECT DISTINCT date FROM historical_data")
        }

    missing = []
    current = start
    while current <= end:
        date_str = current.isoformat()
        if date_str not in existing:
            missing.append(date_str)
        current += timedelta(days=1)
    return missing


def _fetch_json(url):
    """GET *url* and return the decoded JSON object.

    Returns None for a non-200 status or a payload that is not a JSON object.
    Raises requests.RequestException on transport errors and ValueError on an
    undecodable body; callers turn those into their own log line.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        return None
    payload = response.json()
    return payload if isinstance(payload, dict) else None


def get_programme(date_pmu):
    """Get program for a given date.

    Args:
        date_pmu: Date formatted as 'DDMMYYYY'.

    Returns:
        The list found under programme.reunions, or [] on any failure.
    """
    try:
        payload = _fetch_json(f"{BASE_URL}/{date_pmu}/reunions")
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching {date_pmu}: {e}")
        return []
    if not payload:
        return []
    programme = payload.get('programme')
    if not isinstance(programme, dict):
        return []
    return programme.get('reunions') or []


def _course_summary(reunion, course, nb_partants):
    """Build the course-level dict shared by every participant of a course."""
    heure_ts = course.get('heureDepart', 0)
    if heure_ts:
        # The timestamp is divided by 1000 before conversion (milliseconds);
        # fromtimestamp() renders it in the machine's local timezone.
        heure = datetime.fromtimestamp(heure_ts / 1000).strftime('%H:%M')
    else:
        heure = DEFAULT_HEURE
    return {
        'num_reunion': reunion['numOfficiel'],
        'num_course': course['numOrdre'],
        'libelle': course.get('libelle', ''),
        'hippodrome': reunion['hippodrome']['libelleCourt'],
        'distance': course.get('distance', 0),
        'discipline': course.get('discipline', ''),
        'allocation': course.get('montantPrix', 0),
        'nb_partants': nb_partants,
        'heure': heure,
        'arrivee_def': course.get('arriveeDefinitive', False),
    }


def find_all_quintes(reunions, min_partants=MIN_PARTANTS):
    """Find all courses (not just Quinte+) - more training data.

    Args:
        reunions: Iterable of reunion dicts as returned by get_programme().
        min_partants: Courses declaring fewer runners than this are skipped.

    Returns:
        A list of course summary dicts (see _course_summary for the keys).
        A course missing a mandatory key is reported and skipped rather than
        aborting the whole date.
    """
    courses = []
    for reunion in reunions:
        for course in reunion.get('courses') or []:
            nb_partants = course.get('nombreDeclaresPartants') or 0
            if nb_partants < min_partants:
                continue
            try:
                courses.append(_course_summary(reunion, course, nb_partants))
            except (KeyError, TypeError) as e:
                print(f" Skipping malformed course: {e!r}")
    return courses


def get_participants(date_pmu, num_r, num_c):
    """Get participants for a course.

    Args:
        date_pmu: Date formatted as 'DDMMYYYY'.
        num_r: Reunion number.
        num_c: Course number within the reunion.

    Returns:
        The list found under 'participants', or [] on any failure.
    """
    try:
        payload = _fetch_json(f"{BASE_URL}/{date_pmu}/R{num_r}/C{num_c}/participants")
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching participants: {e}")
        return []
    if not payload:
        return []
    return payload.get('participants') or []


def parse_musique(musique):
    """Parse the musique (form) string.

    Parenthesised numbers are stripped, then up to ten results are read, each
    a number or 'D' optionally followed by one of the letters a/m/p/h/s/c.

    Returns:
        A dict with:
          forme_recente: mean of the first three non-'D' results (99 if none)
          tendance: mean of the last four minus mean of the first four
              non-'D' results, or 0 when fewer than four are available
          nb_disq: number of 'D' entries
          best_pos: smallest non-'D' result (99 if none)
    """
    if not musique:
        return dict(_EMPTY_MUSIQUE_STATS)

    clean = _MUSIQUE_YEAR_RE.sub('', musique)
    # NOTE(review): a literal '0' is parsed as finishing position 0, which
    # makes best_pos/forme_recente look excellent. Presumably '0' marks an
    # unplaced run in this notation - confirm before changing, since existing
    # rows were computed this way.
    positions = [
        99 if pos == 'D' else int(pos)
        for pos, _ in _MUSIQUE_RESULT_RE.findall(clean)[:10]
    ]
    if not positions:
        return dict(_EMPTY_MUSIQUE_STATS)

    nb_disq = positions.count(99)
    positions_clean = [pos for pos in positions if pos != 99]

    recentes = positions_clean[:3]
    forme_recente = sum(recentes) / len(recentes) if recentes else 99
    best_pos = min(positions_clean) if positions_clean else 99

    if len(positions_clean) >= 4:
        debut = sum(positions_clean[-4:]) / 4
        fin = sum(positions_clean[:4]) / 4
        tendance = round(debut - fin, 2)
    else:
        tendance = 0

    return {
        'forme_recente': round(forme_recente, 2),
        'tendance': tendance,
        'nb_disq': nb_disq,
        'best_pos': best_pos,
    }


def _person_name(value):
    """Return a person's name from either a plain string or a {'nom': ...} dict."""
    if isinstance(value, dict):
        return value.get('nom', '') or ''
    if isinstance(value, str):
        return value
    return ''


def _direct_odds(participant, default):
    """Return the participant's direct odds, or *default* when absent/zero."""
    rapport = participant.get('dernierRapportDirect') or {}
    return rapport.get('rapport', default) or default


def extract_features(p, course_info, all_participants):
    """Extract all features from a participant.

    Args:
        p: Participant dict from the participants endpoint.
        course_info: Course summary dict produced by find_all_quintes().
        all_participants: Every participant of the same course; used to rank
            this participant's odds within the field.

    Returns:
        A dict holding one value per entry of HISTORICAL_COLUMNS. 'date' is
        left as None for the caller to fill in.
    """
    musique_stats = parse_musique(p.get('musique', ''))

    # Odds
    rapport_direct = p.get('dernierRapportDirect') or {}
    cote_directe = rapport_direct.get('rapport', 0) or 0
    est_favori = 1 if rapport_direct.get('favoris', False) else 0
    rapport_ref = p.get('dernierRapportReference') or {}
    cote_reference = rapport_ref.get('rapport', 0) or 0

    # Career stats
    nb_courses = p.get('nombreCourses', 0) or 0
    nb_victoires = p.get('nombreVictoires', 0) or 0
    nb_places = p.get('nombrePlaces', 0) or 0
    nb_p2 = p.get('nombrePlacesSecond', 0) or 0
    nb_p3 = p.get('nombrePlacesTroisieme', 0) or 0
    tx_vic = round(nb_victoires / nb_courses * 100, 2) if nb_courses else 0
    tx_place = round(nb_places / nb_courses * 100, 2) if nb_courses else 0

    # Earnings
    gains = p.get('gainsParticipant') or {}
    gains_carriere = gains.get('gainsCarriere', 0) or 0
    gains_annee = gains.get('gainsAnneeEnCours', 0) or 0
    gains_victoires = gains.get('gainsVictoires', 0) or 0

    # Odds rank: runners without odds sort last (999) and are left out of the
    # field average; a participant without odds of its own gets rank 99.
    all_cotes = sorted(_direct_odds(x, 999) for x in all_participants)
    rang_cote = all_cotes.index(cote_directe) + 1 if cote_directe in all_cotes else 99
    cotes_valides = [c for c in all_cotes if c < 900]
    moy_cote = sum(cotes_valides) / len(cotes_valides) if cotes_valides else 1
    ratio_cote = round(cote_directe / moy_cote, 3) if moy_cote else 0

    # Result
    ordre = p.get('ordreArrivee', 0) or 0

    # Driver/jockey: accept both a bare name and a {'nom': ...} object so a
    # string-valued field does not fail the whole participant.
    jockey_name = _person_name(p.get('driver')) or _person_name(p.get('jockey'))

    return {
        'date': None,
        'race_name': course_info['libelle'],
        'hippodrome': course_info['hippodrome'],
        'distance': course_info['distance'],
        'discipline': course_info['discipline'],
        'allocation': course_info['allocation'],
        'nb_partants': course_info['nb_partants'],
        'heure': course_info['heure'],
        'horse_name': p.get('nom', ''),
        'horse_number': p.get('numero', 0),
        'driver': jockey_name,
        'age': p.get('age', 0) or 0,
        'sexe': p.get('sexe', 'U'),
        'musique': p.get('musique', ''),
        'nb_courses': nb_courses,
        'nb_victoires': nb_victoires,
        'nb_places': nb_places,
        'nb_places_2': nb_p2,
        'nb_places_3': nb_p3,
        'gains_carriere': gains_carriere,
        'gains_annee': gains_annee,
        'gains_victoires': gains_victoires,
        'reduction_km': p.get('reductionKm', 0) or 0,
        'avis_entraineur': p.get('avisEntraineur', 'NEUTRE') or 'NEUTRE',
        'oeilleres': p.get('oeilleres', '') or 'SANS',
        'deferre': p.get('deferre', '') or 'NON',
        'cote_directe': cote_directe,
        'cote_reference': cote_reference,
        'indicateur_tendance': rapport_direct.get('nombreIndicateurTendance', 0) or 0,
        'est_favori': est_favori,
        'tx_victoire': tx_vic,
        'tx_place': tx_place,
        'forme_recente': musique_stats['forme_recente'],
        'tendance_forme': musique_stats['tendance'],
        'nb_disq': musique_stats['nb_disq'],
        'rang_cote': rang_cote,
        'ratio_cote_field': ratio_cote,
        'ordre_arrivee': ordre,
        'temps_obtenu': p.get('tempsObtenu', 0) or 0,
        'top1': 1 if ordre == 1 else 0,
        'top3': 1 if 1 <= ordre <= 3 else 0,
        'top5': 1 if 1 <= ordre <= 5 else 0,
    }


def _collect_rows(date_str, date_pmu, courses):
    """Fetch participants for each course and return ready-to-insert tuples."""
    rows = []
    for course in courses:
        participants = get_participants(date_pmu, course['num_reunion'], course['num_course'])
        if not participants:
            continue
        for p in participants:
            # Deliberately broad: one malformed participant must not cost the
            # rest of the date. The error is reported, not swallowed.
            try:
                features = extract_features(p, course, participants)
                features['date'] = date_str
                rows.append(tuple(features[col] for col in HISTORICAL_COLUMNS))
            except Exception as e:
                print(f" Error loading participant: {e}")
    return rows


def load_date(date_str):
    """Load all course data for a specific date.

    Args:
        date_str: Date formatted as 'YYYY-MM-DD'.

    Returns:
        Number of rows actually inserted (rows ignored by INSERT OR IGNORE
        are not counted).

    Raises:
        ValueError: if *date_str* is not a valid 'YYYY-MM-DD' date.
    """
    date_pmu = datetime.strptime(date_str, '%Y-%m-%d').strftime('%d%m%Y')

    reunions = get_programme(date_pmu)
    if not reunions:
        return 0
    courses = find_all_quintes(reunions)
    if not courses:
        return 0

    # All network work happens before the database is opened, so the write
    # transaction is short and never held across an HTTP request.
    rows = _collect_rows(date_str, date_pmu, courses)
    if not rows:
        return 0

    total_loaded = 0
    # closing() guarantees the connection is released even if commit fails;
    # sqlite3's own context manager only manages the transaction.
    with closing(get_db_connection()) as conn:
        cursor = conn.cursor()
        for row in rows:
            try:
                cursor.execute(INSERT_SQL, row)
            except sqlite3.Error as e:
                print(f" Error loading participant: {e}")
                continue
            if cursor.rowcount > 0:
                total_loaded += 1
        conn.commit()
    return total_loaded


def main(max_dates=MAX_DATES_PER_RUN):
    """Load up to *max_dates* missing dates, then print table statistics."""
    print(f"\n{'='*60}")
    print("Improved Historical Data Loader")
    print(f"{'='*60}\n")

    # Get missing dates
    missing = get_missing_dates()
    print(f"Found {len(missing)} missing dates")

    if not missing:
        print("No missing dates to load!")
        return

    # Load missing dates (limit to avoid timeout)
    # NOTE(review): a date that yields no rows stays "missing" and is retried
    # on every run, so such dates can permanently occupy the head of the batch.
    dates_to_load = missing[:max_dates]
    print(f"Loading {len(dates_to_load)} dates...\n")

    total = 0
    last_index = len(dates_to_load)
    for i, date_str in enumerate(dates_to_load, start=1):
        # flush so the progress line is visible while the load is running
        print(f"[{i}/{len(dates_to_load)}] Loading {date_str}...", end=" ", flush=True)
        loaded = load_date(date_str)
        print(f"✓ {loaded} rows")
        total += loaded

        # Rate limiting
        if i < last_index:
            time.sleep(RATE_LIMIT_SECONDS)

    print(f"\n{'='*60}")
    print(f"Total loaded: {total} rows")

    # Show updated stats
    with closing(get_db_connection()) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*), COUNT(DISTINCT date) FROM historical_data")
        count, days = cursor.fetchone()
        cursor.execute("SELECT MIN(date), MAX(date) FROM historical_data")
        min_date, max_date = cursor.fetchone()

    print(f"Total in DB: {count} rows, {days} days")
    print(f"Date range: {min_date} to {max_date}")
    print(f"{'='*60}\n")


if __name__ == "__main__":
    main()