Files
turf_saas/improve_historical_data.py
2026-04-25 17:18:43 +02:00

362 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Improved Historical Data Loader
- Fills gaps in historical data
- Adds more features for better ML
- Supports bulk loading for more training data
"""
import requests
import sqlite3
import json
import time
import os
import re
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys
DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db")
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'Accept': 'application/json'}
BASE_URL = "https://turfinfo.api.pmu.fr/rest/client/1/programme"
def get_db_connection():
return sqlite3.connect(DB_PATH)
def get_missing_dates():
"""Find dates missing from historical_data."""
conn = get_db_connection()
c = conn.cursor()
c.execute("SELECT DISTINCT date FROM historical_data")
existing = set(row[0] for row in c.fetchall())
conn.close()
start_date = datetime(2025, 1, 1)
end_date = datetime(2026, 12, 31)
all_dates = []
current = start_date
while current <= end_date:
date_str = current.strftime('%Y-%m-%d')
if date_str not in existing:
all_dates.append(date_str)
current += timedelta(days=1)
return all_dates
def get_programme(date_pmu):
"""Get program for a given date."""
try:
url = f"{BASE_URL}/{date_pmu}/reunions"
r = requests.get(url, headers=HEADERS, timeout=15)
if r.status_code != 200:
return []
return r.json().get('programme', {}).get('reunions', [])
except Exception as e:
print(f"Error fetching {date_pmu}: {e}")
return []
def find_all_quintes(reunions):
"""Find all courses (not just Quinte+) - more training data."""
courses = []
for reunion in reunions:
for course in reunion.get('courses', []):
paris = [p['typePari'] for p in course.get('paris', [])]
libelle = course.get('libelle', '')
# Skip if no participants
nb_partants = course.get('nombreDeclaresPartants', 0)
if nb_partants < 10:
continue
heure_ts = course.get('heureDepart', 0)
heure = datetime.fromtimestamp(heure_ts/1000).strftime('%H:%M') if heure_ts else '13:55'
courses.append({
'num_reunion': reunion['numOfficiel'],
'num_course': course['numOrdre'],
'libelle': libelle,
'hippodrome': reunion['hippodrome']['libelleCourt'],
'distance': course.get('distance', 0),
'discipline': course.get('discipline', ''),
'allocation': course.get('montantPrix', 0),
'nb_partants': nb_partants,
'heure': heure,
'arrivee_def': course.get('arriveeDefinitive', False),
})
return courses
def get_participants(date_pmu, num_r, num_c):
"""Get participants for a course."""
try:
url = f"{BASE_URL}/{date_pmu}/R{num_r}/C{num_c}/participants"
r = requests.get(url, headers=HEADERS, timeout=15)
if r.status_code != 200:
return []
return r.json().get('participants', [])
except Exception as e:
print(f"Error fetching participants: {e}")
return []
def parse_musique(musique):
"""Parse the musique (form) string."""
if not musique:
return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0, 'best_pos': 99}
clean = re.sub(r'\(\d+\)', '', musique)
resultats = re.findall(r'(\d+|D|0)([amphsc]?)', clean)
positions = []
for pos, disc in resultats[:10]:
if pos == 'D':
positions.append(99)
else:
positions.append(int(pos))
if not positions:
return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0, 'best_pos': 99}
nb_disq = positions.count(99)
positions_clean = [p for p in positions if p != 99]
recentes = positions_clean[:3]
forme_recente = sum(recentes) / len(recentes) if recentes else 99
best_pos = min(positions_clean) if positions_clean else 99
if len(positions_clean) >= 4:
debut = sum(positions_clean[-4:]) / 4
fin = sum(positions_clean[:4]) / 4
tendance = round(debut - fin, 2)
else:
tendance = 0
return {
'forme_recente': round(forme_recente, 2),
'tendance': tendance,
'nb_disq': nb_disq,
'best_pos': best_pos
}
def extract_features(p, course_info, all_participants):
"""Extract all features from a participant."""
musique_stats = parse_musique(p.get('musique', ''))
# Odds
rapport_direct = p.get('dernierRapportDirect', {}) or {}
cote_directe = rapport_direct.get('rapport', 0) or 0
est_favori = 1 if rapport_direct.get('favoris', False) else 0
rapport_ref = p.get('dernierRapportReference', {}) or {}
cote_reference = rapport_ref.get('rapport', 0) or 0
# Career stats
nb_courses = p.get('nombreCourses', 0) or 0
nb_victoires = p.get('nombreVictoires', 0) or 0
nb_places = p.get('nombrePlaces', 0) or 0
nb_p2 = p.get('nombrePlacesSecond', 0) or 0
nb_p3 = p.get('nombrePlacesTroisieme', 0) or 0
tx_vic = round(nb_victoires / nb_courses * 100, 2) if nb_courses else 0
tx_place = round(nb_places / nb_courses * 100, 2) if nb_courses else 0
# Earnings
gains = p.get('gainsParticipant', {}) or {}
gains_carriere = gains.get('gainsCarriere', 0) or 0
gains_annee = gains.get('gainsAnneeEnCours', 0) or 0
gains_victoires = gains.get('gainsVictoires', 0) or 0
# Odds rank
all_cotes = sorted([
(x.get('dernierRapportDirect', {}) or {}).get('rapport', 999) or 999
for x in all_participants
])
rang_cote = all_cotes.index(cote_directe) + 1 if cote_directe in all_cotes else 99
cotes_valides = [c for c in all_cotes if c < 900]
moy_cote = sum(cotes_valides) / len(cotes_valides) if cotes_valides else 1
ratio_cote = round(cote_directe / moy_cote, 3) if moy_cote else 0
# Result
ordre = p.get('ordreArrivee', 0) or 0
top1 = 1 if ordre == 1 else 0
top3 = 1 if 1 <= ordre <= 3 else 0
top5 = 1 if 1 <= ordre <= 5 else 0
# Driver/jockey
driver = p.get('driver', {}) or {}
jockey_name = driver.get('nom', '') if driver else ''
if not jockey_name:
jockey_name = p.get('jockey', {}).get('nom', '') if p.get('jockey') else ''
# Equipment
oeilleres = p.get('oeilleres', '')
deferre = p.get('deferre', '')
return {
'date': None,
'race_name': course_info['libelle'],
'hippodrome': course_info['hippodrome'],
'distance': course_info['distance'],
'discipline': course_info['discipline'],
'allocation': course_info['allocation'],
'nb_partants': course_info['nb_partants'],
'heure': course_info['heure'],
'horse_name': p.get('nom', ''),
'horse_number': p.get('numero', 0),
'driver': jockey_name,
'age': p.get('age', 0) or 0,
'sexe': p.get('sexe', 'U'),
'musique': p.get('musique', ''),
'nb_courses': nb_courses,
'nb_victoires': nb_victoires,
'nb_places': nb_places,
'nb_places_2': nb_p2,
'nb_places_3': nb_p3,
'gains_carriere': gains_carriere,
'gains_annee': gains_annee,
'gains_victoires': gains_victoires,
'reduction_km': p.get('reductionKm', 0) or 0,
'avis_entraineur': p.get('avisEntraineur', 'NEUTRE') or 'NEUTRE',
'oeilleres': oeilleres or 'SANS',
'deferre': deferre or 'NON',
'cote_directe': cote_directe,
'cote_reference': cote_reference,
'indicateur_tendance': rapport_direct.get('nombreIndicateurTendance', 0) or 0,
'est_favori': est_favori,
'tx_victoire': tx_vic,
'tx_place': tx_place,
'forme_recente': musique_stats['forme_recente'],
'tendance_forme': musique_stats['tendance'],
'nb_disq': musique_stats['nb_disq'],
'rang_cote': rang_cote,
'ratio_cote_field': ratio_cote,
'ordre_arrivee': ordre,
'temps_obtenu': p.get('tempsObtenu', 0) or 0,
'top1': top1,
'top3': top3,
'top5': top5,
}
def load_date(date_str):
"""Load all course data for a specific date."""
date_pmu = datetime.strptime(date_str, '%Y-%m-%d').strftime('%d%m%Y')
reunions = get_programme(date_pmu)
if not reunions:
return 0
courses = find_all_quintes(reunions)
if not courses:
return 0
total_loaded = 0
conn = get_db_connection()
c = conn.cursor()
for course in courses:
participants = get_participants(date_pmu, course['num_reunion'], course['num_course'])
if not participants:
continue
for p in participants:
try:
features = extract_features(p, course, participants)
features['date'] = date_str
c.execute('''
INSERT OR IGNORE INTO historical_data
(date, race_name, hippodrome, distance, discipline, allocation, nb_partants, heure,
horse_name, horse_number, driver, age, sexe, musique, nb_courses, nb_victoires,
nb_places, nb_places_2, nb_places_3, gains_carriere, gains_annee, gains_victoires,
reduction_km, avis_entraineur, oeilleres, deferre, cote_directe, cote_reference,
indicateur_tendance, est_favori, tx_victoire, tx_place, forme_recente, tendance_forme,
nb_disq, rang_cote, ratio_cote_field, ordre_arrivee, temps_obtenu, top1, top3, top5)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
features['date'], features['race_name'], features['hippodrome'], features['distance'],
features['discipline'], features['allocation'], features['nb_partants'], features['heure'],
features['horse_name'], features['horse_number'], features['driver'], features['age'],
features['sexe'], features['musique'], features['nb_courses'], features['nb_victoires'],
features['nb_places'], features['nb_places_2'], features['nb_places_3'],
features['gains_carriere'], features['gains_annee'], features['gains_victoires'],
features['reduction_km'], features['avis_entraineur'], features['oeilleres'],
features['deferre'], features['cote_directe'], features['cote_reference'],
features['indicateur_tendance'], features['est_favori'], features['tx_victoire'],
features['tx_place'], features['forme_recente'], features['tendance_forme'],
features['nb_disq'], features['rang_cote'], features['ratio_cote_field'],
features['ordre_arrivee'], features['temps_obtenu'], features['top1'],
features['top3'], features['top5']
))
if c.rowcount > 0:
total_loaded += 1
except Exception as e:
print(f" Error loading participant: {e}")
conn.commit()
conn.close()
return total_loaded
def main():
print(f"\n{'='*60}")
print("Improved Historical Data Loader")
print(f"{'='*60}\n")
# Get missing dates
missing = get_missing_dates()
print(f"Found {len(missing)} missing dates")
if not missing:
print("No missing dates to load!")
return
# Load missing dates (limit to avoid timeout)
dates_to_load = missing[:30] # Load max 30 dates at once
print(f"Loading {len(dates_to_load)} dates...\n")
total = 0
for i, date in enumerate(dates_to_load):
print(f"[{i+1}/{len(dates_to_load)}] Loading {date}...", end=" ")
loaded = load_date(date)
print(f"{loaded} rows")
total += loaded
# Rate limiting
if i < len(dates_to_load) - 1:
time.sleep(0.5)
print(f"\n{'='*60}")
print(f"Total loaded: {total} rows")
# Show updated stats
conn = get_db_connection()
c = conn.cursor()
c.execute("SELECT COUNT(*), COUNT(DISTINCT date) FROM historical_data")
count, days = c.fetchone()
c.execute("SELECT MIN(date), MAX(date) FROM historical_data")
min_date, max_date = c.fetchone()
conn.close()
print(f"Total in DB: {count} rows, {days} days")
print(f"Date range: {min_date} to {max_date}")
print(f"{'='*60}\n")
if __name__ == "__main__":
main()