turf_saas/improve_historical_data.py

#!/usr/bin/env python3
"""
Improved Historical Data Loader
- Fills gaps in historical data
- Adds more features for better ML
- Supports bulk loading for more training data
"""

import requests
import sqlite3
import json
import time
import os
import re
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys

DB_PATH = os.environ.get("DB_PATH", "/home/h3r7/turf_scraper/turf.db")
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'Accept': 'application/json'}
BASE_URL = "https://turfinfo.api.pmu.fr/rest/client/1/programme"


def get_db_connection():
    return sqlite3.connect(DB_PATH)


def get_missing_dates():
    """Find dates missing from historical_data."""
    conn = get_db_connection()
    c = conn.cursor()

    c.execute("SELECT DISTINCT date FROM historical_data")
    existing = set(row[0] for row in c.fetchall())
    conn.close()

    start_date = datetime(2025, 1, 1)
    end_date = datetime(2026, 12, 31)

    all_dates = []
    current = start_date
    while current <= end_date:
        date_str = current.strftime('%Y-%m-%d')
        if date_str not in existing:
            all_dates.append(date_str)
        current += timedelta(days=1)

    return all_dates


def get_programme(date_pmu):
    """Get program for a given date."""
    try:
        url = f"{BASE_URL}/{date_pmu}/reunions"
        r = requests.get(url, headers=HEADERS, timeout=15)
        if r.status_code != 200:
            return []
        return r.json().get('programme', {}).get('reunions', [])
    except Exception as e:
        print(f"Error fetching {date_pmu}: {e}")
        return []


def find_all_quintes(reunions):
    """Find all courses (not just Quinte+) - more training data."""
    courses = []
    for reunion in reunions:
        for course in reunion.get('courses', []):
            paris = [p['typePari'] for p in course.get('paris', [])]
            libelle = course.get('libelle', '')

            # Skip if no participants
            nb_partants = course.get('nombreDeclaresPartants', 0)
            if nb_partants < 10:
                continue

            heure_ts = course.get('heureDepart', 0)
            heure = datetime.fromtimestamp(heure_ts/1000).strftime('%H:%M') if heure_ts else '13:55'

            courses.append({
                'num_reunion': reunion['numOfficiel'],
                'num_course': course['numOrdre'],
                'libelle': libelle,
                'hippodrome': reunion['hippodrome']['libelleCourt'],
                'distance': course.get('distance', 0),
                'discipline': course.get('discipline', ''),
                'allocation': course.get('montantPrix', 0),
                'nb_partants': nb_partants,
                'heure': heure,
                'arrivee_def': course.get('arriveeDefinitive', False),
            })
    return courses


def get_participants(date_pmu, num_r, num_c):
    """Get participants for a course."""
    try:
        url = f"{BASE_URL}/{date_pmu}/R{num_r}/C{num_c}/participants"
        r = requests.get(url, headers=HEADERS, timeout=15)
        if r.status_code != 200:
            return []
        return r.json().get('participants', [])
    except Exception as e:
        print(f"Error fetching participants: {e}")
        return []


def parse_musique(musique):
    """Parse the musique (form) string."""
    if not musique:
        return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0, 'best_pos': 99}

    clean = re.sub(r'\(\d+\)', '', musique)
    resultats = re.findall(r'(\d+|D|0)([amphsc]?)', clean)

    positions = []
    for pos, disc in resultats[:10]:
        if pos == 'D':
            positions.append(99)
        else:
            positions.append(int(pos))

    if not positions:
        return {'forme_recente': 99, 'tendance': 0, 'nb_disq': 0, 'best_pos': 99}

    nb_disq = positions.count(99)
    positions_clean = [p for p in positions if p != 99]

    recentes = positions_clean[:3]
    forme_recente = sum(recentes) / len(recentes) if recentes else 99

    best_pos = min(positions_clean) if positions_clean else 99

    if len(positions_clean) >= 4:
        debut = sum(positions_clean[-4:]) / 4
        fin = sum(positions_clean[:4]) / 4
        tendance = round(debut - fin, 2)
    else:
        tendance = 0

    return {
        'forme_recente': round(forme_recente, 2),
        'tendance': tendance,
        'nb_disq': nb_disq,
        'best_pos': best_pos
    }


def extract_features(p, course_info, all_participants):
    """Extract all features from a participant."""
    musique_stats = parse_musique(p.get('musique', ''))

    # Odds
    rapport_direct = p.get('dernierRapportDirect', {}) or {}
    cote_directe = rapport_direct.get('rapport', 0) or 0
    est_favori = 1 if rapport_direct.get('favoris', False) else 0

    rapport_ref = p.get('dernierRapportReference', {}) or {}
    cote_reference = rapport_ref.get('rapport', 0) or 0

    # Career stats
    nb_courses = p.get('nombreCourses', 0) or 0
    nb_victoires = p.get('nombreVictoires', 0) or 0
    nb_places = p.get('nombrePlaces', 0) or 0
    nb_p2 = p.get('nombrePlacesSecond', 0) or 0
    nb_p3 = p.get('nombrePlacesTroisieme', 0) or 0

    tx_vic = round(nb_victoires / nb_courses * 100, 2) if nb_courses else 0
    tx_place = round(nb_places / nb_courses * 100, 2) if nb_courses else 0

    # Earnings
    gains = p.get('gainsParticipant', {}) or {}
    gains_carriere = gains.get('gainsCarriere', 0) or 0
    gains_annee = gains.get('gainsAnneeEnCours', 0) or 0
    gains_victoires = gains.get('gainsVictoires', 0) or 0

    # Odds rank
    all_cotes = sorted([
        (x.get('dernierRapportDirect', {}) or {}).get('rapport', 999) or 999
        for x in all_participants
    ])
    rang_cote = all_cotes.index(cote_directe) + 1 if cote_directe in all_cotes else 99

    cotes_valides = [c for c in all_cotes if c < 900]
    moy_cote = sum(cotes_valides) / len(cotes_valides) if cotes_valides else 1
    ratio_cote = round(cote_directe / moy_cote, 3) if moy_cote else 0

    # Result
    ordre = p.get('ordreArrivee', 0) or 0
    top1 = 1 if ordre == 1 else 0
    top3 = 1 if 1 <= ordre <= 3 else 0
    top5 = 1 if 1 <= ordre <= 5 else 0

    # Driver/jockey
    driver = p.get('driver', {}) or {}
    jockey_name = driver.get('nom', '') if driver else ''
    if not jockey_name:
        jockey_name = p.get('jockey', {}).get('nom', '') if p.get('jockey') else ''

    # Equipment
    oeilleres = p.get('oeilleres', '')
    deferre = p.get('deferre', '')

    return {
        'date': None,
        'race_name': course_info['libelle'],
        'hippodrome': course_info['hippodrome'],
        'distance': course_info['distance'],
        'discipline': course_info['discipline'],
        'allocation': course_info['allocation'],
        'nb_partants': course_info['nb_partants'],
        'heure': course_info['heure'],
        'horse_name': p.get('nom', ''),
        'horse_number': p.get('numero', 0),
        'driver': jockey_name,
        'age': p.get('age', 0) or 0,
        'sexe': p.get('sexe', 'U'),
        'musique': p.get('musique', ''),
        'nb_courses': nb_courses,
        'nb_victoires': nb_victoires,
        'nb_places': nb_places,
        'nb_places_2': nb_p2,
        'nb_places_3': nb_p3,
        'gains_carriere': gains_carriere,
        'gains_annee': gains_annee,
        'gains_victoires': gains_victoires,
        'reduction_km': p.get('reductionKm', 0) or 0,
        'avis_entraineur': p.get('avisEntraineur', 'NEUTRE') or 'NEUTRE',
        'oeilleres': oeilleres or 'SANS',
        'deferre': deferre or 'NON',
        'cote_directe': cote_directe,
        'cote_reference': cote_reference,
        'indicateur_tendance': rapport_direct.get('nombreIndicateurTendance', 0) or 0,
        'est_favori': est_favori,
        'tx_victoire': tx_vic,
        'tx_place': tx_place,
        'forme_recente': musique_stats['forme_recente'],
        'tendance_forme': musique_stats['tendance'],
        'nb_disq': musique_stats['nb_disq'],
        'rang_cote': rang_cote,
        'ratio_cote_field': ratio_cote,
        'ordre_arrivee': ordre,
        'temps_obtenu': p.get('tempsObtenu', 0) or 0,
        'top1': top1,
        'top3': top3,
        'top5': top5,
    }


def load_date(date_str):
    """Load all course data for a specific date."""
    date_pmu = datetime.strptime(date_str, '%Y-%m-%d').strftime('%d%m%Y')

    reunions = get_programme(date_pmu)
    if not reunions:
        return 0

    courses = find_all_quintes(reunions)
    if not courses:
        return 0

    total_loaded = 0
    conn = get_db_connection()
    c = conn.cursor()

    for course in courses:
        participants = get_participants(date_pmu, course['num_reunion'], course['num_course'])
        if not participants:
            continue

        for p in participants:
            try:
                features = extract_features(p, course, participants)
                features['date'] = date_str

                c.execute('''
                    INSERT OR IGNORE INTO historical_data
                    (date, race_name, hippodrome, distance, discipline, allocation, nb_partants, heure,
                     horse_name, horse_number, driver, age, sexe, musique, nb_courses, nb_victoires,
                     nb_places, nb_places_2, nb_places_3, gains_carriere, gains_annee, gains_victoires,
                     reduction_km, avis_entraineur, oeilleres, deferre, cote_directe, cote_reference,
                     indicateur_tendance, est_favori, tx_victoire, tx_place, forme_recente, tendance_forme,
                     nb_disq, rang_cote, ratio_cote_field, ordre_arrivee, temps_obtenu, top1, top3, top5)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                            ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    features['date'], features['race_name'], features['hippodrome'], features['distance'],
                    features['discipline'], features['allocation'], features['nb_partants'], features['heure'],
                    features['horse_name'], features['horse_number'], features['driver'], features['age'],
                    features['sexe'], features['musique'], features['nb_courses'], features['nb_victoires'],
                    features['nb_places'], features['nb_places_2'], features['nb_places_3'],
                    features['gains_carriere'], features['gains_annee'], features['gains_victoires'],
                    features['reduction_km'], features['avis_entraineur'], features['oeilleres'],
                    features['deferre'], features['cote_directe'], features['cote_reference'],
                    features['indicateur_tendance'], features['est_favori'], features['tx_victoire'],
                    features['tx_place'], features['forme_recente'], features['tendance_forme'],
                    features['nb_disq'], features['rang_cote'], features['ratio_cote_field'],
                    features['ordre_arrivee'], features['temps_obtenu'], features['top1'],
                    features['top3'], features['top5']
                ))

                if c.rowcount > 0:
                    total_loaded += 1

            except Exception as e:
                print(f"  Error loading participant: {e}")

    conn.commit()
    conn.close()

    return total_loaded


def main():
    print(f"\n{'='*60}")
    print("Improved Historical Data Loader")
    print(f"{'='*60}\n")

    # Get missing dates
    missing = get_missing_dates()
    print(f"Found {len(missing)} missing dates")

    if not missing:
        print("No missing dates to load!")
        return

    # Load missing dates (limit to avoid timeout)
    dates_to_load = missing[:30]  # Load max 30 dates at once

    print(f"Loading {len(dates_to_load)} dates...\n")

    total = 0
    for i, date in enumerate(dates_to_load):
        print(f"[{i+1}/{len(dates_to_load)}] Loading {date}...", end=" ")
        loaded = load_date(date)
        print(f"✓ {loaded} rows")
        total += loaded

        # Rate limiting
        if i < len(dates_to_load) - 1:
            time.sleep(0.5)

    print(f"\n{'='*60}")
    print(f"Total loaded: {total} rows")

    # Show updated stats
    conn = get_db_connection()
    c = conn.cursor()
    c.execute("SELECT COUNT(*), COUNT(DISTINCT date) FROM historical_data")
    count, days = c.fetchone()
    c.execute("SELECT MIN(date), MAX(date) FROM historical_data")
    min_date, max_date = c.fetchone()
    conn.close()

    print(f"Total in DB: {count} rows, {days} days")
    print(f"Date range: {min_date} to {max_date}")
    print(f"{'='*60}\n")


if __name__ == "__main__":
    main()