#!/usr/bin/env python3
"""
Turf Scraper v5 - REALTIME DATABASE SAVING

Fetches a fixed list of horse-racing pages in parallel, parses the canalturf
pages (runners + forecast + selections) and the boturfers "quinte-du-jour"
page (race info), stores the parsed data in a SQLite database and dumps the
raw page texts to a timestamped JSON file.
"""

import json
import os
import re
import sqlite3
import threading
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import closing
from datetime import datetime

import requests
from bs4 import BeautifulSoup

DB_PATH = "/home/h3r7/turf_saas/turf_saas.db"

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
}

# Progress counter shared by the fetch worker threads; guarded by `lock`.
lock = threading.Lock()
counter = {"total": 0, "done": 0}

# Fallback values used when the Quinté+ page could not be parsed.
_DEFAULT_RACE_NAME = "Quinté+"
_DEFAULT_RACE_TIME = "13:55"

# Schema of the race_meta table; used by init_db() and save_race_meta().
_RACE_META_DDL = '''
    CREATE TABLE IF NOT EXISTS race_meta (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        date TEXT NOT NULL,
        race_name TEXT,
        race_hippodrome TEXT,
        race_time TEXT,
        race_timestamp INTEGER,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
'''

# Single INSERT shared by every writer of the predictions table.
# NOTE(review): the predictions schema created in init_db() declares no UNIQUE
# constraint besides the autoincrement primary key, so "OR IGNORE" only
# suppresses duplicates if a UNIQUE index is created outside this file.
_INSERT_PREDICTION_SQL = '''
    INSERT OR IGNORE INTO predictions
        (date, race_name, race_hippodrome, race_time, horse_number, horse_name,
         odds, prediction_rank, source, jockey, odds_time)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
'''

# Patterns used by the canalturf parsers.
_RACE_NAME_RE = re.compile(r'^PRIX\s+[A-Z]')
_RUNNER_NUMBER_RE = re.compile(r'^\d{1,2}$')
_ODDS_LINE_RE = re.compile(r'[\d\.]+/\d')
_PRONO_HORSE_RE = re.compile(r'(\d{1,2})\s+([A-Z][A-Z\s\-\']+?)\s*\(')
_SELECTION_RE = re.compile(
    r'C(\d+)\s*[-–]\s*(PRIX[^(]+)\((\d{1,2}:\d{2})\)\s*'
    r'(\d{1,2})\s*[-–]\s*([A-Z][A-Z\s\'\-]+?)\s*\(([^)]+)\)'
)


# ============== DATABASE FUNCTIONS ==============

def init_db():
    """Create the tables used by the scraper if they do not exist yet.

    Also adds the ``jockey``, ``odds_time`` and ``odds_prev`` columns to an
    already existing ``predictions`` table (simple in-place migration).
    """
    with closing(sqlite3.connect(DB_PATH)) as conn:
        c = conn.cursor()

        c.execute('''
            CREATE TABLE IF NOT EXISTS predictions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT NOT NULL,
                race_name TEXT,
                race_hippodrome TEXT,
                race_time TEXT,
                horse_number INTEGER,
                horse_name TEXT,
                odds REAL,
                prediction_rank INTEGER,
                source TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                jockey TEXT,
                odds_time TEXT,
                odds_prev REAL
            )
        ''')

        # Migration: add the columns when the table predates them.
        for col, coltype in [("jockey", "TEXT"), ("odds_time", "TEXT"), ("odds_prev", "REAL")]:
            try:
                c.execute(f"ALTER TABLE predictions ADD COLUMN {col} {coltype}")
            except sqlite3.OperationalError:
                pass  # Column already present

        c.execute('''
            CREATE TABLE IF NOT EXISTS results (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT NOT NULL,
                race_name TEXT,
                race_hippodrome TEXT,
                position INTEGER,
                horse_name TEXT,
                odds REAL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        c.execute('''
            CREATE TABLE IF NOT EXISTS performance (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                prediction_date TEXT,
                race_date TEXT,
                horse_name TEXT,
                predicted_rank INTEGER,
                actual_position INTEGER,
                hit BOOLEAN,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        # odds_history: intraday history of the odds
        c.execute('''
            CREATE TABLE IF NOT EXISTS odds_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT NOT NULL,
                race_name TEXT,
                race_hippodrome TEXT,
                horse_number INTEGER,
                horse_name TEXT,
                odds REAL NOT NULL,
                scraped_at TEXT NOT NULL,
                source TEXT DEFAULT 'canalturf'
            )
        ''')

        c.execute(_RACE_META_DDL)
        conn.commit()

    print(f"✅ DB initialized: {DB_PATH}")


def add_prediction(date, race_name, race_hippodrome, race_time, horse_number,
                   horse_name, odds, prediction_rank, source, jockey="",
                   odds_time=None):
    """Insert one row into ``predictions`` (``INSERT OR IGNORE``).

    ``odds_time`` defaults to the current local time in ISO format when it is
    not supplied (or falsy).
    """
    row = (date, race_name, race_hippodrome, race_time, horse_number,
           horse_name, odds, prediction_rank, source, jockey,
           odds_time or datetime.now().isoformat())
    # The connection is opened here: the function has no other source for it.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute(_INSERT_PREDICTION_SQL, row)
        conn.commit()


def add_result(date, race_name, race_hippodrome, position, horse_name, odds):
    """Insert one row into ``results``."""
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute('''
            INSERT INTO results
                (date, race_name, race_hippodrome, position, horse_name, odds)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (date, race_name, race_hippodrome, position, horse_name, odds))
        conn.commit()


# ============== SCRAPER FUNCTIONS ==============

def fetch_url(args):
    """Download one page and return its visible text.

    ``args`` is a ``(url, site)`` tuple. ``<script>`` and ``<style>`` elements
    are removed and the extracted text is truncated to 8000 characters.

    Returns a dict with ``url``, ``site`` and ``status``; on success it also
    carries ``content``, on failure ``error`` (the exception message). The
    function never raises for download/parsing errors.
    """
    url, site = args
    try:
        r = requests.get(url, headers=HEADERS, timeout=12)
        soup = BeautifulSoup(r.text, 'html.parser')
        for s in soup(["script", "style"]):
            s.decompose()
        text = soup.get_text(separator='\n', strip=True)[:8000]
    except Exception as e:
        # Worker boundary: report the failure in the result instead of raising.
        with lock:
            counter["done"] += 1
        return {'url': url, 'site': site, 'error': str(e), 'status': 'error'}

    with lock:
        counter["done"] += 1
        total = counter["total"]
        # total is 0 when fetch_url is used without main() having set it.
        pct = (counter["done"] / total) * 100 if total else 0.0
        print(f" [{pct:.0f}%] {site}: OK")
    return {'url': url, 'site': site, 'content': text, 'status': 'success'}


# ============== ROBUST PARSERS ==============

def _parse_partants(lines):
    """Extract runners from stripped, non-empty lines.

    A runner is a group of consecutive lines: number (1-20) / horse name /
    jockey, optionally followed by an odds line such as ``7.5/1``. The horse
    name must contain at least three consecutive capitals and the jockey line
    at least one capital. Each number is accepted once.
    """
    partants = []
    seen_nums = set()
    total = len(lines)
    i = 0
    while i < total:
        if _RUNNER_NUMBER_RE.match(lines[i]) and i + 2 < total:
            num = int(lines[i])
            nom_cheval = lines[i + 1]
            jockey = lines[i + 2]
            if (1 <= num <= 20 and num not in seen_nums
                    and re.search(r'[A-Z]{3,}', nom_cheval)
                    and re.search(r'[A-Z]', jockey)):
                cote = None
                consumed = 3
                if i + 3 < total and _ODDS_LINE_RE.match(lines[i + 3]):
                    consumed = 4
                    try:
                        cote = float(lines[i + 3].split('/')[0])
                    except ValueError:
                        pass  # e.g. "1.2.3/1": keep the runner, odds unknown
                seen_nums.add(num)
                partants.append({
                    "numero": num,
                    "cheval": nom_cheval.strip(),
                    "jockey": jockey.strip(),
                    "cote": cote
                })
                i += consumed
                continue
        # Rejected candidates only advance by one line so that a real runner
        # starting right after a stray number is not skipped.
        i += 1
    return partants


def _extract_horses_between(content, start_kw, end_kws):
    """Return the ``N HORSE NAME (`` entries found after ``start_kw``.

    The search area starts at ``start_kw`` and stops at the closest of
    ``end_kws`` found after it (or at the end of ``content``).
    """
    idx_start = content.find(start_kw)
    if idx_start == -1:
        return []
    search_from = idx_start + len(start_kw)
    candidates = (content.find(kw, search_from) for kw in end_kws)
    idx_end = min((pos for pos in candidates if pos != -1), default=len(content))
    return [
        {"numero": int(m.group(1)), "cheval": m.group(2).strip()}
        for m in _PRONO_HORSE_RE.finditer(content[idx_start:idx_end])
    ]


def parse_canalturf_quinte(content):
    """Parse the text of courses_quinte.php.

    Returns a dict with:
      - ``course``: any of ``nom``, ``hippodrome``, ``heure`` (first HH:MM
        found in the text), ``distance`` (int, metres), ``type`` that could
        be found;
      - ``partants``: list of ``{numero, cheval, jockey, cote}``;
      - ``pronostic``: ``{bases, chances, outsiders}``, each a list of
        ``{numero, cheval}``.
    """
    result = {
        "course": {},
        "partants": [],
        "pronostic": {"bases": [], "chances": [], "outsiders": []}
    }
    course = result["course"]

    # Race name: first line starting with "PRIX <capital>"
    for line in (l.strip() for l in content.split('\n')):
        if line and _RACE_NAME_RE.search(line):
            course["nom"] = line
            break

    # Racecourse
    m = re.search(r'hippodrome de\s+([A-Z\-]+)', content, re.IGNORECASE)
    if m:
        course["hippodrome"] = m.group(1).strip()

    # Time
    m = re.search(r'(\d{1,2}:\d{2})', content)
    if m:
        course["heure"] = m.group(1)

    # Distance
    m = re.search(r'(\d{3,4})m', content)
    if m:
        course["distance"] = int(m.group(1))

    # Race type (first keyword of the list present in the text)
    upper_content = content.upper()
    for t in ['TROT ATTELE', 'TROT MONTE', 'PLAT', 'OBSTACLE', 'HAIES', 'STEEPLE']:
        if t in upper_content:
            course["type"] = t
            break

    # Runners: restrict the search to the text between "Liste des partants"
    # and the forecast block, which repeats the same names without odds.
    # When the markers are missing or in the wrong order, use the whole text.
    liste_idx = content.find("Liste des partants")
    prono_idx = content.find("Le pronostic du Quinté+")
    if liste_idx != -1 and prono_idx > liste_idx:
        partants_zone = content[liste_idx:prono_idx]
    else:
        partants_zone = content
    lines_partants = [l.strip() for l in partants_zone.split('\n') if l.strip()]
    result["partants"] = _parse_partants(lines_partants)

    # Forecast: each section runs from its keyword to the next one
    prono = result["pronostic"]
    prono["bases"] = _extract_horses_between(
        content, "Base(s)", ["Chance(s) régulière(s)", "Outsider(s)", "Le cheval"])
    prono["chances"] = _extract_horses_between(
        content, "Chance(s) régulière(s)", ["Outsider(s)", "Le cheval"])
    prono["outsiders"] = _extract_horses_between(
        content, "Outsider(s)", ["Le cheval", "Partants détaillés"])

    return result


def parse_canalturf_selections(content):
    """Parse the text of courses_chevaux_jour.php.

    Returns one dict per match with ``date`` (today), ``race_name``,
    ``race_time``, ``horse_number``, ``horse_name``, ``jockey`` and
    ``cote_pmu`` (0.0 when no "<number> PMU" follows within 100 characters).
    """
    selections = []
    today = datetime.now().strftime('%Y-%m-%d')

    for m in _SELECTION_RE.finditer(content):
        after = content[m.end():m.end() + 100]
        cote_m = re.search(r'(\d+\.?\d*)\s*PMU', after)
        selections.append({
            "date": today,
            "race_name": m.group(2).strip(),
            "race_time": m.group(3),
            "horse_number": int(m.group(4)),
            "horse_name": m.group(5).strip(),
            "jockey": m.group(6).strip(),
            "cote_pmu": float(cote_m.group(1)) if cote_m else 0.0,
        })

    return selections


def parse_boturfers_quinte(content):
    """Parse the text of boturfers.fr/quinte-du-jour.

    Returns a dict that may contain ``nb_partants`` (int), ``temperature``
    (int, °C) and ``probabilites`` (``{"topN": percent}``).
    """
    info = {}

    m = re.search(r'(\d+)\s*partants', content)
    if m:
        info["nb_partants"] = int(m.group(1))

    m = re.search(r'(\d+)°C', content)
    if m:
        info["temperature"] = int(m.group(1))

    probs = re.findall(r'(\d+)%\s*\nen (\d+) cheval', content)
    if probs:
        info["probabilites"] = {f"top{p[1]}": int(p[0]) for p in probs}

    return info


def _course_fields(quinte_data):
    """Return ``(race_name, hippodrome, race_time)`` with their fallbacks."""
    course = quinte_data.get("course", {})
    return (
        course.get("nom", _DEFAULT_RACE_NAME),
        course.get("hippodrome", ""),
        course.get("heure", _DEFAULT_RACE_TIME),
    )


def save_parsed_data(quinte_data, selections, today):
    """Store all parsed data in the ``predictions`` table.

    Rows written:
      1. runners with their odds (rank 0, source ``canalturf_partants``);
      2. forecast horses (bases=1, chances=2, outsiders=3, odds 0);
      3. selections of the other races (rank 0).

    A failing row is reported on stdout and skipped. Returns
    ``(saved, total_today)``: the number of rows inserted by this call and the
    number of ``predictions`` rows whose date is ``today``.
    """
    now = datetime.now().isoformat()
    saved = 0
    race_name, hippodrome, race_time = _course_fields(quinte_data)

    with closing(sqlite3.connect(DB_PATH)) as conn:
        c = conn.cursor()

        # 1. Runners with odds
        for p in quinte_data.get("partants", []):
            try:
                c.execute(_INSERT_PREDICTION_SQL, (
                    today, race_name, hippodrome, race_time,
                    p["numero"], p["cheval"], p.get("cote") or 0, 0,
                    "canalturf_partants", p.get("jockey", ""), now))
                saved += c.rowcount
            except Exception as e:  # best effort: report and keep going
                print(f" ⚠️ Partant {p.get('cheval', '?')}: {e}")

        # 2. Forecast (bases=1, chances=2, outsiders=3); jockey stays NULL
        for category, rank in [("bases", 1), ("chances", 2), ("outsiders", 3)]:
            for horse in quinte_data.get("pronostic", {}).get(category, []):
                try:
                    c.execute(_INSERT_PREDICTION_SQL, (
                        today, race_name, hippodrome, race_time,
                        horse["numero"], horse["cheval"], 0, rank,
                        f"canalturf_prono_{category}", None, now))
                    saved += c.rowcount
                except Exception as e:  # best effort: report and keep going
                    print(f" ⚠️ Prono {horse.get('cheval', '?')}: {e}")

        # 3. Selections of the other races
        # NOTE(review): these rows are stored with the Quinté's hippodrome
        # because the selections parser does not extract one — confirm that
        # this is intended.
        for sel in selections:
            try:
                c.execute(_INSERT_PREDICTION_SQL, (
                    sel["date"], sel["race_name"], hippodrome, sel["race_time"],
                    sel["horse_number"], sel["horse_name"],
                    sel.get("cote_pmu") or 0, 0,
                    "canalturf_selections", sel.get("jockey", ""), now))
                saved += c.rowcount
            except Exception as e:  # best effort: report and keep going
                print(f" ⚠️ Sélection {sel.get('horse_name', '?')}: {e}")

        conn.commit()

        c.execute('SELECT COUNT(*) FROM predictions WHERE date = ?', (today,))
        total_today = c.fetchone()[0]

    return saved, total_today


def save_race_meta(quinte_data, today):
    """Store the race time (HH:MM + Unix timestamp) in ``race_meta``.

    The timestamp is built from ``today`` (``YYYY-MM-DD``) and the race time
    as a naive local datetime; it is stored as NULL when they cannot be
    parsed. Every call inserts a new row.
    """
    race_name, hippodrome, race_time = _course_fields(quinte_data)

    # Convert HH:MM into a timestamp of the given day
    try:
        dt = datetime.strptime(f"{today} {race_time}", "%Y-%m-%d %H:%M")
        ts = int(dt.timestamp())
    except ValueError:
        ts = None

    with closing(sqlite3.connect(DB_PATH)) as conn:
        # This is the only writer of race_meta, so make sure the table exists.
        conn.execute(_RACE_META_DDL)
        conn.execute('''
            INSERT INTO race_meta
                (date, race_name, race_hippodrome, race_time, race_timestamp)
            VALUES (?, ?, ?, ?, ?)
        ''', (today, race_name, hippodrome, race_time, ts))
        conn.commit()

    print(f"🕒 Heure course sauvegardée : {race_time} (ts={ts})")


def save_odds_history(quinte_data, today):
    """Store a snapshot of the current odds in ``odds_history``.

    Called on every run so that the odds can be followed during the day.
    Runners without strictly positive odds are skipped. Returns the number of
    rows inserted.
    """
    now = datetime.now().isoformat()
    race_name, hippodrome, _ = _course_fields(quinte_data)

    rows = []
    for p in quinte_data.get("partants", []):
        cote = p.get("cote")
        if not cote or cote <= 0:
            continue
        rows.append((today, race_name, hippodrome, p["numero"], p["cheval"],
                     cote, now, "canalturf"))

    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.executemany('''
            INSERT INTO odds_history
                (date, race_name, race_hippodrome, horse_number, horse_name,
                 odds, scraped_at, source)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', rows)
        conn.commit()

    # Each plain INSERT adds exactly one row (or raises).
    return len(rows)


def print_odds_evolution(today):
    """Print how the odds moved since the first snapshot of ``today``.

    For each horse the first and the latest snapshot are compared; a change
    beyond +/-5 % is flagged as a rise or a drop. Prints nothing when there is
    no snapshot for that date.
    """
    with closing(sqlite3.connect(DB_PATH)) as conn:
        # All snapshots of the day, oldest first within each horse
        rows = conn.execute('''
            SELECT horse_name, odds, scraped_at
            FROM odds_history
            WHERE date = ?
            ORDER BY horse_name, scraped_at ASC
        ''', (today,)).fetchall()

    if not rows:
        return

    # Group the odds by horse (chronological order is kept)
    odds_by_horse = defaultdict(list)
    for horse, odds, _ts in rows:
        odds_by_horse[horse].append(odds)

    print(f"\n📈 ÉVOLUTION DES COTES — {today}")
    print(f"{'-'*60}")
    print(f" {'CHEVAL':<25} {'MATIN':<8} {'ACTUEL':<8} {'ÉVOL':<8} TENDANCE")
    print(f"{'-'*60}")

    evolutions = []
    for horse, odds_list in odds_by_horse.items():
        cote_debut = odds_list[0]
        cote_actuel = odds_list[-1]
        if cote_debut > 0:
            evol_pct = ((cote_actuel - cote_debut) / cote_debut) * 100
        else:
            evol_pct = 0
        evolutions.append((horse, cote_debut, cote_actuel, evol_pct, len(odds_list)))

    # Sort by current odds
    for horse, debut, actuel, evol, nb in sorted(evolutions, key=lambda x: x[2]):
        if evol < -5:
            tendance = "📉 BAISSE"
        elif evol > 5:
            tendance = "📈 HAUSSE"
        else:
            tendance = "➡️ STABLE"
        evol_str = f"{evol:+.0f}%" if nb > 1 else "1er snap"
        print(f" {horse:<25} {debut:<8} {actuel:<8} {evol_str:<8} {tendance}")

    # Rows are ordered by horse name, so the earliest snapshot is looked up
    # explicitly (ISO timestamps sort chronologically as text).
    first_snapshot = min(ts for _horse, _odds, ts in rows)
    print(f"{'-'*60}")
    print(f" ({len(evolutions)} chevaux, {first_snapshot[:16]} → maintenant)")


# ============== URL LIST ==============

def get_urls(day=None):
    """ALL 7 WORKING SITES

    Returns the list of ``(url, site)`` tuples to fetch. ``day`` is the
    ``YYYY-MM-DD`` date used for the dated equidia page; it defaults to today
    so that the URL does not go stale.
    """
    if day is None:
        day = datetime.now().strftime('%Y-%m-%d')

    sites = {
        'equidia': ['https://www.equidia.fr/courses',
                    f'https://www.equidia.fr/courses/{day}'],
        'zeturf': ['https://www.zeturf.fr/fr/courses-du-jour',
                   'https://www.zeturf.fr/en'],
        'canalturf': ['https://www.canalturf.com/courses_chevaux_jour.php',
                      'https://www.canalturf.com/courses_quinte.php'],
        'boturfers': ['https://www.boturfers.fr',
                      'https://www.boturfers.fr/quinte-du-jour',
                      'https://www.boturfers.fr/quinte-de-demain'],
        'zone-turf': ['https://www.zone-turf.fr',
                      'https://www.zone-turf.fr/programmes/'],
        'genybet': ['https://www.genybet.fr',
                    'https://www.genybet.fr/courses/'],
        'ruedesjoueurs': ['https://www.ruedesjoueurs.com/turf.html',
                          'https://www.ruedesjoueurs.com/turf/pronostics.html']
    }
    return [(url, site) for site, pages in sites.items() for url in pages]


# ============== MAIN ==============

def main():
    """Run one scraping pass: fetch, parse, store, report, dump to JSON."""
    start = time.time()
    print(f"\n{'='*50}")
    print("🐾 TURF SCRAPER v5 - REALTIME SAVING")
    print(f"{'='*50}\n")

    init_db()

    urls = get_urls()
    counter["total"] = len(urls)
    print(f"📡 Fetching {len(urls)} pages...\n")

    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(fetch_url, u): u for u in urls}
        for future in as_completed(futures):
            results.append(future.result())

    elapsed = time.time() - start
    today = datetime.now().strftime('%Y-%m-%d')

    print("\n📊 Parsing predictions...")

    quinte_data = {"course": {}, "partants": [], "pronostic": {}}
    selections = []
    boturfers_info = {}

    for r in results:
        if r['status'] != 'success':
            continue
        site = r['site']
        url = r['url']
        content = r['content']

        if site == 'canalturf':
            if 'quinte' in url:
                quinte_data = parse_canalturf_quinte(content)
                nb_p = len(quinte_data['partants'])
                nb_b = len(quinte_data['pronostic'].get('bases', []))
                print(f" canalturf quinté : {nb_p} partants, {nb_b} base(s) trouvé(s)")
            else:
                selections = parse_canalturf_selections(content)
                print(f" canalturf sélections : {len(selections)} course(s)")
        elif site == 'boturfers' and 'quinte-du-jour' in url:
            boturfers_info = parse_boturfers_quinte(content)
            temp = boturfers_info.get('temperature', '?')
            print(f" boturfers : {boturfers_info.get('nb_partants', '?')} partants, {temp}°C")

    # Store in the database
    saved, total_today = save_parsed_data(quinte_data, selections, today)
    print(f"\n💾 {saved} nouvelles entrées insérées en BDD")

    # Odds snapshot in odds_history
    odds_saved = save_odds_history(quinte_data, today)
    print(f"📊 {odds_saved} cotes sauvegardées dans odds_history")

    # Show how the odds moved
    print_odds_evolution(today)

    # Quinté+ summary
    if quinte_data["partants"]:
        course = quinte_data["course"]
        pronostic = quinte_data['pronostic']
        print(f"\n{'='*55}")
        print(f"🏇 {course.get('nom', 'Quinté+')} — {course.get('hippodrome', '')} {course.get('heure', '')} ({course.get('distance', '')}m)")
        print(f"{'─'*55}")
        print(f" {'N°':<4} {'CHEVAL':<25} {'JOCKEY':<20} COTE")
        print(f"{'─'*55}")
        # Runners without odds are listed last
        for p in sorted(quinte_data["partants"], key=lambda x: x.get("cote") or 999):
            cote_str = str(p['cote']) if p['cote'] else "?"
            print(f" {p['numero']:<4} {p['cheval']:<25} {p['jockey']:<20} {cote_str}")

        bases = [h['cheval'] for h in pronostic.get('bases', [])]
        if bases:
            print(f"\n ⭐ Bases : {', '.join(bases)}")
        chances = [h['cheval'] for h in pronostic.get('chances', [])]
        if chances:
            print(f" 🎯 Chances : {', '.join(chances)}")
        outsiders = [h['cheval'] for h in pronostic.get('outsiders', [])]
        if outsiders:
            print(f" 🔍 Outsiders : {', '.join(outsiders)}")
        print(f"{'='*55}")

    # Per-site stats (sites without any successful page are listed with 0)
    by_site = {}
    for r in results:
        s = r['site']
        by_site[s] = by_site.get(s, 0) + (1 if r['status'] == 'success' else 0)

    print("\n📊 STATS:")
    for site, count in by_site.items():
        print(f" {site}: {count} pages")

    # JSON dump
    out_dir = os.environ.get('TURF_DIR', '/home/h3r7/turf_scraper')
    os.makedirs(out_dir, exist_ok=True)  # do not lose the run on a missing dir
    output = f"{out_dir}/v5_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': datetime.now().isoformat(),
            'runtime_sec': round(elapsed, 2),
            'total_pages': len(urls),
            'pages': results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*50}")
    print(f"✅ DONE! {len(results)} pages in {elapsed:.1f}s")
    print(f"💾 {total_today} prédictions en BDD pour aujourd'hui")
    print(f"📁 {output}")
    print(f"{'='*50}\n")


if __name__ == "__main__":
    main()