#!/usr/bin/env python3
"""
Turf Scraper v5 - REALTIME DATABASE SAVING
Saves predictions immediately as they're scraped.
Robust built-in parsers: canalturf (runners + forecast + selections), boturfers (race info).
"""
|
||
import json
import os
import re
import sqlite3
import threading
import time
import unicodedata
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import closing
from datetime import datetime

import requests
from bs4 import BeautifulSoup
|
||
|
||
# Location of the SQLite database. Can be overridden with the TURF_DB_PATH
# environment variable; the default is the historical hard-coded path.
DB_PATH = os.environ.get("TURF_DB_PATH", "/home/h3r7/turf_scraper/turf.db")

# HTTP headers sent with every request (desktop Chrome user agent, French first).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
}

# Guards `counter` (and progress printing) across the fetch worker threads.
lock = threading.Lock()
# Progress state: number of pages to fetch / pages already processed.
counter = {"total": 0, "done": 0}
|
||
|
||
# ============== DATABASE FUNCTIONS ==============
|
||
|
||
def init_db():
    """Create the SQLite schema at DB_PATH and apply column migrations.

    Creates the ``predictions``, ``results``, ``performance``,
    ``odds_history`` and ``race_meta`` tables when they are missing, then
    adds the ``jockey``, ``odds_time`` and ``odds_prev`` columns to
    ``predictions`` if an existing database lacks them. Safe to call
    repeatedly. The connection is always closed, even on error.
    """
    schema = (
        '''
        CREATE TABLE IF NOT EXISTS predictions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            date TEXT NOT NULL,
            race_name TEXT,
            race_hippodrome TEXT,
            race_time TEXT,
            horse_number INTEGER,
            horse_name TEXT,
            odds REAL,
            prediction_rank INTEGER,
            source TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            jockey TEXT,
            odds_time TEXT,
            odds_prev REAL
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS results (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            date TEXT NOT NULL,
            race_name TEXT,
            race_hippodrome TEXT,
            position INTEGER,
            horse_name TEXT,
            odds REAL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS performance (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            prediction_date TEXT,
            race_date TEXT,
            horse_name TEXT,
            predicted_rank INTEGER,
            actual_position INTEGER,
            hit BOOLEAN,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        ''',
        # odds_history: intraday odds snapshots.
        '''
        CREATE TABLE IF NOT EXISTS odds_history (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            date TEXT NOT NULL,
            race_name TEXT,
            race_hippodrome TEXT,
            horse_number INTEGER,
            horse_name TEXT,
            odds REAL NOT NULL,
            scraped_at TEXT NOT NULL,
            source TEXT DEFAULT 'canalturf'
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS race_meta (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            date TEXT NOT NULL,
            race_name TEXT,
            race_hippodrome TEXT,
            race_time TEXT,
            race_timestamp INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        ''',
    )
    # Columns that an already-existing `predictions` table may lack.
    migrations = (("jockey", "TEXT"), ("odds_time", "TEXT"), ("odds_prev", "REAL"))

    with closing(sqlite3.connect(DB_PATH)) as conn:
        c = conn.cursor()
        for ddl in schema:
            c.execute(ddl)

        # Inspect the live table instead of blindly running ALTER TABLE and
        # swallowing every OperationalError (which would also hide real
        # failures such as a locked database). Column name is field 1.
        existing = {row[1] for row in c.execute("PRAGMA table_info(predictions)")}
        for col, coltype in migrations:
            if col not in existing:
                # Identifiers come from the fixed tuple above, never from input.
                c.execute(f"ALTER TABLE predictions ADD COLUMN {col} {coltype}")

        conn.commit()

    print(f"✅ DB initialized: {DB_PATH}")
|
||
|
||
def add_prediction(date, race_name, race_hippodrome, race_time, horse_number, horse_name,
                   odds, prediction_rank, source, jockey="", odds_time=None):
    """Insert one row into the ``predictions`` table.

    Args:
        date: Race date string stored in the ``date`` column.
        race_name, race_hippodrome, race_time: Race identification fields.
        horse_number, horse_name: Runner identification fields.
        odds: Odds value for the runner.
        prediction_rank: Rank assigned by the prediction source.
        source: Label of the origin of the prediction.
        jockey: Jockey name (empty string by default).
        odds_time: Timestamp of the odds; defaults to the current time in
            ISO format when falsy.

    The statement uses ``INSERT OR IGNORE``, which only skips a row when it
    violates a constraint. NOTE(review): the ``predictions`` schema created
    by ``init_db`` declares no UNIQUE constraint, so duplicates are not
    actually filtered -- confirm whether a unique index is intended.
    """
    stamped_at = odds_time or datetime.now().isoformat()

    # The connection is opened here (the previous code referenced an
    # undefined `conn`) and is closed even when the insert fails.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute('''
            INSERT OR IGNORE INTO predictions
            (date, race_name, race_hippodrome, race_time, horse_number, horse_name, odds, prediction_rank, source, jockey, odds_time)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (date, race_name, race_hippodrome, race_time, horse_number, horse_name,
              odds, prediction_rank, source, jockey, stamped_at))
        conn.commit()
|
||
|
||
def add_result(date, race_name, race_hippodrome, position, horse_name, odds):
    """Insert one finishing position into the ``results`` table.

    Args:
        date: Race date string stored in the ``date`` column.
        race_name, race_hippodrome: Race identification fields.
        position: Finishing position of the horse.
        horse_name: Name of the horse.
        odds: Odds value recorded for the horse.

    Schema creation is left to ``init_db``; this function only writes the
    row. The connection is closed even when the insert fails.
    """
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute('''
            INSERT INTO results (date, race_name, race_hippodrome, position, horse_name, odds)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (date, race_name, race_hippodrome, position, horse_name, odds))
        conn.commit()
|
||
|
||
# ============== SCRAPER FUNCTIONS ==============
|
||
|
||
def fetch_url(args):
    """Download one page and return its visible text.

    Args:
        args: ``(url, site)`` tuple, so the function can be submitted to an
            executor with a single argument.

    Returns:
        ``{'url', 'site', 'content', 'status': 'success'}`` on success, where
        ``content`` is the page text (scripts and styles removed, one text
        node per line) truncated to 8000 characters, or
        ``{'url', 'site', 'error', 'status': 'error'}`` on any failure.
        Never raises: this is the worker boundary.
    """
    url, site = args
    try:
        r = requests.get(url, headers=HEADERS, timeout=12)
        # Treat HTTP 4xx/5xx as failures instead of parsing the error page
        # and reporting it as "OK".
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator='\n', strip=True)[:8000]
        outcome = {'url': url, 'site': site, 'content': text, 'status': 'success'}
    except Exception as e:
        outcome = {'url': url, 'site': site, 'error': str(e), 'status': 'error'}

    # Progress accounting happens exactly once per call and outside the
    # try block: previously a failure while printing (e.g. total == 0)
    # incremented the counter a second time in the except branch.
    with lock:
        counter["done"] += 1
        if outcome['status'] == 'success':
            total = counter["total"]
            pct = (counter["done"] / total) * 100 if total else 0.0
            print(f"  [{pct:.0f}%] {site}: OK")

    return outcome
|
||
|
||
# ============== PARSERS ROBUSTES ==============
|
||
|
||
def _strip_accents(text):
    """Return *text* with combining accents removed (e.g. 'Attelé' -> 'Attele')."""
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def _parse_partants(lines_partants):
    """Extract runners from stripped lines laid out as number / NAME / jockey / [odds]."""
    partants = []
    seen_nums = set()
    total = len(lines_partants)
    i = 0
    while i < total:
        step = 1
        token = lines_partants[i]
        if re.fullmatch(r'\d{1,2}', token) and i + 2 < total:
            num = int(token)
            nom_cheval = lines_partants[i + 1]
            jockey = lines_partants[i + 2]
            # A runner needs a plausible number, a name containing a run of
            # capitals and a jockey line containing at least one capital.
            if (1 <= num <= 20 and num not in seen_nums
                    and re.search(r'[A-Z]{3,}', nom_cheval)
                    and re.search(r'[A-Z]', jockey)):
                cote = None
                step = 3
                if i + 3 < total and re.match(r'[\d\.]+/\d', lines_partants[i + 3]):
                    step = 4
                    try:
                        cote = float(lines_partants[i + 3].split('/')[0])
                    except ValueError:
                        # e.g. "1.2.3/1": keep the runner, leave the odds unknown.
                        cote = None
                seen_nums.add(num)
                partants.append({
                    "numero": num,
                    "cheval": nom_cheval.strip(),
                    "jockey": jockey.strip(),
                    "cote": cote,
                })
        # Lines are only consumed for an accepted runner. A stray number
        # that fails validation advances by one line, so it can no longer
        # swallow the real runner that follows it.
        i += step
    return partants


def _extract_horses_between(content, start_kw, end_kws):
    """Return 'N NAME (' horses found between *start_kw* and the nearest of *end_kws*."""
    idx_start = content.find(start_kw)
    if idx_start == -1:
        return []
    idx_end = len(content)
    for kw in end_kws:
        idx = content.find(kw, idx_start + len(start_kw))
        if idx != -1 and idx < idx_end:
            idx_end = idx
    snippet = content[idx_start:idx_end]
    return [
        {"numero": int(m.group(1)), "cheval": m.group(2).strip()}
        for m in re.finditer(r'(\d{1,2})\s+([A-Z][A-Z\s\-\']+?)\s*\(', snippet)
    ]


def parse_canalturf_quinte(content):
    """Parse the text of canalturf's courses_quinte.php page.

    Args:
        content: Page text with one text node per line.

    Returns:
        A dict with three keys:
        - ``"course"``: any of ``nom``, ``hippodrome``, ``heure``,
          ``distance`` (int) and ``type`` that could be found;
        - ``"partants"``: list of ``{"numero", "cheval", "jockey", "cote"}``
          (``cote`` is a float or ``None``);
        - ``"pronostic"``: ``{"bases", "chances", "outsiders"}``, each a list
          of ``{"numero", "cheval"}``.
    """
    result = {
        "course": {},
        "partants": [],
        "pronostic": {"bases": [], "chances": [], "outsiders": []}
    }
    course = result["course"]
    lines = [l.strip() for l in content.split('\n') if l.strip()]

    # Race name: first line starting with "PRIX <capital>".
    for line in lines:
        if re.search(r'^PRIX\s+[A-Z]', line):
            course["nom"] = line
            break

    # Racecourse: letters (accented ones included) and hyphens.
    m = re.search(r'hippodrome de\s+((?:[^\W\d_]|-)+)', content, re.IGNORECASE)
    if m:
        course["hippodrome"] = m.group(1).strip()

    # Start time: first HH:MM found in the page.
    m = re.search(r'(\d{1,2}:\d{2})', content)
    if m:
        course["heure"] = m.group(1)

    # Distance in metres.
    m = re.search(r'(\d{3,4})m', content)
    if m:
        course["distance"] = int(m.group(1))

    # Race type: compare without accents so "Attelé"/"Monté" match the
    # unaccented keywords; the folded text is computed once.
    folded = _strip_accents(content).upper()
    for t in ['TROT ATTELE', 'TROT MONTE', 'PLAT', 'OBSTACLE', 'HAIES', 'STEEPLE']:
        if t in folded:
            course["type"] = t
            break

    # Runners: restrict the scan to the zone between the runner list and the
    # forecast block, which repeats the same names without odds.
    liste_idx = content.find("Liste des partants")
    prono_idx = content.find("Le pronostic du Quinté+")
    partants_zone = content[liste_idx:prono_idx] if liste_idx != -1 and prono_idx != -1 else content
    lines_partants = [l.strip() for l in partants_zone.split('\n') if l.strip()]
    result["partants"] = _parse_partants(lines_partants)

    # Forecast: each section runs from its keyword to the next one.
    result["pronostic"]["bases"] = _extract_horses_between(
        content, "Base(s)", ["Chance(s) régulière(s)", "Outsider(s)", "Le cheval"])
    result["pronostic"]["chances"] = _extract_horses_between(
        content, "Chance(s) régulière(s)", ["Outsider(s)", "Le cheval"])
    result["pronostic"]["outsiders"] = _extract_horses_between(
        content, "Outsider(s)", ["Le cheval", "Partants détaillés"])

    return result
|
||
|
||
|
||
def parse_canalturf_selections(content):
    """Parse the text of canalturf's courses_chevaux_jour.php page.

    Looks for entries shaped like
    ``C<n> - PRIX ... (HH:MM) <num> - HORSE NAME (jockey)``, optionally
    followed by ``<odds> PMU``.

    Args:
        content: Page text.

    Returns:
        A list of dicts with keys ``date`` (today, YYYY-MM-DD),
        ``race_name``, ``race_time``, ``horse_number``, ``horse_name``,
        ``jockey`` and ``cote_pmu`` (0.0 when no PMU odds follow the entry).
    """
    entry_re = re.compile(
        r'C(\d+)\s*[-–]\s*(PRIX[^(]+)\((\d{1,2}:\d{2})\)\s*'
        r'(\d{1,2})\s*[-–]\s*([A-Z][A-Z\s\'\-]+?)\s*\(([^)]+)\)'
    )
    today = datetime.now().strftime('%Y-%m-%d')
    matches = list(entry_re.finditer(content))

    selections = []
    for pos, m in enumerate(matches):
        # Search the PMU odds in the 100 characters after the entry, but
        # never past the start of the next entry: otherwise a selection
        # without odds would be given the following selection's odds.
        window_end = m.end() + 100
        if pos + 1 < len(matches):
            window_end = min(window_end, matches[pos + 1].start())
        cote_m = re.search(r'(\d+\.?\d*)\s*PMU', content[m.end():window_end])

        selections.append({
            "date": today,
            "race_name": m.group(2).strip(),
            "race_time": m.group(3),
            "horse_number": int(m.group(4)),
            "horse_name": m.group(5).strip(),
            "jockey": m.group(6).strip(),
            "cote_pmu": float(cote_m.group(1)) if cote_m else 0.0,
        })

    return selections
|
||
|
||
|
||
def parse_boturfers_quinte(content):
|
||
"""
|
||
Extrait depuis boturfers.fr/quinte-du-jour :
|
||
Infos course (nb partants, distance, météo, probabilités)
|
||
"""
|
||
info = {}
|
||
|
||
m = re.search(r'(\d+)\s*partants', content)
|
||
if m:
|
||
info["nb_partants"] = int(m.group(1))
|
||
|
||
m = re.search(r'(\d+)°C', content)
|
||
if m:
|
||
info["temperature"] = int(m.group(1))
|
||
|
||
probs = re.findall(r'(\d+)%\s*\nen (\d+) cheval', content)
|
||
if probs:
|
||
info["probabilites"] = {f"top{p[1]}": int(p[0]) for p in probs}
|
||
|
||
return info
|
||
|
||
|
||
def save_parsed_data(quinte_data, selections, today):
    """Persist all parsed data into the ``predictions`` table.

    Three kinds of rows are written:
    1. every Quinté runner with its odds (rank 0, source
       ``canalturf_partants``);
    2. the forecast horses (bases=1, chances=2, outsiders=3, source
       ``canalturf_prono_<category>``, odds 0, no jockey);
    3. the selections for other races (rank 0, source
       ``canalturf_selections``).

    Args:
        quinte_data: Dict with ``course``, ``partants`` and ``pronostic``
            keys, as returned by ``parse_canalturf_quinte``.
        selections: List of dicts as returned by
            ``parse_canalturf_selections``.
        today: Date string used for the Quinté rows and the final count.

    Returns:
        ``(saved, total_today)``: rows inserted by this call and the number
        of ``predictions`` rows whose date equals *today*.

    A row that is missing a required key or is rejected by SQLite is
    reported on stdout and skipped; the remaining rows are still written.
    NOTE(review): ``INSERT OR IGNORE`` only skips constraint violations and
    the schema created by ``init_db`` declares no UNIQUE constraint, so
    repeated runs insert the same rows again -- confirm intent.
    """
    insert_sql = '''
        INSERT OR IGNORE INTO predictions
        (date, race_name, race_hippodrome, race_time, horse_number, horse_name,
         odds, prediction_rank, source, jockey, odds_time)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    now = datetime.now().isoformat()

    course = quinte_data.get("course", {})
    race_name = course.get("nom", "Quinté+")
    hippodrome = course.get("hippodrome", "")
    race_time = course.get("heure", "13:55")

    saved = 0
    with closing(sqlite3.connect(DB_PATH)) as conn:
        c = conn.cursor()

        def insert(label, item, name_key, build_row):
            """Insert one row; report and skip it on bad data or a DB error."""
            try:
                c.execute(insert_sql, build_row(item))
            except (KeyError, sqlite3.Error) as e:
                # .get() keeps the handler itself from failing when the
                # missing key is the one used in the message.
                print(f"  ⚠️ {label} {item.get(name_key, '?')}: {e}")
                return 0
            return c.rowcount

        # 1. Runners with odds
        for p in quinte_data.get("partants", []):
            saved += insert("Partant", p, "cheval", lambda p: (
                today, race_name, hippodrome, race_time,
                p["numero"], p["cheval"], p.get("cote") or 0,
                0, "canalturf_partants", p.get("jockey", ""), now))

        # 2. Forecast (bases=1, chances=2, outsiders=3)
        for category, rank in [("bases", 1), ("chances", 2), ("outsiders", 3)]:
            for horse in quinte_data.get("pronostic", {}).get(category, []):
                saved += insert("Prono", horse, "cheval", lambda h: (
                    today, race_name, hippodrome, race_time,
                    h["numero"], h["cheval"], 0,
                    rank, f"canalturf_prono_{category}", None, now))

        # 3. Selections for other races. Use the selection's own racecourse
        # when it carries one; otherwise fall back to the Quinté racecourse
        # as before.
        for sel in selections:
            saved += insert("Sélection", sel, "horse_name", lambda s: (
                s["date"], s["race_name"], s.get("hippodrome", hippodrome), s["race_time"],
                s["horse_number"], s["horse_name"], s.get("cote_pmu") or 0,
                0, "canalturf_selections", s.get("jockey", ""), now))

        conn.commit()
        c.execute('SELECT COUNT(*) FROM predictions WHERE date = ?', (today,))
        total_today = c.fetchone()[0]

    return saved, total_today
|
||
|
||
|
||
def save_race_meta(quinte_data, today):
    """Store the race start time (HH:MM plus Unix timestamp) in ``race_meta``.

    Args:
        quinte_data: Dict whose ``course`` entry may provide ``nom``,
            ``hippodrome`` and ``heure``; defaults are "Quinté+", "" and
            "13:55".
        today: Date string in ``YYYY-MM-DD`` format, combined with the race
            time to compute the timestamp.

    The timestamp is stored as NULL when the date/time cannot be parsed.
    """
    course = quinte_data.get("course", {})
    race_name = course.get("nom", "Quinté+")
    hippodrome = course.get("hippodrome", "")
    race_time = course.get("heure", "13:55")

    # Convert HH:MM into a timestamp for the given day (naive local time).
    try:
        ts = int(datetime.strptime(f"{today} {race_time}", "%Y-%m-%d %H:%M").timestamp())
    except (ValueError, OverflowError, OSError):
        # Unparseable or out-of-range time: keep the row, drop the timestamp.
        ts = None

    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute('''
            INSERT INTO race_meta (date, race_name, race_hippodrome, race_time, race_timestamp)
            VALUES (?, ?, ?, ?, ?)
        ''', (today, race_name, hippodrome, race_time, ts))
        conn.commit()

    print(f"🕒 Heure course sauvegardée : {race_time} (ts={ts})")
|
||
|
||
def save_odds_history(quinte_data, today):
    """Append a snapshot of the current odds to ``odds_history``.

    Called on every run so the odds can be followed throughout the day.
    Runners without odds (missing, ``None`` or not strictly positive) are
    skipped.

    Args:
        quinte_data: Dict with ``course`` and ``partants`` keys, as returned
            by ``parse_canalturf_quinte``.
        today: Date string stored in the ``date`` column.

    Returns:
        Number of rows inserted.
    """
    now = datetime.now().isoformat()

    course = quinte_data.get("course", {})
    race_name = course.get("nom", "Quinté+")
    hippodrome = course.get("hippodrome", "")

    # Build every row before touching the database so that malformed input
    # cannot leave a half-written snapshot behind.
    rows = [
        (today, race_name, hippodrome, p["numero"], p["cheval"], p["cote"], now, "canalturf")
        for p in quinte_data.get("partants", [])
        if p.get("cote") and p["cote"] > 0
    ]

    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.executemany('''
            INSERT INTO odds_history
            (date, race_name, race_hippodrome, horse_number, horse_name, odds, scraped_at, source)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', rows)
        conn.commit()

    return len(rows)
|
||
|
||
|
||
def print_odds_evolution(today):
    """Print how each horse's odds moved since the first snapshot of the day.

    Reads every ``odds_history`` row for *today*, compares each horse's
    first snapshot with its latest one and prints a table sorted by current
    odds. A move beyond +/-5% is flagged as a rise or a drop. Prints
    nothing when there is no snapshot for that date.

    Args:
        today: Date string matched against the ``date`` column.
    """
    with closing(sqlite3.connect(DB_PATH)) as conn:
        # All snapshots of the day, oldest first within each horse.
        rows = conn.execute('''
            SELECT horse_name, odds, scraped_at
            FROM odds_history
            WHERE date = ?
            ORDER BY horse_name, scraped_at ASC
        ''', (today,)).fetchall()

    if not rows:
        return

    # Group the odds by horse, in chronological order.
    history = {}
    for horse, odds, _ts in rows:
        history.setdefault(horse, []).append(odds)

    evolutions = []
    for horse, odds_list in history.items():
        cote_debut, cote_actuel = odds_list[0], odds_list[-1]
        evol_pct = ((cote_actuel - cote_debut) / cote_debut) * 100 if cote_debut > 0 else 0
        evolutions.append((horse, cote_debut, cote_actuel, evol_pct, len(odds_list)))

    separator = '-' * 60
    print(f"\n📈 ÉVOLUTION DES COTES — {today}")
    print(separator)
    print(f"  {'CHEVAL':<25} {'MATIN':<8} {'ACTUEL':<8} {'ÉVOL':<8} TENDANCE")
    print(separator)

    # Sort by current odds.
    for horse, debut, actuel, evol, nb in sorted(evolutions, key=lambda x: x[2]):
        if evol < -5:
            tendance = "📉 BAISSE"
        elif evol > 5:
            tendance = "📈 HAUSSE"
        else:
            tendance = "➡️ STABLE"
        evol_str = f"{evol:+.0f}%" if nb > 1 else "1er snap"
        # str() guards against a NULL horse_name, which cannot be formatted.
        print(f"  {str(horse):<25} {debut:<8} {actuel:<8} {evol_str:<8} {tendance}")

    # Rows are ordered by horse name, so the earliest snapshot of the day is
    # the minimum timestamp, not simply the first row.
    first_snapshot = min(ts for _horse, _odds, ts in rows)
    print(separator)
    print(f"  ({len(evolutions)} chevaux, {first_snapshot[:16]} → maintenant)")
|
||
|
||
|
||
# ============== URL LIST ==============
|
||
|
||
def get_urls():
    """Return the pages to fetch as a list of ``(url, site)`` tuples.

    Covers seven sites. The dated Equidia page is built from today's date
    rather than a fixed day, so it always targets the current programme.
    """
    today = datetime.now().strftime('%Y-%m-%d')
    sites = {
        'equidia': ['https://www.equidia.fr/courses', f'https://www.equidia.fr/courses/{today}'],
        'zeturf': ['https://www.zeturf.fr/fr/courses-du-jour', 'https://www.zeturf.fr/en'],
        'canalturf': ['https://www.canalturf.com/courses_chevaux_jour.php', 'https://www.canalturf.com/courses_quinte.php'],
        'boturfers': ['https://www.boturfers.fr', 'https://www.boturfers.fr/quinte-du-jour', 'https://www.boturfers.fr/quinte-de-demain'],
        'zone-turf': ['https://www.zone-turf.fr', 'https://www.zone-turf.fr/programmes/'],
        'genybet': ['https://www.genybet.fr', 'https://www.genybet.fr/courses/'],
        'ruedesjoueurs': ['https://www.ruedesjoueurs.com/turf.html', 'https://www.ruedesjoueurs.com/turf/pronostics.html'],
    }
    return [(url, site) for site, pages in sites.items() for url in pages]
|
||
|
||
# ============== MAIN ==============
|
||
|
||
def main():
    """Run one full scraping pass.

    Steps: initialise the database, fetch every URL from ``get_urls`` in
    parallel, parse the canalturf and boturfers pages, store predictions
    and an odds snapshot, print the odds evolution and a Quinté+ summary,
    then dump the raw fetch results to a timestamped JSON file in the
    directory given by the TURF_DIR environment variable.
    """
    start = time.time()
    banner = '=' * 50
    print(f"\n{banner}")
    print("🐾 TURF SCRAPER v5 - REALTIME SAVING")
    print(f"{banner}\n")

    init_db()

    urls = get_urls()
    counter["total"] = len(urls)
    # Reset progress so a second call in the same process starts from 0%.
    counter["done"] = 0

    print(f"📡 Fetching {len(urls)} pages...\n")

    # fetch_url never raises, so result() is safe here.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_url, u) for u in urls]
        results = [future.result() for future in as_completed(futures)]

    elapsed = time.time() - start
    today = datetime.now().strftime('%Y-%m-%d')

    print("\n📊 Parsing predictions...")

    quinte_data = {"course": {}, "partants": [], "pronostic": {}}
    selections = []
    boturfers_info = {}

    for r in results:
        if r['status'] != 'success':
            continue
        site = r['site']
        url = r['url']
        content = r['content']

        if site == 'canalturf':
            if 'quinte' in url:
                quinte_data = parse_canalturf_quinte(content)
                nb_p = len(quinte_data['partants'])
                nb_b = len(quinte_data['pronostic'].get('bases', []))
                print(f"  canalturf quinté : {nb_p} partants, {nb_b} base(s) trouvé(s)")
            else:
                selections = parse_canalturf_selections(content)
                print(f"  canalturf sélections : {len(selections)} course(s)")

        elif site == 'boturfers' and 'quinte-du-jour' in url:
            boturfers_info = parse_boturfers_quinte(content)
            temp = boturfers_info.get('temperature', '?')
            print(f"  boturfers : {boturfers_info.get('nb_partants', '?')} partants, {temp}°C")

    # Save to the database.
    # NOTE(review): save_race_meta() is defined in this file but is not
    # called here, so race_meta is never populated -- confirm intent.
    saved, total_today = save_parsed_data(quinte_data, selections, today)
    print(f"\n💾 {saved} nouvelles entrées insérées en BDD")

    # Odds snapshot into odds_history.
    odds_saved = save_odds_history(quinte_data, today)
    print(f"📊 {odds_saved} cotes sauvegardées dans odds_history")

    # Show how the odds moved during the day.
    print_odds_evolution(today)

    # Quinté+ summary, runners sorted by odds (unknown odds last).
    if quinte_data["partants"]:
        course = quinte_data["course"]
        pronostic = quinte_data['pronostic']
        print(f"\n{'='*55}")
        print(f"🏇 {course.get('nom', 'Quinté+')} — {course.get('hippodrome', '')} {course.get('heure', '')} ({course.get('distance', '')}m)")
        print(f"{'─'*55}")
        print(f"  {'N°':<4} {'CHEVAL':<25} {'JOCKEY':<20} COTE")
        print(f"{'─'*55}")
        for p in sorted(quinte_data["partants"], key=lambda x: x.get("cote") or 999):
            cote_str = str(p['cote']) if p['cote'] else "?"
            print(f"  {p['numero']:<4} {p['cheval']:<25} {p['jockey']:<20} {cote_str}")
        bases = [h['cheval'] for h in pronostic.get('bases', [])]
        if bases:
            print(f"\n  ⭐ Bases    : {', '.join(bases)}")
        chances = [h['cheval'] for h in pronostic.get('chances', [])]
        if chances:
            print(f"  🎯 Chances  : {', '.join(chances)}")
        outsiders = [h['cheval'] for h in pronostic.get('outsiders', [])]
        if outsiders:
            print(f"  🔍 Outsiders : {', '.join(outsiders)}")
        print(f"{'='*55}")

    # Per-site count of successfully fetched pages.
    by_site = {}
    for r in results:
        s = r['site']
        by_site[s] = by_site.get(s, 0) + (1 if r['status'] == 'success' else 0)

    print("\n📊 STATS:")
    for site, count in by_site.items():
        print(f"  {site}: {count} pages")

    # JSON dump. Create the directory first so the run does not fail at the
    # very last step when TURF_DIR points to a folder that does not exist.
    out_dir = os.environ.get('TURF_DIR', '/home/h3r7/turf_scraper')
    os.makedirs(out_dir, exist_ok=True)
    output = f"{out_dir}/v5_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': datetime.now().isoformat(),
            'runtime_sec': round(elapsed, 2),
            'total_pages': len(urls),
            'pages': results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n{banner}")
    print(f"✅ DONE! {len(results)} pages in {elapsed:.1f}s")
    print(f"💾 {total_today} prédictions en BDD pour aujourd'hui")
    print(f"📁 {output}")
    print(f"{banner}\n")
|
||
|
||
# Script entry point: run the scraper only when executed directly.
# main() returns None, so the process exits with status 0.
if __name__ == "__main__":
    raise SystemExit(main())
|