Files
turf_saas/multi_scraper_v5.py
2026-04-25 17:18:43 +02:00

700 lines
24 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Turf Scraper v5 - REALTIME DATABASE SAVING
Saves predictions immediately as they're scraped
Built-in robust parsers: canalturf (runners + forecast + selections), boturfers (race info)
"""
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import sqlite3
import re
import os
# Path of the SQLite database file opened by every persistence helper.
DB_PATH = "/home/h3r7/turf_scraper/turf.db"

# Browser-like headers sent with each page request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
}

# Guards the shared progress counter updated from the fetch worker threads.
lock = threading.Lock()
counter = dict(total=0, done=0)
# ============== DATABASE FUNCTIONS ==============
def init_db():
    """Create the SQLite schema at ``DB_PATH`` if it does not exist yet.

    Creates the ``predictions``, ``results``, ``performance``,
    ``odds_history`` and ``race_meta`` tables when they are missing, and adds
    the ``jockey`` / ``odds_time`` / ``odds_prev`` columns to an already
    existing ``predictions`` table that lacks them. Safe to call repeatedly.
    """
    # Open exactly one connection and always release it, even if a statement fails.
    conn = sqlite3.connect(DB_PATH)
    try:
        c = conn.cursor()
        c.execute('''
            CREATE TABLE IF NOT EXISTS predictions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT NOT NULL,
                race_name TEXT,
                race_hippodrome TEXT,
                race_time TEXT,
                horse_number INTEGER,
                horse_name TEXT,
                odds REAL,
                prediction_rank INTEGER,
                source TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                jockey TEXT,
                odds_time TEXT,
                odds_prev REAL
            )
        ''')

        # Migration: add the newer columns to a table created by an older
        # version. Inspecting the actual columns avoids swallowing unrelated
        # OperationalError failures such as a locked database.
        c.execute("PRAGMA table_info(predictions)")
        existing_cols = {row[1] for row in c.fetchall()}
        for col, coltype in (("jockey", "TEXT"), ("odds_time", "TEXT"), ("odds_prev", "REAL")):
            if col not in existing_cols:
                c.execute(f"ALTER TABLE predictions ADD COLUMN {col} {coltype}")

        c.execute('''
            CREATE TABLE IF NOT EXISTS results (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT NOT NULL,
                race_name TEXT,
                race_hippodrome TEXT,
                position INTEGER,
                horse_name TEXT,
                odds REAL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        c.execute('''
            CREATE TABLE IF NOT EXISTS performance (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                prediction_date TEXT,
                race_date TEXT,
                horse_name TEXT,
                predicted_rank INTEGER,
                actual_position INTEGER,
                hit BOOLEAN,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # odds_history: intraday history of the odds, one row per snapshot.
        c.execute('''
            CREATE TABLE IF NOT EXISTS odds_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT NOT NULL,
                race_name TEXT,
                race_hippodrome TEXT,
                horse_number INTEGER,
                horse_name TEXT,
                odds REAL NOT NULL,
                scraped_at TEXT NOT NULL,
                source TEXT DEFAULT 'canalturf'
            )
        ''')
        c.execute('''
            CREATE TABLE IF NOT EXISTS race_meta (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT NOT NULL,
                race_name TEXT,
                race_hippodrome TEXT,
                race_time TEXT,
                race_timestamp INTEGER,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        conn.close()
    print(f"✅ DB initialized: {DB_PATH}")
def add_prediction(date, race_name, race_hippodrome, race_time, horse_number, horse_name,
                   odds, prediction_rank, source, jockey="", odds_time=None):
    """Insert one prediction row into the ``predictions`` table.

    Args:
        date: Race date stored in the ``date`` column.
        race_name, race_hippodrome, race_time: Race identification fields.
        horse_number, horse_name: Runner identification fields.
        odds: Odds stored for the runner.
        prediction_rank: Rank/category assigned to the runner.
        source: Label of the site or parser the row comes from.
        jockey: Jockey name (empty string by default).
        odds_time: Timestamp of the odds; defaults to the current local time
            in ISO format when omitted or falsy.

    Note:
        The statement is ``INSERT OR IGNORE``, which only skips rows that
        violate a constraint. The ``predictions`` schema created by
        ``init_db`` declares no UNIQUE constraint, so identical rows are not
        de-duplicated by this call.
    """
    # The connection was previously never opened here, so every call failed
    # with NameError; open it locally and always close it.
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            INSERT OR IGNORE INTO predictions
            (date, race_name, race_hippodrome, race_time, horse_number, horse_name,
             odds, prediction_rank, source, jockey, odds_time)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (date, race_name, race_hippodrome, race_time, horse_number, horse_name,
              odds, prediction_rank, source, jockey,
              odds_time or datetime.now().isoformat()))
        conn.commit()
    finally:
        conn.close()
def add_result(date, race_name, race_hippodrome, position, horse_name, odds):
    """Insert one finishing position into the ``results`` table.

    Args:
        date: Race date stored in the ``date`` column.
        race_name, race_hippodrome: Race identification fields.
        position: Finishing position of the horse.
        horse_name: Name of the horse.
        odds: Odds recorded for the horse.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            INSERT INTO results (date, race_name, race_hippodrome, position, horse_name, odds)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (date, race_name, race_hippodrome, position, horse_name, odds))
        conn.commit()
    finally:
        # Release the connection even when the insert fails.
        conn.close()
# ============== SCRAPER FUNCTIONS ==============
def fetch_url(args):
    """Download one page and reduce it to plain text.

    Args:
        args: ``(url, site)`` tuple; a single argument so the function can be
            submitted directly to an executor.

    Returns:
        On success ``{'url', 'site', 'content', 'status': 'success'}`` where
        ``content`` is the page text without ``<script>``/``<style>``
        elements, truncated to 8000 characters. On any failure
        ``{'url', 'site', 'error', 'status': 'error'}``.
    """
    url, site = args
    try:
        r = requests.get(url, headers=HEADERS, timeout=12)
        soup = BeautifulSoup(r.text, 'html.parser')
        for s in soup(["script", "style"]):
            s.decompose()
        text = soup.get_text(separator='\n', strip=True)[:8000]
    except Exception as e:
        # Best-effort scraping: a failing page is reported, never raised.
        with lock:
            counter["done"] += 1
        return {'url': url, 'site': site, 'error': str(e), 'status': 'error'}

    # Progress bookkeeping lives outside the try block so that a problem here
    # (e.g. counter["total"] still 0) cannot turn a successful fetch into an
    # error result or count the page twice.
    with lock:
        counter["done"] += 1
        total = counter["total"]
        pct = (counter["done"] / total) * 100 if total else 0.0
        print(f" [{pct:.0f}%] {site}: OK")
    return {'url': url, 'site': site, 'content': text, 'status': 'success'}
# ============== ROBUST PARSERS ==============
def parse_canalturf_quinte(content):
    """Parse the plain text of canalturf's ``courses_quinte.php`` page.

    Args:
        content: Page text with one item per line.

    Returns:
        A dict with three keys:

        * ``"course"``: whichever of ``nom``, ``hippodrome``, ``heure``,
          ``distance`` (int) and ``type`` could be found.
        * ``"partants"``: list of ``{"numero", "cheval", "jockey", "cote"}``
          dicts; ``cote`` is ``None`` when no parsable odds line follows.
        * ``"pronostic"``: ``{"bases", "chances", "outsiders"}``, each a list
          of ``{"numero", "cheval"}`` dicts.
    """
    result = {
        "course": {},
        "partants": [],
        "pronostic": {"bases": [], "chances": [], "outsiders": []}
    }
    lines = [l.strip() for l in content.split('\n') if l.strip()]

    # Race name: first line starting with "PRIX <uppercase letter>".
    for line in lines:
        if re.search(r'^PRIX\s+[A-Z]', line):
            result["course"]["nom"] = line.strip()
            break

    # Racecourse
    m = re.search(r'hippodrome de\s+([A-Z\-]+)', content, re.IGNORECASE)
    if m:
        result["course"]["hippodrome"] = m.group(1).strip()

    # Start time: first HH:MM occurrence in the page.
    m = re.search(r'(\d{1,2}:\d{2})', content)
    if m:
        result["course"]["heure"] = m.group(1)

    # Distance
    m = re.search(r'(\d{3,4})m', content)
    if m:
        result["course"]["distance"] = int(m.group(1))

    # Race type: first label (in this priority order) present in the page.
    content_upper = content.upper()
    for t in ['TROT ATTELE', 'TROT MONTE', 'PLAT', 'OBSTACLE', 'HAIES', 'STEEPLE']:
        if t in content_upper:
            result["course"]["type"] = t
            break

    # Runners: blocks of number / NAME / jockey / optional odds. Restrict the
    # scan to the zone between the runner list and the forecast so the
    # forecast block (same names, no odds) is not parsed as runners.
    liste_idx = content.find("Liste des partants")
    prono_idx = content.find("Le pronostic du Quinté+")
    partants_zone = content[liste_idx:prono_idx] if liste_idx != -1 and prono_idx != -1 else content
    lines_partants = [l.strip() for l in partants_zone.split('\n') if l.strip()]
    total = len(lines_partants)
    seen_nums = set()
    i = 0
    while i < total:
        if re.match(r'^\d{1,2}$', lines_partants[i]):
            num = int(lines_partants[i])
            if 1 <= num <= 20 and num not in seen_nums and i + 2 < total:
                nom_cheval = lines_partants[i + 1]
                jockey = lines_partants[i + 2]
                cote = None
                if i + 3 < total and re.match(r'[\d\.]+/\d', lines_partants[i + 3]):
                    try:
                        cote = float(lines_partants[i + 3].split('/')[0])
                    except ValueError:
                        # e.g. "1.2.3/1": keep the runner, leave odds unknown.
                        pass
                    i += 4
                else:
                    i += 3
                # Accept the block only if the name looks like an uppercase
                # horse name and the jockey line contains a capital letter.
                if re.search(r'[A-Z]{3,}', nom_cheval) and re.search(r'[A-Z]', jockey):
                    seen_nums.add(num)
                    result["partants"].append({
                        "numero": num,
                        "cheval": nom_cheval.strip(),
                        "jockey": jockey.strip(),
                        "cote": cote
                    })
                # NOTE(review): nesting reconstructed from a flattened source;
                # a rejected candidate still consumes its lines — confirm.
                continue
        i += 1

    # Forecast: only horses listed inside each dedicated section, delimited
    # by the section keyword and the nearest following keyword.
    def extract_horses_between(start_kw, end_kws):
        idx_start = content.find(start_kw)
        if idx_start == -1:
            return []
        search_from = idx_start + len(start_kw)
        idx_end = len(content)
        for kw in end_kws:
            idx = content.find(kw, search_from)
            if idx != -1 and idx < idx_end:
                idx_end = idx
        snippet = content[idx_start:idx_end]
        # Entries look like "<number> <UPPERCASE NAME> (" in the page text.
        return [
            {"numero": int(m.group(1)), "cheval": m.group(2).strip()}
            for m in re.finditer(r'(\d{1,2})\s+([A-Z][A-Z\s\-\']+?)\s*\(', snippet)
        ]

    result["pronostic"]["bases"] = extract_horses_between(
        "Base(s)", ["Chance(s) régulière(s)", "Outsider(s)", "Le cheval"])
    result["pronostic"]["chances"] = extract_horses_between(
        "Chance(s) régulière(s)", ["Outsider(s)", "Le cheval"])
    result["pronostic"]["outsiders"] = extract_horses_between(
        "Outsider(s)", ["Le cheval", "Partants détaillés"])
    return result
def parse_canalturf_selections(content):
    """Parse the plain text of canalturf's ``courses_chevaux_jour.php`` page.

    Looks for entries shaped like
    ``C<n> - PRIX ... (HH:MM) <num> - <HORSE NAME> (<jockey>)`` optionally
    followed, within the next 100 characters, by ``<odds> PMU``.

    Args:
        content: Page text.

    Returns:
        List of dicts with keys ``date`` (today, ``YYYY-MM-DD``),
        ``race_name``, ``race_time``, ``horse_number``, ``horse_name``,
        ``jockey`` and ``cote_pmu`` (``0.0`` when no PMU odds follow).
    """
    # The separators accept en/em dashes as well as the ASCII hyphen, written
    # as escapes so no look-alike character has to live in the source file.
    entry_re = re.compile(
        r'C(\d+)\s*[-\u2013\u2014]\s*(PRIX[^(]+)\((\d{1,2}:\d{2})\)\s*'
        r'(\d{1,2})\s*[-\u2013\u2014]\s*([A-Z][A-Z\s\'\-]+?)\s*\(([^)]+)\)'
    )
    odds_re = re.compile(r'(\d+\.?\d*)\s*PMU')
    today = datetime.now().strftime('%Y-%m-%d')

    selections = []
    for m in entry_re.finditer(content):
        # PMU odds, when present, follow shortly after the jockey.
        cote_m = odds_re.search(content[m.end():m.end() + 100])
        selections.append({
            "date": today,
            "race_name": m.group(2).strip(),
            "race_time": m.group(3),
            "horse_number": int(m.group(4)),
            "horse_name": m.group(5).strip(),
            "jockey": m.group(6).strip(),
            "cote_pmu": float(cote_m.group(1)) if cote_m else 0.0,
        })
    return selections
def parse_boturfers_quinte(content):
    """Parse the plain text of boturfers.fr ``/quinte-du-jour``.

    Args:
        content: Page text.

    Returns:
        Dict holding whichever of these could be found:
        ``nb_partants`` (int), ``temperature`` (int, may be negative) and
        ``probabilites`` (``{"top<N>": percent}``).
    """
    info = {}

    m = re.search(r'(\d+)\s*partants', content)
    if m:
        info["nb_partants"] = int(m.group(1))

    # Keep the sign of sub-zero temperatures. The lookbehind stops the "-" of
    # a range such as "10-15°C" from being read as a minus sign.
    m = re.search(r'(?<!\d)(-?\d+)°C', content)
    if m:
        info["temperature"] = int(m.group(1))

    # Pairs of "<percent>%" followed on the next line by "en <N> cheval".
    probs = re.findall(r'(\d+)%\s*\nen (\d+) cheval', content)
    if probs:
        info["probabilites"] = {f"top{rank}": int(pct) for pct, rank in probs}
    return info
def save_parsed_data(quinte_data, selections, today):
    """Store the parsed runners, forecast and selections in ``predictions``.

    Args:
        quinte_data: Dict with optional ``course``, ``partants`` and
            ``pronostic`` keys, as built by ``parse_canalturf_quinte``.
        selections: List of selection dicts, as built by
            ``parse_canalturf_selections``.
        today: Date string stored with the Quinté+ rows and used for the
            final count.

    Returns:
        ``(saved, total_today)``: number of rows inserted by this call and
        number of ``predictions`` rows whose ``date`` equals ``today``.

    Note:
        Rows are written with ``INSERT OR IGNORE``; the ``predictions`` schema
        created by ``init_db`` declares no UNIQUE constraint, so repeated runs
        are not de-duplicated by the database.
    """
    insert_sql = '''
        INSERT OR IGNORE INTO predictions
        (date, race_name, race_hippodrome, race_time, horse_number, horse_name,
         odds, prediction_rank, source, jockey, odds_time)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    # A bad row must not abort the whole save (best-effort per row).
    row_errors = (KeyError, TypeError, sqlite3.Error)

    def warn(label, entry, name_key, err):
        # Use .get so a missing name key cannot raise inside the handler.
        name = entry.get(name_key, "?") if isinstance(entry, dict) else "?"
        print(f" ⚠️ {label} {name}: {err}")

    now = datetime.now().isoformat()
    course = quinte_data.get("course", {})
    race_name = course.get("nom", "Quinté+")
    hippodrome = course.get("hippodrome", "")
    race_time = course.get("heure", "13:55")
    saved = 0

    conn = sqlite3.connect(DB_PATH)
    try:
        c = conn.cursor()

        # 1. Runners with their odds (rank 0).
        for p in quinte_data.get("partants", []):
            try:
                c.execute(insert_sql, (
                    today, race_name, hippodrome, race_time,
                    p["numero"], p["cheval"], p.get("cote") or 0,
                    0, "canalturf_partants", p.get("jockey", ""), now))
                saved += c.rowcount
            except row_errors as e:
                warn("Partant", p, "cheval", e)

        # 2. Forecast (bases=1, chances=2, outsiders=3); no odds, no jockey.
        for category, rank in (("bases", 1), ("chances", 2), ("outsiders", 3)):
            for horse in quinte_data.get("pronostic", {}).get(category, []):
                try:
                    c.execute(insert_sql, (
                        today, race_name, hippodrome, race_time,
                        horse["numero"], horse["cheval"], 0,
                        rank, f"canalturf_prono_{category}", None, now))
                    saved += c.rowcount
                except row_errors as e:
                    warn("Prono", horse, "cheval", e)

        # 3. Selections for the other races of the day.
        # NOTE(review): selections carry no racecourse of their own today, so
        # the Quinté+ racecourse is used as a fallback — confirm this is wanted.
        for sel in selections:
            try:
                c.execute(insert_sql, (
                    sel["date"], sel["race_name"], sel.get("hippodrome", hippodrome),
                    sel["race_time"], sel["horse_number"], sel["horse_name"],
                    sel.get("cote_pmu") or 0,
                    0, "canalturf_selections", sel.get("jockey", ""), now))
                saved += c.rowcount
            except row_errors as e:
                warn("Sélection", sel, "horse_name", e)

        conn.commit()
        c.execute('SELECT COUNT(*) FROM predictions WHERE date = ?', (today,))
        total_today = c.fetchone()[0]
    finally:
        conn.close()
    return saved, total_today
def save_race_meta(quinte_data, today):
    """Store the race start time (HH:MM plus Unix timestamp) in ``race_meta``.

    Args:
        quinte_data: Dict whose optional ``course`` entry provides ``nom``,
            ``hippodrome`` and ``heure``; defaults are ``"Quinté+"``, ``""``
            and ``"13:55"``.
        today: Date string in ``YYYY-MM-DD`` format.

    The timestamp is computed from ``today`` and the race time interpreted as
    a naive local datetime; it is stored as NULL when they cannot be parsed.
    """
    course = quinte_data.get("course", {})
    race_name = course.get("nom", "Quinté+")
    hippodrome = course.get("hippodrome", "")
    race_time = course.get("heure", "13:55")

    # Convert HH:MM into a timestamp for the given day.
    try:
        dt = datetime.strptime(f"{today} {race_time}", "%Y-%m-%d %H:%M")
        ts = int(dt.timestamp())
    except (ValueError, OverflowError, OSError):
        # Unparsable or unrepresentable date/time: keep the row, drop the ts.
        ts = None

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            INSERT INTO race_meta (date, race_name, race_hippodrome, race_time, race_timestamp)
            VALUES (?, ?, ?, ?, ?)
        ''', (today, race_name, hippodrome, race_time, ts))
        conn.commit()
    finally:
        conn.close()
    print(f"🕒 Heure course sauvegardée : {race_time} (ts={ts})")
def save_odds_history(quinte_data, today):
    """Append a snapshot of the current odds to ``odds_history``.

    One row is written per runner that has strictly positive odds, all
    stamped with the same ``scraped_at`` time, so successive runs record how
    the odds move during the day.

    Args:
        quinte_data: Dict with optional ``course`` and ``partants`` keys, as
            built by ``parse_canalturf_quinte``.
        today: Date string stored in the ``date`` column.

    Returns:
        Number of rows inserted.
    """
    now = datetime.now().isoformat()
    course = quinte_data.get("course", {})
    race_name = course.get("nom", "Quinté+")
    hippodrome = course.get("hippodrome", "")

    rows = []
    for p in quinte_data.get("partants", []):
        cote = p.get("cote")
        # Skip runners without usable odds (None, 0 or negative).
        if not cote or cote <= 0:
            continue
        rows.append((today, race_name, hippodrome,
                     p["numero"], p["cheval"], cote, now, "canalturf"))

    conn = sqlite3.connect(DB_PATH)
    try:
        if rows:
            conn.executemany('''
                INSERT INTO odds_history
                (date, race_name, race_hippodrome, horse_number, horse_name, odds, scraped_at, source)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ''', rows)
        conn.commit()
    finally:
        conn.close()
    return len(rows)
def print_odds_evolution(today):
    """Print how each horse's odds moved between its first and last snapshot.

    Reads every ``odds_history`` row whose ``date`` equals ``today``, groups
    the rows by horse name and prints one line per horse (sorted by current
    odds) with the first odds, the latest odds, the percentage change and a
    trend label (change below -5 %, above +5 %, or stable). Prints nothing
    when there is no snapshot for that date.

    Args:
        today: Date string to report on.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        rows = conn.execute('''
            SELECT horse_name, odds, scraped_at
            FROM odds_history
            WHERE date = ?
            ORDER BY horse_name, scraped_at ASC
        ''', (today,)).fetchall()
    finally:
        conn.close()
    if not rows:
        return

    # Group snapshots per horse; rows arrive oldest-first within each horse.
    horses = {}
    for horse, odds, ts in rows:
        horses.setdefault(horse, []).append((odds, ts))

    print(f"\n📈 ÉVOLUTION DES COTES — {today}")
    print(f"{'-'*60}")
    print(f" {'CHEVAL':<25} {'MATIN':<8} {'ACTUEL':<8} {'ÉVOL':<8} TENDANCE")
    print(f"{'-'*60}")

    evolutions = []
    for horse, snapshots in horses.items():
        cote_debut = snapshots[0][0]
        cote_actuel = snapshots[-1][0]
        if cote_debut > 0:
            evol_pct = ((cote_actuel - cote_debut) / cote_debut) * 100
        else:
            evol_pct = 0
        evolutions.append((horse, cote_debut, cote_actuel, evol_pct, len(snapshots)))

    # Sort by current odds (favourites first).
    for horse, debut, actuel, evol, nb in sorted(evolutions, key=lambda x: x[2]):
        if evol < -5:
            tendance = "📉 BAISSE"
        elif evol > 5:
            tendance = "📈 HAUSSE"
        else:
            tendance = "➡️ STABLE"
        evol_str = f"{evol:+.0f}%" if nb > 1 else "1er snap"
        print(f" {horse:<25} {debut:<8} {actuel:<8} {evol_str:<8} {tendance}")
    print(f"{'-'*60}")

    # Rows are ordered by horse name first, so the earliest snapshot of the
    # day must be looked up explicitly rather than taken from rows[0].
    first_snapshot = min(ts for _, _, ts in rows)
    print(f" ({len(evolutions)} chevaux, {first_snapshot[:16]} → maintenant)")
# ============== URL LIST ==============
def get_urls():
    """Return the pages to scrape as a list of ``(url, site)`` tuples.

    Covers seven sites; the dated equidia programme URL is built from the
    current local date instead of a fixed day.
    """
    today = datetime.now().strftime('%Y-%m-%d')
    sites = {
        'equidia': ['https://www.equidia.fr/courses', f'https://www.equidia.fr/courses/{today}'],
        'zeturf': ['https://www.zeturf.fr/fr/courses-du-jour', 'https://www.zeturf.fr/en'],
        'canalturf': ['https://www.canalturf.com/courses_chevaux_jour.php', 'https://www.canalturf.com/courses_quinte.php'],
        'boturfers': ['https://www.boturfers.fr', 'https://www.boturfers.fr/quinte-du-jour', 'https://www.boturfers.fr/quinte-de-demain'],
        'zone-turf': ['https://www.zone-turf.fr', 'https://www.zone-turf.fr/programmes/'],
        'genybet': ['https://www.genybet.fr', 'https://www.genybet.fr/courses/'],
        'ruedesjoueurs': ['https://www.ruedesjoueurs.com/turf.html', 'https://www.ruedesjoueurs.com/turf/pronostics.html'],
    }
    return [(url, site) for site, pages in sites.items() for url in pages]
# ============== MAIN ==============
def _parse_pages(results):
    """Run the site-specific parsers over the successfully fetched pages."""
    quinte_data = {"course": {}, "partants": [], "pronostic": {}}
    selections = []
    for r in results:
        if r['status'] != 'success':
            continue
        site = r['site']
        url = r['url']
        content = r['content']
        if site == 'canalturf':
            if 'quinte' in url:
                quinte_data = parse_canalturf_quinte(content)
                nb_p = len(quinte_data['partants'])
                nb_b = len(quinte_data['pronostic'].get('bases', []))
                print(f" canalturf quinté : {nb_p} partants, {nb_b} base(s) trouvé(s)")
            else:
                selections = parse_canalturf_selections(content)
                print(f" canalturf sélections : {len(selections)} course(s)")
        elif site == 'boturfers' and 'quinte-du-jour' in url:
            boturfers_info = parse_boturfers_quinte(content)
            temp = boturfers_info.get('temperature', '?')
            print(f" boturfers : {boturfers_info.get('nb_partants', '?')} partants, {temp}°C")
    return quinte_data, selections


def _print_quinte_summary(quinte_data):
    """Print the Quinté+ runners (sorted by odds) and the forecast lists."""
    course = quinte_data["course"]
    print(f"\n{'='*55}")
    # Name and racecourse are separated explicitly; they used to be glued.
    print(f"🏇 {course.get('nom', 'Quinté+')} — {course.get('hippodrome', '')} {course.get('heure', '')} ({course.get('distance', '')}m)")
    # The rule character was an empty string, which printed blank lines.
    print(f"{'-'*55}")
    print(f" {'':<4} {'CHEVAL':<25} {'JOCKEY':<20} COTE")
    print(f"{'-'*55}")
    # Runners without odds sort last.
    for p in sorted(quinte_data["partants"], key=lambda x: x.get("cote") or 999):
        cote_str = str(p['cote']) if p['cote'] else "?"
        print(f" {p['numero']:<4} {p['cheval']:<25} {p['jockey']:<20} {cote_str}")
    pronostic = quinte_data['pronostic']
    bases = [h['cheval'] for h in pronostic.get('bases', [])]
    if bases:
        print(f"\n ⭐ Bases : {', '.join(bases)}")
    chances = [h['cheval'] for h in pronostic.get('chances', [])]
    if chances:
        print(f" 🎯 Chances : {', '.join(chances)}")
    outsiders = [h['cheval'] for h in pronostic.get('outsiders', [])]
    if outsiders:
        print(f" 🔍 Outsiders : {', '.join(outsiders)}")
    print(f"{'='*55}")


def main():
    """Run one scrape: fetch all pages, parse, persist and report.

    Steps: initialise the database, fetch every URL from ``get_urls``
    concurrently, parse the canalturf and boturfers pages, store predictions
    and an odds snapshot, print the odds evolution and a Quinté+ summary,
    then dump all fetched pages to a timestamped JSON file under
    ``$TURF_DIR`` (default ``/home/h3r7/turf_scraper``).
    """
    start = time.time()
    print(f"\n{'='*50}")
    print(f"🐾 TURF SCRAPER v5 - REALTIME SAVING")
    print(f"{'='*50}\n")
    init_db()
    urls = get_urls()
    counter["total"] = len(urls)
    print(f"📡 Fetching {len(urls)} pages...\n")
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(fetch_url, u): u for u in urls}
        for future in as_completed(futures):
            results.append(future.result())
    # Runtime covers the fetch phase only.
    elapsed = time.time() - start
    today = datetime.now().strftime('%Y-%m-%d')

    print(f"\n📊 Parsing predictions...")
    quinte_data, selections = _parse_pages(results)

    # Persist predictions.
    saved, total_today = save_parsed_data(quinte_data, selections, today)
    print(f"\n💾 {saved} nouvelles entrées insérées en BDD")

    # Snapshot of the odds into odds_history.
    odds_saved = save_odds_history(quinte_data, today)
    print(f"📊 {odds_saved} cotes sauvegardées dans odds_history")

    # Show how the odds moved during the day.
    print_odds_evolution(today)

    # Quinté+ summary.
    if quinte_data["partants"]:
        _print_quinte_summary(quinte_data)

    # Per-site count of successfully fetched pages.
    by_site = {}
    for r in results:
        s = r['site']
        by_site[s] = by_site.get(s, 0) + (1 if r['status'] == 'success' else 0)
    print(f"\n📊 STATS:")
    for site, count in by_site.items():
        print(f" {site}: {count} pages")

    # JSON dump of everything that was fetched.
    output = os.path.join(
        os.environ.get('TURF_DIR', '/home/h3r7/turf_scraper'),
        f"v5_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
    )
    with open(output, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': datetime.now().isoformat(),
            'runtime_sec': round(elapsed, 2),
            'total_pages': len(urls),
            'pages': results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*50}")
    print(f"✅ DONE! {len(results)} pages in {elapsed:.1f}s")
    print(f"💾 {total_today} prédictions en BDD pour aujourd'hui")
    print(f"📁 {output}")
    print(f"{'='*50}\n")


if __name__ == "__main__":
    main()