Files
turf_saas/horse_detail_enhanced.py
2026-04-25 17:18:43 +02:00

223 lines
6.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
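Enhanced Horse Detail Scraper (Canalturf horse profile pages)
Planned factors: Ferrure, Oeillères, Jockey stats, Distance aptitude
(detection of these factors is not implemented yet; placeholder or empty values are stored)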
"""
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import re
import sqlite3
# HTTP headers sent with the scrape request in scrape_horse_detail():
# a desktop-browser User-Agent and a French-first Accept-Language.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
}
# SQLite database file opened by init_horse_db() and save_horse().
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
DB_PATH = "/home/h3r7/turf_scraper/turf.db"
def init_horse_db():
    """Create the ``horses_details`` table in the SQLite file at ``DB_PATH``.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this
    function repeatedly is harmless. The connection is always closed, even
    when creating the table fails.

    Raises:
        sqlite3.Error: if the database cannot be opened or the DDL fails.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS horses_details (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT,
                horse_name TEXT,
                horse_id TEXT,
                age INTEGER,
                sex TEXT,
                father TEXT,
                mother TEXT,
                trainer TEXT,
                jockey TEXT,
                last_odds REAL,
                wins INTEGER,
                placed INTEGER,
                total_races INTEGER,
                earnings REAL,
                form_music TEXT,
                ferrure TEXT,
                oeilleres TEXT,
                recent_form TEXT,
                best_distance TEXT,
                best_terrain TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # Release the file handle even if the DDL raised.
        conn.close()
def save_horse(horse_data):
    """Insert one scraped horse record into the ``horses_details`` table.

    Args:
        horse_data: mapping with the keys listed in ``column_to_key`` below
            (``name``, ``id``, ``age``, ...). Missing keys are stored as NULL.

    The ``date`` column is filled with today's date (``YYYY-MM-DD``).

    Raises:
        sqlite3.Error: if the database cannot be opened or the insert fails.
    """
    # (table column, horse_data key) pairs. The placeholder list is derived
    # from this table so the number of "?" can never drift from the number
    # of columns/values (the hand-written VALUES clause had one "?" too many).
    column_to_key = (
        ('horse_name', 'name'),
        ('horse_id', 'id'),
        ('age', 'age'),
        ('sex', 'sex'),
        ('father', 'father'),
        ('mother', 'mother'),
        ('trainer', 'trainer'),
        ('jockey', 'jockey'),
        ('last_odds', 'odds'),
        ('wins', 'wins'),
        ('placed', 'placed'),
        ('total_races', 'total_races'),
        ('earnings', 'earnings'),
        ('form_music', 'form_music'),
        ('ferrure', 'ferrure'),
        ('oeilleres', 'oeilleres'),
        ('recent_form', 'recent_form'),
        ('best_distance', 'best_distance'),
        ('best_terrain', 'best_terrain'),
    )
    columns = ['date'] + [column for column, _ in column_to_key]
    values = [datetime.now().strftime('%Y-%m-%d')]
    values.extend(horse_data.get(key) for _, key in column_to_key)

    # Column names are constants from the table above (never user input);
    # all data goes through bound "?" parameters.
    placeholders = ', '.join('?' * len(columns))
    sql = (
        f"INSERT INTO horses_details ({', '.join(columns)}) "
        f"VALUES ({placeholders})"
    )

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(sql, values)
        conn.commit()
    finally:
        # Release the file handle even if the insert raised.
        conn.close()
# Equipment codes kept as reference for the ferrure / oeillères detection,
# which is not implemented yet (scrape_horse_detail stores 'Non détecté').
# TODO: use these tables once the page element carrying the codes is known.
_FERRURE_CODES = {
    'Da': 'Déferré Antérieur',
    'Dm': 'Déferré Membres',
    'Dp': 'Déferré Postérieur',
    'DD': 'Déferré des 4',
    '': 'Ferré'
}
_OEILLERES_CODES = {
    'O': 'Oeillères',
    'Oa': 'Oeillères australiennes',
    'E': 'Élastiques'
}


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


def _parse_horse_text(text):
    """Extract the labelled fields from the ' | '-separated page text.

    Returns a dict containing only the fields that were found.
    """
    fields = {}

    # Sex letter followed by the age; ages of 10+ need more than one digit.
    # H is accepted in addition to M/F.
    sex_age = re.search(r'Sexe/Age\s*:\s*([MFH])\s*(\d+)', text)
    if sex_age:
        fields['sex'] = sex_age.group(1)
        fields['age'] = int(sex_age.group(2))

    # Free-text values run up to the next ' | ' separator.
    for key, pattern in (
        ('father', r'Père : ([^|]+)'),
        ('mother', r'Mère : ([^|]+)'),
        ('trainer', r'Entraineur : ([^|]+)'),
        ('form_music', r'Perf\. : ([^|]+)'),
    ):
        value = _first_group(pattern, text)
        if value is not None:
            fields[key] = value.strip()

    for key, pattern in (
        ('wins', r'Victoire\(s\) : (\d+)'),
        ('placed', r'Placé\(s\) : (\d+)'),
        ('total_races', r'Course\(s\) : (\d+)'),
    ):
        value = _first_group(pattern, text)
        if value is not None:
            fields[key] = int(value)

    # Odds and earnings are anchored to their labels: an unanchored pattern
    # picks up the first number anywhere on the page. The optional "|" allows
    # the label and its value to sit in adjacent elements.
    # NOTE(review): exact page layout not verified — confirm against a live page.
    odds = _first_group(r'Cote\s*:?\s*(?:\|\s*)?(\d+(?:[.,]\d+)?)', text)
    if odds is not None:
        fields['odds'] = float(odds.replace(',', '.'))

    earnings = _first_group(r'Gains\s*:?\s*(?:\|\s*)?(\d[\d\s]*)', text)
    if earnings is not None:
        # Drop every kind of whitespace used as thousands separator
        # (regular and non-breaking spaces); the value stays a digit string.
        fields['earnings'] = ''.join(earnings.split())

    return fields


def scrape_horse_detail(url):
    """Scrape a horse profile page from Canalturf.

    Args:
        url: page URL; the horse id is read from its ``idcheval=<digits>``
            query parameter when present.

    Returns:
        On success, a dict with ``url``, ``id`` (or None), ``source``,
        ``ferrure``, ``oeilleres``, ``recent_form``, ``best_distance`` and
        ``best_terrain``, plus whichever of ``name``, ``sex``, ``age``,
        ``father``, ``mother``, ``trainer``, ``odds``, ``wins``, ``placed``,
        ``total_races``, ``earnings`` and ``form_music`` could be found.
        On any failure (network error, HTTP error status, parsing error),
        ``{'url': url, 'error': <message>}``. This function does not raise.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        # Do not parse 4xx/5xx error pages as if they were horse profiles.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        text = soup.get_text(separator=' | ', strip=True)

        data = {
            'url': url,
            'id': _first_group(r'idcheval=(\d+)', url),
            'source': 'canalturf'
        }

        # Name: the part of the <title> before the first '-'.
        title = soup.find('title')
        if title:
            data['name'] = title.text.split('-')[0].strip()

        data.update(_parse_horse_text(text))

        # Equipment detection is not implemented yet; store explicit markers.
        data['ferrure'] = 'Non détecté'
        data['oeilleres'] = 'Non détecté'

        # Recent form: text of the first 5 result links, empty ones skipped.
        link_texts = (
            link.get_text(strip=True)
            for link in soup.select('a[href*="/resultats-PMU/"]')[:5]
        )
        data['recent_form'] = ' | '.join(t for t in link_texts if t)

        # Best distance/terrain would need historical analysis.
        data['best_distance'] = 'À analyser'
        data['best_terrain'] = 'À analyser'
        return data
    except Exception as e:
        # Deliberate boundary: callers get an error dict instead of an
        # exception, so they must check for the 'error' key.
        return {'url': url, 'error': str(e)}
# Test
if __name__ == "__main__":
    # Manual smoke test: scrape one known horse page and store the result.
    init_horse_db()
    print("="*50)
    print("HORSE DETAIL SCRAPER - ENHANCED")
    print("="*50)
    # Test with PASSIONATA
    url = "https://www.canalturf.com/courses_fiche_cheval.php?idcheval=516052"
    horse = scrape_horse_detail(url)
    # scrape_horse_detail reports failures as {'url': ..., 'error': ...};
    # saving that dict would insert an all-NULL row and claim success.
    if 'error' in horse:
        raise SystemExit(f"Scrape failed for {url}: {horse['error']}")
    print(f"\nHorse: {horse.get('name')}")
    print(f" Age/Sex: {horse.get('sex')}{horse.get('age')}")
    print(f" Trainer: {horse.get('trainer')}")
    print(f" Odds: {horse.get('odds')}")
    print(f" Wins: {horse.get('wins')}, Placed: {horse.get('placed')}, Races: {horse.get('total_races')}")
    print(f" Form: {horse.get('form_music')}")
    print(f" Ferrure: {horse.get('ferrure')}")
    print(f" Oeillères: {horse.get('oeilleres')}")
    # Save to DB
    save_horse(horse)
    print("\n✅ Saved to database!")