#!/usr/bin/env python3
"""
Enhanced Horse Detail Scraper - With ALL factors
Ferrure, Oeillères, Jockey stats, Distance aptitude

Fetches a Canalturf horse page, extracts the labelled fields from the page
text with regular expressions, and stores one row per scrape in the
``horses_details`` table of a SQLite database.
"""

import json
import re
import sqlite3
from contextlib import closing
from datetime import datetime
from typing import Optional

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
}

DB_PATH = "/home/h3r7/turf_scraper/turf.db"

# Shoeing (ferrure) codes and their meaning. Detection is not implemented
# yet: the scraper always reports 'Non détecté'. Kept as reference data.
FERRURE_INDICATORS = {
    'Da': 'Déferré Antérieur',
    'Dm': 'Déferré Membres',
    'Dp': 'Déferré Postérieur',
    'DD': 'Déferré des 4',
    '': 'Ferré',
}

# Blinkers (oeillères) codes and their meaning. Same status as above:
# reference data only, not used for detection yet.
OEILLERE_INDICATORS = {
    'O': 'Oeillères',
    'Oa': 'Oeillères australiennes',
    'E': 'Élastiques',
}

# (table column, key in the scraped dict) for every column filled from the
# scrape result. The INSERT statement and its placeholders are both derived
# from this table so that their counts can never drift apart.
_HORSE_FIELDS = (
    ('horse_name', 'name'),
    ('horse_id', 'id'),
    ('age', 'age'),
    ('sex', 'sex'),
    ('father', 'father'),
    ('mother', 'mother'),
    ('trainer', 'trainer'),
    ('jockey', 'jockey'),
    ('last_odds', 'odds'),
    ('wins', 'wins'),
    ('placed', 'placed'),
    ('total_races', 'total_races'),
    ('earnings', 'earnings'),
    ('form_music', 'form_music'),
    ('ferrure', 'ferrure'),
    ('oeilleres', 'oeilleres'),
    ('recent_form', 'recent_form'),
    ('best_distance', 'best_distance'),
    ('best_terrain', 'best_terrain'),
)

# Free-text fields written on the page as "Label : value", terminated by the
# ' | ' separator inserted between text nodes by get_text().
_TEXT_FIELDS = (
    ('father', r'Père : ([^|]+)'),
    ('mother', r'Mère : ([^|]+)'),
    ('trainer', r'Entraineur : ([^|]+)'),
    ('form_music', r'Perf\. : ([^|]+)'),
)

# Integer counters written on the page as "Label(s) : N".
_INT_FIELDS = (
    ('wins', r'Victoire\(s\) : (\d+)'),
    ('placed', r'Placé\(s\) : (\d+)'),
    ('total_races', r'Course\(s\) : (\d+)'),
)

# Odds and earnings are anchored to their label so that an unrelated number
# elsewhere on the page is not picked up. The value may sit in the same text
# node as the label or in the next one (after a ' | ' separator).
# NOTE(review): exact page layout around these labels is assumed — confirm
# against a live page.
_ODDS_RE = r'Cote[^\d|]*(?:\|\s*)?(\d+(?:[.,]\d+)?)'
_EARNINGS_RE = r'Gains[^\d|]*(?:\|\s*)?(\d[\d\s]*)'


def init_horse_db(db_path: Optional[str] = None) -> None:
    """Create the ``horses_details`` table if it does not exist.

    Args:
        db_path: SQLite file to use. Defaults to the module-level ``DB_PATH``.
    """
    # closing() guarantees the connection is released even when the
    # statement fails; the inner ``with conn`` commits or rolls back.
    with closing(sqlite3.connect(db_path or DB_PATH)) as conn:
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS horses_details (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    date TEXT,
                    horse_name TEXT,
                    horse_id TEXT,
                    age INTEGER,
                    sex TEXT,
                    father TEXT,
                    mother TEXT,
                    trainer TEXT,
                    jockey TEXT,
                    last_odds REAL,
                    wins INTEGER,
                    placed INTEGER,
                    total_races INTEGER,
                    earnings REAL,
                    form_music TEXT,
                    ferrure TEXT,
                    oeilleres TEXT,
                    recent_form TEXT,
                    best_distance TEXT,
                    best_terrain TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')


def save_horse(horse_data: dict, db_path: Optional[str] = None) -> None:
    """Insert one scraped horse into ``horses_details``.

    Keys missing from ``horse_data`` are stored as NULL. The ``date`` column
    is set to today's date (local time, ``YYYY-MM-DD``).

    Args:
        horse_data: Dict as returned by ``scrape_horse_detail``.
        db_path: SQLite file to use. Defaults to the module-level ``DB_PATH``.

    Raises:
        sqlite3.Error: If the insert fails (e.g. the table does not exist).
    """
    columns = ['date'] + [column for column, _ in _HORSE_FIELDS]
    values = [datetime.now().strftime('%Y-%m-%d')]
    values.extend(horse_data.get(key) for _, key in _HORSE_FIELDS)

    # Column names come from the constant table above, never from input, so
    # interpolating them is safe; the values themselves stay parameterized.
    placeholders = ', '.join('?' for _ in columns)
    sql = (
        f"INSERT INTO horses_details ({', '.join(columns)}) "
        f"VALUES ({placeholders})"
    )

    with closing(sqlite3.connect(db_path or DB_PATH)) as conn:
        with conn:
            conn.execute(sql, values)


def _first_group(pattern: str, text: str) -> Optional[str]:
    """Return the stripped first capture group of *pattern* in *text*, or None."""
    match = re.search(pattern, text)
    return match.group(1).strip() if match else None


def _parse_horse_page(html: str, url: str) -> dict:
    """Extract horse details from the HTML of a Canalturf horse page."""
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text(separator=' | ', strip=True)

    data = {
        'url': url,
        # None when the URL carries no numeric idcheval parameter.
        'id': _first_group(r'idcheval=(\d+)', url),
        'source': 'canalturf',
    }

    # Name: the part of the <title> before the first dash.
    title = soup.find('title')
    if title:
        data['name'] = title.text.split('-')[0].strip()

    # Age/Sex: one letter followed by the age. Multi-digit ages are kept
    # whole, and 'H' (hongre / gelding) is accepted alongside M and F.
    sex_age = re.search(r'Sexe/Age : ([MFH])(\d+)', text)
    if sex_age:
        data['sex'] = sex_age.group(1)
        data['age'] = int(sex_age.group(2))

    for key, pattern in _TEXT_FIELDS:
        value = _first_group(pattern, text)
        if value is not None:
            data[key] = value

    for key, pattern in _INT_FIELDS:
        value = _first_group(pattern, text)
        if value is not None:
            data[key] = int(value)

    odds = _first_group(_ODDS_RE, text)
    if odds is not None:
        data['odds'] = float(odds.replace(',', '.'))

    earnings = _first_group(_EARNINGS_RE, text)
    if earnings is not None:
        # Drop thousands separators (regular or non-breaking spaces).
        data['earnings'] = float(re.sub(r'\s+', '', earnings))

    # Shoeing / blinkers detection is not implemented for this source.
    data['ferrure'] = 'Non détecté'
    data['oeilleres'] = 'Non détecté'

    # Recent form: text of the first five result links on the page.
    result_links = soup.select('a[href*="/resultats-PMU/"]')[:5]
    labels = (link.get_text(strip=True) for link in result_links)
    data['recent_form'] = ' | '.join(label for label in labels if label)

    # Best distance / terrain would need historical analysis.
    data['best_distance'] = 'À analyser'
    data['best_terrain'] = 'À analyser'

    return data


def scrape_horse_detail(url: str) -> dict:
    """Scrape full horse details from Canalturf.

    Args:
        url: Horse page URL; the horse id is read from its ``idcheval``
            query parameter when present.

    Returns:
        On success, a dict with ``url``, ``id``, ``source``, ``ferrure``,
        ``oeilleres``, ``recent_form``, ``best_distance``, ``best_terrain``
        and whichever of ``name``, ``sex``, ``age``, ``father``, ``mother``,
        ``trainer``, ``odds`` (float), ``wins``, ``placed``, ``total_races``
        (ints), ``earnings`` (float) and ``form_music`` were found.
        On any failure, ``{'url': url, 'error': <message>}``. This function
        never raises.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        # Do not parse an HTTP error page as if it were a horse page.
        response.raise_for_status()
        return _parse_horse_page(response.text, url)
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on getting an error
        # dict back instead of an exception.
        return {'url': url, 'error': str(e)}


def main() -> None:
    """Scrape one sample horse, print a summary and save it to the database."""
    init_horse_db()

    print("=" * 50)
    print("HORSE DETAIL SCRAPER - ENHANCED")
    print("=" * 50)

    # Test with PASSIONATA
    url = "https://www.canalturf.com/courses_fiche_cheval.php?idcheval=516052"
    horse = scrape_horse_detail(url)

    if 'error' in horse:
        # Do not store an all-NULL row for a failed scrape.
        print(f"\n❌ Scrape failed: {horse['error']}")
        return

    print(f"\nHorse: {horse.get('name')}")
    print(f"  Age/Sex: {horse.get('sex')}{horse.get('age')}")
    print(f"  Trainer: {horse.get('trainer')}")
    print(f"  Odds: {horse.get('odds')}")
    print(f"  Wins: {horse.get('wins')}, Placed: {horse.get('placed')}, Races: {horse.get('total_races')}")
    print(f"  Form: {horse.get('form_music')}")
    print(f"  Ferrure: {horse.get('ferrure')}")
    print(f"  Oeillères: {horse.get('oeilleres')}")

    # Save to DB
    save_horse(horse)
    print("\n✅ Saved to database!")


# Test
if __name__ == "__main__":
    main()