#!/usr/bin/env python3
"""
Horse Detail Scraper - Get individual horse data for RUNTIME V4
"""

import json
import re
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
}

# Site root used to absolutise relative horse links.
_BASE_URL = 'https://www.canalturf.com'
# Seconds to wait for each HTTP request.
_REQUEST_TIMEOUT = 15
# Default directory for the JSON dump written by main().
_DEFAULT_OUTPUT_DIR = '/home/h3r7/turf_scraper'
# Maximum number of recent performances kept per horse.
_MAX_PERFORMANCES = 10

# (output key, pattern) pairs whose first capture group is stored as a
# stripped string. Patterns run against the page text flattened with
# ' | ' between text nodes, hence the "[^|]+" captures.
_PROFILE_PATTERNS = (
    ('father', r'Père : ([^|]+)'),
    ('mother', r'Mère : ([^|]+)'),
    ('sex_age', r'Sexe/Age : ([^|]+)'),
    ('trainer', r'Entraineur : ([^|]+)'),
)

# (output key, pattern) pairs whose first capture group is stored as an int.
_COUNTER_PATTERNS = (
    ('wins', r'Victoire\(s\) : (\d+)'),
    ('placed', r'Placé\(s\) : (\d+)'),
    ('total_races', r'Course\(s\) : (\d+)'),
)


def get_horse_id_from_url(url):
    """Extract horse ID from Canalturf URL.

    Returns the digits following ``idcheval=`` as a string, or ``None``
    when the URL carries no such parameter.
    """
    match = re.search(r'idcheval=(\d+)', url)
    return match.group(1) if match else None


def _first_group(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or None."""
    match = re.search(pattern, text)
    return match.group(1).strip() if match else None


def _parse_horse_soup(soup):
    """Extract the horse fields from a parsed horse page.

    Only fields actually found are present in the returned dict. Keys are
    inserted in a fixed order so the JSON output stays stable.
    """
    fields = {}

    title = soup.find('title')
    if title:
        # Keep only the part of the page title before the first '-'.
        fields['horse_name'] = title.text.split('-')[0].strip()

    # Flatten the page to one string; ' | ' marks text-node boundaries.
    text = soup.get_text(separator=' | ', strip=True)

    # PEDIGREE, AGE & SEX, TRAINER
    for key, pattern in _PROFILE_PATTERNS:
        value = _first_group(pattern, text)
        if value is not None:
            fields[key] = value

    # ODDS: first decimal number after "Cote", normalised to a '.' separator.
    cote = _first_group(r'Cote.*?(\d+[\.,]\d+)', text)
    if cote is not None:
        fields['cote'] = cote.replace(',', '.')

    # RECENT PERFORMANCES: non-empty link texts pointing at result pages.
    performances = [
        label
        for label in (
            link.get_text(strip=True)
            for link in soup.select('a[href*="/resultats-PMU/"]')
        )
        if label
    ]
    if performances:
        fields['recent_performances'] = performances[:_MAX_PERFORMANCES]

    # STATS
    for key, pattern in _COUNTER_PATTERNS:
        value = _first_group(pattern, text)
        if value is not None:
            fields[key] = int(value)

    # EARNINGS: the capture may contain any whitespace matched by \s
    # (including non-breaking spaces used as thousands separators), so
    # remove all of it rather than only ASCII spaces.
    earnings = _first_group(r'Gains : (\d+[\d\s]*)', text)
    if earnings is not None:
        fields['earnings'] = re.sub(r'\s+', '', earnings)

    # MUSIC (form)
    form = _first_group(r'Perf\. : ([^|]+)', text)
    if form is not None:
        fields['form_music'] = form

    return fields


def scrape_canalturf_horse(horse_url):
    """Scrape horse details from Canalturf.

    Args:
        horse_url: URL of the horse page to fetch.

    Returns:
        A dict that always contains ``url``, ``source``, ``scraped_at`` and
        ``status``. On success ``status`` is ``'success'`` and the parsed
        fields are added; on any failure ``status`` is ``'error'`` and
        ``error`` holds the exception message. This function never raises.
    """
    try:
        r = requests.get(horse_url, headers=HEADERS, timeout=_REQUEST_TIMEOUT)
        # Do not parse HTTP error pages as if they were horse pages.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        data = {
            'url': horse_url,
            'source': 'canalturf',
            'scraped_at': datetime.now().isoformat(),
            'status': 'success',
        }
        data.update(_parse_horse_soup(soup))
        return data
    except Exception as e:
        # Deliberately broad: callers rely on getting an error record back
        # instead of an exception, whatever went wrong.
        return {
            'url': horse_url,
            'source': 'canalturf',
            'scraped_at': datetime.now().isoformat(),
            'status': 'error',
            'error': str(e),
        }


def _absolute_horse_url(href):
    """Turn a horse link href into an absolute Canalturf URL."""
    # Fix double domain issue
    href = href.replace(_BASE_URL + _BASE_URL, _BASE_URL)
    # urljoin leaves absolute URLs alone and resolves relative ones, with
    # or without a leading slash, against the site root.
    return urljoin(_BASE_URL + '/', href)


def get_race_horses_urls(race_url):
    """Get all horse URLs from a race page.

    Args:
        race_url: URL of the race page to fetch.

    Returns:
        A list of unique absolute horse URLs in the order they first appear
        on the page. Returns an empty list (after printing the error) if
        the page cannot be fetched or parsed.
    """
    try:
        r = requests.get(race_url, headers=HEADERS, timeout=_REQUEST_TIMEOUT)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        horse_urls = [
            _absolute_horse_url(href)
            for href in (
                link.get('href', '')
                for link in soup.select('a[href*="fiche_cheval"]')
            )
            if 'idcheval' in href
        ]
        # De-duplicate while keeping page order so results are reproducible.
        return list(dict.fromkeys(horse_urls))
    except Exception as e:
        print(f"Error getting horse URLs: {e}")
        return []


def main(
    quinte_url="https://www.canalturf.com/courses_quinte.php",
    output_dir=_DEFAULT_OUTPUT_DIR,
    limit=5,
):
    """Scrape the first horses of a race page and dump them to a JSON file.

    Args:
        quinte_url: Race page to collect horse links from.
        output_dir: Directory for the ``horses_<timestamp>.json`` file; it
            is created if it does not exist.
        limit: Maximum number of horses to scrape (``None`` for all).

    Returns:
        The list of per-horse result dicts that was written to disk.
    """
    print("="*50)
    print("🐴 HORSE DETAIL SCRAPER - RUNTIME V4")
    print("="*50)

    print(f"\n📋 Getting horses from: {quinte_url}")
    horse_urls = get_race_horses_urls(quinte_url)
    print(f" Found {len(horse_urls)} horses")

    # Scrape only the first `limit` horses as a demo.
    selected = horse_urls[:limit]
    results = []
    for position, url in enumerate(selected, start=1):
        print(f"\n[{position}/{len(selected)}] Scraping: {url[:60]}...")
        data = scrape_canalturf_horse(url)
        results.append(data)

        if data['status'] == 'success':
            print(f" ✅ {data.get('horse_name', 'Unknown')}")
        else:
            print(f" ❌ {data.get('error', 'Error')}")

    # Save: use a single timestamp so the file name and payload agree.
    now = datetime.now()
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    output = target_dir / f"horses_{now.strftime('%Y%m%d_%H%M%S')}.json"
    with output.open('w', encoding='utf-8') as f:
        json.dump({
            'timestamp': now.isoformat(),
            'race_url': quinte_url,
            'total_horses': len(horse_urls),
            'horses': results,
        }, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*50}")
    print(f"✅ Saved to {output}")
    print(f"{'='*50}\n")

    return results


if __name__ == "__main__":
    main()