Files
turf_saas/horse_detail_scraper.py
2026-04-25 17:18:43 +02:00

173 lines
5.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Horse Detail Scraper - Get individual horse data for RUNTIME V4
"""
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import re
# HTTP headers passed to every requests.get() call in this module:
# a desktop Chrome User-Agent string and a French-first Accept-Language.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
}
def get_horse_id_from_url(url):
    """Return the digits following ``idcheval=`` in *url*, or None.

    The ID is returned as a string, exactly as it appears in the URL.
    Only the first occurrence is considered.
    """
    found = re.search(r'idcheval=(\d+)', url)
    return found.group(1) if found else None
def scrape_canalturf_horse(horse_url):
    """Scrape one Canalturf horse page into a flat dict.

    The page is fetched with the module-level HEADERS, flattened to text
    with ' | ' between text nodes, and labelled values are pulled out of
    that text with regular expressions.

    Args:
        horse_url: URL of the horse page to fetch.

    Returns:
        On success, a dict with 'url', 'source' ('canalturf'),
        'scraped_at' (``datetime.now().isoformat()``) and
        'status' ('success'), plus whichever of these were found:
        'horse_name', 'father', 'mother', 'sex_age', 'trainer',
        'cote' (string, '.' as decimal separator),
        'recent_performances' (list of at most 10 strings),
        'wins', 'placed', 'total_races' (ints),
        'earnings' (string of digits) and 'form_music'.

        On any exception (network failure, HTTP error status, parsing
        error), a dict with 'url', 'source', 'status' ('error') and
        'error' (the exception message). This function never raises.
    """
    try:
        r = requests.get(horse_url, headers=HEADERS, timeout=15)
        # Treat 4xx/5xx as failures; otherwise an error page would be
        # parsed and reported with status 'success'.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        data = {
            'url': horse_url,
            'source': 'canalturf',
            'scraped_at': datetime.now().isoformat(),
            'status': 'success',
        }

        # Horse name: the part of <title> before the first '-'.
        title = soup.find('title')
        if title:
            data['horse_name'] = title.text.split('-')[0].strip()

        # Flatten the page; '|' marks text-node boundaries so the
        # "[^|]+" patterns below stop at the end of the current node.
        text = soup.get_text(separator=' | ', strip=True)

        def first_group(pattern):
            """Return group 1 of the first match of *pattern* in text, or None."""
            found = re.search(pattern, text)
            return found.group(1) if found else None

        # Pedigree, sex/age and trainer: "<label> : <value>" text fields.
        labelled_fields = (
            ('father', r'Père : ([^|]+)'),
            ('mother', r'Mère : ([^|]+)'),
            ('sex_age', r'Sexe/Age : ([^|]+)'),
            ('trainer', r'Entraineur : ([^|]+)'),
        )
        for key, pattern in labelled_fields:
            value = first_group(pattern)
            if value is not None:
                data[key] = value.strip()

        # Odds: first decimal number after "Cote", normalised to '.'.
        cote = first_group(r'Cote.*?(\d+[\.,]\d+)')
        if cote is not None:
            data['cote'] = cote.replace(',', '.')

        # Recent performances: non-empty link texts of result-page links,
        # keeping at most the first 10 in document order.
        link_texts = (
            link.get_text(strip=True)
            for link in soup.select('a[href*="/resultats-PMU/"]')
        )
        performances = [label for label in link_texts if label]
        if performances:
            data['recent_performances'] = performances[:10]

        # Career statistics, stored as ints.
        stat_fields = (
            ('wins', r'Victoire\(s\) : (\d+)'),
            ('placed', r'Placé\(s\) : (\d+)'),
            ('total_races', r'Course\(s\) : (\d+)'),
        )
        for key, pattern in stat_fields:
            value = first_group(pattern)
            if value is not None:
                data[key] = int(value)

        # Earnings: the pattern's \s also matches non-breaking spaces used
        # as thousands separators, so strip every whitespace character,
        # not just ' '.
        earnings = first_group(r'Gains : (\d+[\d\s]*)')
        if earnings is not None:
            data['earnings'] = re.sub(r'\s', '', earnings)

        # Form string ("musique").
        form = first_group(r'Perf\. : ([^|]+)')
        if form is not None:
            data['form_music'] = form.strip()

        return data
    except Exception as e:
        # Best-effort boundary: callers inspect 'status' instead of
        # handling exceptions.
        return {
            'url': horse_url,
            'source': 'canalturf',
            'status': 'error',
            'error': str(e),
        }
def get_race_horses_urls(race_url):
    """Collect the horse-page URLs linked from a race page.

    Fetches *race_url* with the module-level HEADERS and gathers every
    ``<a>`` whose href contains both "fiche_cheval" and "idcheval".

    Args:
        race_url: URL of the race page to fetch.

    Returns:
        A list of absolute URLs without duplicates, in the order they
        first appear on the page. An empty list is returned (and the
        error printed) if the request or the parsing fails.
    """
    # Local import so this function does not depend on a file-level change.
    from urllib.parse import urljoin

    doubled_domain = 'https://www.canalturf.comhttps://www.canalturf.com'
    try:
        r = requests.get(race_url, headers=HEADERS, timeout=15)
        # Do not harvest links from a 4xx/5xx error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        horse_urls = []
        for link in soup.select('a[href*="fiche_cheval"]'):
            href = link.get('href', '')
            if 'idcheval' not in href:
                continue
            # Fix double domain issue
            href = href.replace(doubled_domain, 'https://www.canalturf.com')
            # urljoin leaves absolute URLs as they are and resolves
            # relative ones (with or without a leading '/') against the
            # race page, instead of blindly prefixing the domain.
            horse_urls.append(urljoin(race_url, href))
        # dict.fromkeys removes duplicates while keeping first-seen
        # order, so callers slicing the result get a stable selection.
        return list(dict.fromkeys(horse_urls))
    except Exception as e:
        print(f"Error getting horse URLs: {e}")
        return []
def main():
    """Demo run: scrape up to five horses from the Quinté page and save them.

    Fetches the horse URLs from the Quinté page, scrapes the first five,
    prints progress, and writes a timestamped JSON file under
    /home/h3r7/turf_scraper/ (created if it does not exist).

    Returns:
        The list of per-horse dicts returned by scrape_canalturf_horse.
    """
    # Local import: only this entry point needs it.
    import os

    print("="*50)
    print("🐴 HORSE DETAIL SCRAPER - RUNTIME V4")
    print("="*50)

    # Example: Get horses from Quinté page
    quinte_url = "https://www.canalturf.com/courses_quinte.php"
    print(f"\n📋 Getting horses from: {quinte_url}")
    horse_urls = get_race_horses_urls(quinte_url)
    print(f" Found {len(horse_urls)} horses")

    # Scrape first 5 as demo
    demo_urls = horse_urls[:5]
    results = []
    for position, url in enumerate(demo_urls, start=1):
        print(f"\n[{position}/{len(demo_urls)}] Scraping: {url[:60]}...")
        data = scrape_canalturf_horse(url)
        results.append(data)
        if data['status'] == 'success':
            print(f"{data.get('horse_name', 'Unknown')}")
        else:
            print(f"{data.get('error', 'Error')}")

    # Save. A single timestamp is used so the file name and the JSON
    # 'timestamp' field always agree.
    now = datetime.now()
    output = f"/home/h3r7/turf_scraper/horses_{now.strftime('%Y%m%d_%H%M%S')}.json"
    # Create the target directory first; otherwise open() raises
    # FileNotFoundError after all scraping is done and the results are lost.
    os.makedirs(os.path.dirname(output), exist_ok=True)
    with open(output, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': now.isoformat(),
            'race_url': quinte_url,
            'total_horses': len(horse_urls),
            'horses': results,
        }, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*50}")
    print(f"✅ Saved to {output}")
    print(f"{'='*50}\n")
    return results
# Run the demo scrape only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()