173 lines
5.6 KiB
Python
Executable File
173 lines
5.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Horse Detail Scraper - Get individual horse data for RUNTIME V4
|
|
"""
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import json
|
|
from datetime import datetime
|
|
import re
|
|
|
|
# HTTP headers sent with every request made by this module: a desktop-browser
# User-Agent and a French-first Accept-Language.
# NOTE(review): presumably chosen so the site serves its regular French
# desktop pages to the scraper -- confirm.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
}
|
|
|
|
def get_horse_id_from_url(url):
    """Extract the horse ID from a Canalturf URL.

    Args:
        url: URL string expected to contain an ``idcheval=<digits>`` parameter.

    Returns:
        The digits following ``idcheval=`` as a string, or ``None`` when the
        URL is empty, is not a string, or carries no such parameter.
    """
    # Guard against None / non-string input, which would otherwise make
    # re.search raise TypeError instead of yielding "no ID".
    if not isinstance(url, str) or not url:
        return None

    match = re.search(r'idcheval=(\d+)', url)
    return match.group(1) if match else None
|
|
|
|
def _first_group(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or None."""
    match = re.search(pattern, text)
    return match.group(1).strip() if match else None


def scrape_canalturf_horse(horse_url):
    """Scrape horse details from a Canalturf horse page.

    The page is fetched, flattened to text with ``' | '`` between text nodes,
    and each labelled field is pulled out with a regular expression.

    Args:
        horse_url: URL of the horse page to fetch.

    Returns:
        A dict. On success it always contains ``url``, ``source``,
        ``scraped_at`` and ``status == 'success'``, plus whichever of these
        optional keys could be found on the page: ``horse_name``, ``father``,
        ``mother``, ``sex_age``, ``trainer``, ``cote`` (decimal string with a
        dot), ``recent_performances`` (at most 10 link texts), ``wins``,
        ``placed``, ``total_races`` (ints), ``earnings`` (digit string) and
        ``form_music``.
        On any failure (network error, HTTP error status, parsing error) the
        dict has ``status == 'error'`` and an ``error`` message instead; this
        function never raises.
    """
    try:
        r = requests.get(horse_url, headers=HEADERS, timeout=15)
        # Without this check a 404/500 error page would be parsed like a
        # horse page and reported with status 'success'.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        data = {
            'url': horse_url,
            'source': 'canalturf',
            'scraped_at': datetime.now().isoformat(),
            'status': 'success'
        }

        # Horse name: the part of the <title> before the first '-'.
        title = soup.find('title')
        if title:
            data['horse_name'] = title.text.split('-')[0].strip()

        # Flatten the page; the ' | ' separator lets the patterns below stop
        # a value at the end of its text node with [^|]+.
        text = soup.get_text(separator=' | ', strip=True)

        # Pedigree, sex/age and trainer: free-text values.
        for key, pattern in (
            ('father', r'Père : ([^|]+)'),
            ('mother', r'Mère : ([^|]+)'),
            ('sex_age', r'Sexe/Age : ([^|]+)'),
            ('trainer', r'Entraineur : ([^|]+)'),
        ):
            value = _first_group(pattern, text)
            if value is not None:
                data[key] = value

        # Odds: first decimal number after 'Cote', normalised to a dot.
        cote = _first_group(r'Cote.*?(\d+[\.,]\d+)', text)
        if cote is not None:
            data['cote'] = cote.replace(',', '.')

        # Recent performances: text of the links to result pages.
        performances = [
            perf_text
            for perf_text in (
                link.get_text(strip=True)
                for link in soup.select('a[href*="/resultats-PMU/"]')
            )
            if perf_text
        ]
        if performances:
            data['recent_performances'] = performances[:10]  # first 10 links found

        # Career counters: integer values.
        for key, pattern in (
            ('wins', r'Victoire\(s\) : (\d+)'),
            ('placed', r'Placé\(s\) : (\d+)'),
            ('total_races', r'Course\(s\) : (\d+)'),
        ):
            value = _first_group(pattern, text)
            if value is not None:
                data[key] = int(value)

        # Earnings: digits possibly grouped by whitespace. \s also matches
        # non-breaking spaces, so remove every whitespace character rather
        # than only the ASCII space.
        earnings = _first_group(r'Gains : (\d+[\d\s]*)', text)
        if earnings is not None:
            data['earnings'] = re.sub(r'\s+', '', earnings)

        # Form string ("musique").
        form_music = _first_group(r'Perf\. : ([^|]+)', text)
        if form_music is not None:
            data['form_music'] = form_music

        return data

    except Exception as e:
        # Deliberate best-effort boundary: callers inspect 'status' instead
        # of handling exceptions, so every failure is reported as a dict.
        return {
            'url': horse_url,
            'source': 'canalturf',
            'scraped_at': datetime.now().isoformat(),
            'status': 'error',
            'error': str(e)
        }
|
|
|
|
def get_race_horses_urls(race_url):
    """Get all horse page URLs linked from a race page.

    Args:
        race_url: URL of the race page to fetch.

    Returns:
        A list of absolute horse-page URLs (links whose href contains both
        ``fiche_cheval`` and ``idcheval``), de-duplicated and kept in the
        order they appear on the page. Returns an empty list, after printing
        the error, when the page cannot be fetched or parsed.
    """
    from urllib.parse import urljoin

    base_url = 'https://www.canalturf.com'

    try:
        r = requests.get(race_url, headers=HEADERS, timeout=15)
        # Do not harvest links from an HTTP error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        horse_urls = []
        for link in soup.select('a[href*="fiche_cheval"]'):
            href = link.get('href', '').strip()
            if 'idcheval' not in href:
                continue

            # Fix double domain issue
            href = href.replace(base_url + base_url, base_url)

            # urljoin leaves absolute URLs untouched and correctly resolves
            # '/path', 'path' (no leading slash) and '//host/path' forms,
            # which plain string concatenation mangled.
            horse_urls.append(urljoin(base_url + '/', href))

        # dict.fromkeys de-duplicates while preserving page order, so callers
        # that take "the first N" get the same horses on every run.
        return list(dict.fromkeys(horse_urls))
    except Exception as e:
        # Best-effort: report the problem and let the caller carry on with
        # an empty list.
        print(f"Error getting horse URLs: {e}")
        return []
|
|
|
|
def main(race_url="https://www.canalturf.com/courses_quinte.php",
         output_dir="/home/h3r7/turf_scraper",
         limit=5):
    """Scrape the horses of a race page and save the results as JSON.

    Args:
        race_url: Race page whose horse links are collected. Defaults to the
            Canalturf Quinté page.
        output_dir: Directory receiving the ``horses_<timestamp>.json`` file;
            it is created if missing.
        limit: Maximum number of horses to scrape (demo default: 5). ``None``
            scrapes every horse found.

    Returns:
        The list of per-horse result dicts produced by
        ``scrape_canalturf_horse``.
    """
    from pathlib import Path

    print("="*50)
    print("🐴 HORSE DETAIL SCRAPER - RUNTIME V4")
    print("="*50)

    print(f"\n📋 Getting horses from: {race_url}")
    horse_urls = get_race_horses_urls(race_url)
    print(f" Found {len(horse_urls)} horses")

    # Slice once so the loop and the progress counter agree on the total.
    selected = horse_urls[:limit]
    total = len(selected)

    results = []
    for i, url in enumerate(selected, start=1):
        print(f"\n[{i}/{total}] Scraping: {url[:60]}...")
        data = scrape_canalturf_horse(url)
        results.append(data)
        if data['status'] == 'success':
            print(f" ✅ {data.get('horse_name', 'Unknown')}")
        else:
            print(f" ❌ {data.get('error', 'Error')}")

    # Save. One timestamp is used for both the file name and the payload so
    # the two always match.
    now = datetime.now()
    out_dir = Path(output_dir)
    # Create the target directory so a first run on a fresh machine does not
    # fail with FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    output = out_dir / f"horses_{now.strftime('%Y%m%d_%H%M%S')}.json"
    with open(output, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': now.isoformat(),
            'race_url': race_url,
            'total_horses': len(horse_urls),
            'horses': results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*50}")
    print(f"✅ Saved to {output}")
    print(f"{'='*50}\n")

    return results
|
|
|
|
if __name__ == "__main__":
|
|
main()
|