223 lines
6.9 KiB
Python
Executable File
223 lines
6.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Enhanced Horse Detail Scraper - With ALL factors
|
|
Ferrure, Oeillères, Jockey stats, Distance aptitude
|
|
"""
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import json
|
|
from datetime import datetime
|
|
import re
|
|
import sqlite3
|
|
|
|
# HTTP headers sent with each page request: a desktop-browser User-Agent and a
# French-first Accept-Language.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
}

# Path of the SQLite database file opened by the storage functions.
DB_PATH = "/home/h3r7/turf_scraper/turf.db"
|
|
|
|
def init_horse_db(db_path=None):
    """Create the ``horses_details`` table if it does not already exist.

    Args:
        db_path: Path of the SQLite database file. When omitted or ``None``
            the module-level ``DB_PATH`` is used, so existing no-argument
            callers behave as before.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement
            fails.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS horses_details (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT,
                horse_name TEXT,
                horse_id TEXT,
                age INTEGER,
                sex TEXT,
                father TEXT,
                mother TEXT,
                trainer TEXT,
                jockey TEXT,
                last_odds REAL,
                wins INTEGER,
                placed INTEGER,
                total_races INTEGER,
                earnings REAL,
                form_music TEXT,
                ferrure TEXT,
                oeilleres TEXT,
                recent_form TEXT,
                best_distance TEXT,
                best_terrain TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # Close even when table creation fails so the handle is never leaked.
        conn.close()
|
|
|
|
def save_horse(horse_data, db_path=None):
    """Insert one scraped horse record into the ``horses_details`` table.

    Args:
        horse_data: Mapping with the scraped fields. Missing keys are stored
            as NULL because every value is read with ``.get``.
        db_path: Path of the SQLite database file. When omitted or ``None``
            the module-level ``DB_PATH`` is used.

    Raises:
        sqlite3.Error: If the database cannot be opened or the insert fails
            (for example when the table has not been created yet).
    """
    # (table column, key in horse_data). The ``date`` column is filled with
    # today's date and is therefore handled separately.
    field_map = (
        ('horse_name', 'name'),
        ('horse_id', 'id'),
        ('age', 'age'),
        ('sex', 'sex'),
        ('father', 'father'),
        ('mother', 'mother'),
        ('trainer', 'trainer'),
        ('jockey', 'jockey'),
        ('last_odds', 'odds'),
        ('wins', 'wins'),
        ('placed', 'placed'),
        ('total_races', 'total_races'),
        ('earnings', 'earnings'),
        ('form_music', 'form_music'),
        ('ferrure', 'ferrure'),
        ('oeilleres', 'oeilleres'),
        ('recent_form', 'recent_form'),
        ('best_distance', 'best_distance'),
        ('best_terrain', 'best_terrain'),
    )
    columns = ['date'] + [column for column, _ in field_map]
    values = [datetime.now().strftime('%Y-%m-%d')]
    values += [horse_data.get(key) for _, key in field_map]

    # Build the placeholders from the column list so the two can never get
    # out of step. Column names are fixed constants, so formatting them into
    # the statement is safe; all data still goes through bound parameters.
    placeholders = ', '.join(['?'] * len(columns))
    sql = (
        f"INSERT INTO horses_details ({', '.join(columns)}) "
        f"VALUES ({placeholders})"
    )

    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        conn.execute(sql, values)
        conn.commit()
    finally:
        # Close even when the insert fails so the handle is never leaked.
        conn.close()
|
|
|
|
def scrape_horse_detail(url):
    """Scrape the details of one horse from a Canalturf horse page.

    The page is flattened to text with `` | `` between text nodes, and each
    field is extracted with a regular expression anchored to its label.

    Args:
        url: Address of the horse page. The horse id is taken from its
            ``idcheval=<digits>`` query parameter when present.

    Returns:
        On success, a dict that always contains ``url``, ``id`` (``None``
        when the URL has no numeric ``idcheval``), ``source``, ``ferrure``,
        ``oeilleres``, ``recent_form``, ``best_distance`` and
        ``best_terrain``. The keys ``name``, ``sex``, ``age``, ``father``,
        ``mother``, ``trainer``, ``odds``, ``wins``, ``placed``,
        ``total_races``, ``earnings`` and ``form_music`` are present only
        when the corresponding label was found on the page.
        On any failure, ``{'url': url, 'error': <message>}``; this function
        does not raise.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        # Fail fast on HTTP errors instead of parsing an error page as if it
        # were a horse page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text(separator=' | ', strip=True)

        data = {
            'url': url,
            'id': _search_group(r'idcheval=(\d+)', url),
            'source': 'canalturf',
        }

        # Name: first segment of the <title>, before the first '-'.
        title = soup.find('title')
        if title:
            data['name'] = title.text.split('-')[0].strip()

        # Sex/age, e.g. "F5". \d+ keeps ages of 10 and above intact.
        # NOTE(review): 'H' (hongre, gelding) is assumed to be used by the
        # site alongside M/F - confirm against a gelding's page.
        sex_age = re.search(r'Sexe/Age : ([MFH])(\d+)', text)
        if sex_age:
            data['sex'] = sex_age.group(1)
            data['age'] = int(sex_age.group(2))

        # Free-text fields: the value runs up to the next text-node separator.
        text_fields = (
            ('father', r'Père : ([^|]+)'),
            ('mother', r'Mère : ([^|]+)'),
            ('trainer', r'Entraineur : ([^|]+)'),
            ('form_music', r'Perf\. : ([^|]+)'),
        )
        for key, pattern in text_fields:
            value = _search_group(pattern, text)
            if value is not None:
                data[key] = value

        # Integer career statistics.
        count_fields = (
            ('wins', r'Victoire\(s\) : (\d+)'),
            ('placed', r'Placé\(s\) : (\d+)'),
            ('total_races', r'Course\(s\) : (\d+)'),
        )
        for key, pattern in count_fields:
            value = _search_group(pattern, text)
            if value is not None:
                data[key] = int(value)

        # Odds: first decimal number following the "Cote" label, optionally
        # in the next text node. Anchoring to the label avoids picking up an
        # unrelated number that happens to appear earlier on the page.
        odds = _search_group(r'Cote[^\d|]*(?:\|\s*)?(\d+[.,]\d+)', text)
        if odds is not None:
            data['odds'] = float(odds.replace(',', '.'))

        # Earnings: digits (possibly grouped by any kind of whitespace)
        # following the "Gains" label; kept as a digits-only string.
        earnings = _search_group(r'Gains[^\d|]*(?:\|\s*)?(\d[\d\s]*)', text)
        if earnings is not None:
            data['earnings'] = re.sub(r'\s+', '', earnings)

        # Shoeing (ferrure) and blinkers (oeillères) are not detected yet.
        # Codes intended for a future implementation:
        #   ferrure:   Da = Déferré Antérieur, Dm = Déferré Membres,
        #              Dp = Déferré Postérieur, DD = Déferré des 4,
        #              (none) = Ferré
        #   oeillères: O = Oeillères, Oa = Oeillères australiennes,
        #              E = Élastiques
        data['ferrure'] = 'Non détecté'
        data['oeilleres'] = 'Non détecté'

        # Recent form: text of the first five result links, empty ones skipped.
        recent = [
            label
            for label in (
                link.get_text(strip=True)
                for link in soup.select('a[href*="/resultats-PMU/"]')[:5]
            )
            if label
        ]
        data['recent_form'] = ' | '.join(recent) if recent else ''

        # Distance/terrain aptitude would need historical analysis;
        # placeholders for now.
        data['best_distance'] = 'À analyser'
        data['best_terrain'] = 'À analyser'

        return data

    except Exception as e:
        # Deliberate best-effort boundary: callers get an error dict rather
        # than an exception for network, HTTP or parsing failures.
        return {'url': url, 'error': str(e)}


def _search_group(pattern, text):
    """Return the stripped first capture group of pattern in text, or None."""
    found = re.search(pattern, text)
    return found.group(1).strip() if found else None
|
|
|
|
# Manual smoke test: scrape one known horse page, print it and store it.
if __name__ == "__main__":
    init_horse_db()

    print("="*50)
    print("HORSE DETAIL SCRAPER - ENHANCED")
    print("="*50)

    # Test with PASSIONATA
    url = "https://www.canalturf.com/courses_fiche_cheval.php?idcheval=516052"
    horse = scrape_horse_detail(url)

    if 'error' in horse:
        # scrape_horse_detail reports failures as an 'error' entry. Saving
        # that dict would insert a row made only of NULLs, so stop instead.
        raise SystemExit(f"❌ Scrape failed for {url}: {horse['error']}")

    print(f"\nHorse: {horse.get('name')}")
    print(f" Age/Sex: {horse.get('sex')}{horse.get('age')}")
    print(f" Trainer: {horse.get('trainer')}")
    print(f" Odds: {horse.get('odds')}")
    print(f" Wins: {horse.get('wins')}, Placed: {horse.get('placed')}, Races: {horse.get('total_races')}")
    print(f" Form: {horse.get('form_music')}")
    print(f" Ferrure: {horse.get('ferrure')}")
    print(f" Oeillères: {horse.get('oeilleres')}")

    # Save to DB
    save_horse(horse)
    print("\n✅ Saved to database!")
|