# turf_saas/horse_detail_scraper.py

#!/usr/bin/env python3
"""
Horse Detail Scraper - Get individual horse data for RUNTIME V4
"""
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import re

# Request headers for the scraper's HTTP calls: a desktop Chrome User-Agent
# and a French-first Accept-Language.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
    'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
}

def get_horse_id_from_url(url):
    """Return the digits following ``idcheval=`` in *url*, or None if absent.

    The ID is returned as a string, exactly as it appears in the URL. When
    the parameter occurs more than once, the first occurrence wins.
    """
    found = re.search(r'idcheval=(\d+)', url)
    return found.group(1) if found else None

def _parse_canalturf_horse(soup):
    """Extract horse fields from a parsed Canalturf horse page.

    Args:
        soup: BeautifulSoup document of the horse page.

    Returns:
        A dict holding only the fields that were found on the page, inserted
        in this order: horse_name, father, mother, sex_age, trainer, cote,
        recent_performances, wins, placed, total_races, earnings, form_music.
    """
    fields = {}

    # The horse name is taken as the part of <title> before the first '-'.
    title = soup.find('title')
    if title:
        fields['horse_name'] = title.text.split('-')[0].strip()

    # Flatten the page into "chunk | chunk | ..." so that a value can be
    # captured as "everything up to the next separator" with [^|]+.
    text = soup.get_text(separator=' | ', strip=True)

    def first_group(pattern):
        """Return group 1 of the first match of *pattern* in the page text."""
        match = re.search(pattern, text)
        return match.group(1) if match else None

    # Pedigree, sex/age and trainer: plain "Label : value" text fields.
    for key, pattern in (
        ('father', r'Père : ([^|]+)'),
        ('mother', r'Mère : ([^|]+)'),
        ('sex_age', r'Sexe/Age : ([^|]+)'),
        ('trainer', r'Entraineur : ([^|]+)'),
    ):
        value = first_group(pattern)
        if value is not None:
            fields[key] = value.strip()

    # Odds: first decimal number after "Cote", normalised to a '.' separator.
    cote = first_group(r'Cote.*?(\d+[\.,]\d+)')
    if cote is not None:
        fields['cote'] = cote.replace(',', '.')

    # Performances: non-empty labels of links pointing at result pages,
    # capped at the first 10 found in document order.
    performances = [
        label
        for label in (
            link.get_text(strip=True)
            for link in soup.select('a[href*="/resultats-PMU/"]')
        )
        if label
    ]
    if performances:
        fields['recent_performances'] = performances[:10]

    # Career statistics, stored as integers.
    for key, pattern in (
        ('wins', r'Victoire\(s\) : (\d+)'),
        ('placed', r'Placé\(s\) : (\d+)'),
        ('total_races', r'Course\(s\) : (\d+)'),
    ):
        value = first_group(pattern)
        if value is not None:
            fields[key] = int(value)

    # Earnings: the pattern's \s also matches non-breaking spaces used as
    # thousands separators, so strip *all* whitespace, not just ' '.
    earnings = first_group(r'Gains : (\d+[\d\s]*)')
    if earnings is not None:
        fields['earnings'] = re.sub(r'\s+', '', earnings)

    # Form string ("musique").
    form = first_group(r'Perf\. : ([^|]+)')
    if form is not None:
        fields['form_music'] = form.strip()

    return fields


def scrape_canalturf_horse(horse_url):
    """Scrape horse details from a Canalturf horse page.

    Args:
        horse_url: URL of the horse page to fetch.

    Returns:
        On success, a dict with 'url', 'source', 'scraped_at' and
        'status': 'success', plus whichever horse fields were found on the
        page. On any failure (network error, HTTP error status, parsing
        error) a dict with 'url', 'source', 'status': 'error' and 'error'
        (the exception message). This function never raises.
    """
    try:
        r = requests.get(horse_url, headers=HEADERS, timeout=15)
        # An HTTP error page must not be parsed and reported as a success.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        data = {
            'url': horse_url,
            'source': 'canalturf',
            'scraped_at': datetime.now().isoformat(),
            'status': 'success'
        }
        data.update(_parse_canalturf_horse(soup))
        return data

    except Exception as e:
        # Deliberate catch-all: callers rely on getting an error dict back
        # instead of an exception.
        return {
            'url': horse_url,
            'source': 'canalturf',
            'status': 'error',
            'error': str(e)
        }

def get_race_horses_urls(race_url):
    """Collect the horse-page URLs linked from a race page.

    Args:
        race_url: URL of the race page to fetch.

    Returns:
        A list of unique absolute horse URLs, in the order they first appear
        on the page. If the page cannot be fetched or parsed, the error is
        printed and an empty list is returned.
    """
    # Function-scope import keeps this block independent of the file's
    # top-level import list.
    from urllib.parse import urljoin

    try:
        r = requests.get(race_url, headers=HEADERS, timeout=15)
        # Fail loudly on HTTP errors instead of scanning an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        horse_urls = []
        for link in soup.select('a[href*="fiche_cheval"]'):
            href = link.get('href', '')
            if 'idcheval' not in href:
                continue
            # Fix double domain issue
            href = href.replace('https://www.canalturf.comhttps://www.canalturf.com', 'https://www.canalturf.com')
            # Resolve relative links (with or without a leading '/') against
            # the page URL; absolute URLs pass through unchanged.
            horse_urls.append(urljoin(race_url, href))

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is stable between runs (set() ordering of strings is not).
        return list(dict.fromkeys(horse_urls))
    except Exception as e:
        print(f"Error getting horse URLs: {e}")
        return []

def main(race_url="https://www.canalturf.com/courses_quinte.php",
         max_horses=5,
         output_dir="/home/h3r7/turf_scraper"):
    """Scrape a sample of horses from a race page and save them as JSON.

    Args:
        race_url: Race page to read horse links from (defaults to the
            Quinté page).
        max_horses: Maximum number of horses to scrape; None scrapes all
            horses found.
        output_dir: Directory that receives the timestamped
            ``horses_YYYYmmdd_HHMMSS.json`` file. Created if missing.

    Returns:
        The list of per-horse result dicts, in scraping order.
    """
    # Local import: only this entry point needs filesystem helpers.
    import os

    print("="*50)
    print("🐴 HORSE DETAIL SCRAPER - RUNTIME V4")
    print("="*50)

    print(f"\n📋 Getting horses from: {race_url}")
    horse_urls = get_race_horses_urls(race_url)
    print(f"   Found {len(horse_urls)} horses")

    # Scrape only the first `max_horses` URLs as a demo.
    selected = horse_urls[:max_horses]
    results = []
    for position, url in enumerate(selected, start=1):
        print(f"\n[{position}/{len(selected)}] Scraping: {url[:60]}...")
        data = scrape_canalturf_horse(url)
        results.append(data)
        if data['status'] == 'success':
            print(f"   ✅ {data.get('horse_name', 'Unknown')}")
        else:
            print(f"   ❌ {data.get('error', 'Error')}")

    # Save. One timestamp is shared by the filename and the JSON payload so
    # the two always agree.
    now = datetime.now()
    # Make sure the target directory exists so finished scraping work is not
    # lost to a FileNotFoundError at write time.
    os.makedirs(output_dir, exist_ok=True)
    output = os.path.join(output_dir, f"horses_{now.strftime('%Y%m%d_%H%M%S')}.json")
    with open(output, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': now.isoformat(),
            'race_url': race_url,
            'total_horses': len(horse_urls),
            'horses': results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*50}")
    print(f"✅ Saved to {output}")
    print(f"{'='*50}\n")

    return results

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()