#!/usr/bin/env python3
"""
H3R7Tech — LeadHunter Scraper
================================
Scraping agent that detects restaurants without a website in the MEL
(Métropole Européenne de Lille).

Sources:
  - Google Places API (primary)
  - OpenStreetMap / Overpass API (fallback)

Google Places Free Tier quota:
  - 28,500 requests/month ≈ 950/day
  - Persistent counter stored in /home/h3r7/leadhunter_quota.json

Author: H3R7Tech Backend Engineer
Issue: HRT-66
"""

import os
import json
import time
import logging
from datetime import date, datetime
from logging.handlers import RotatingFileHandler
from typing import Optional

import requests

# ─── Logging ────────────────────────────────────────────────────────────────

_LOG_FILE = "/home/h3r7/leadhunter.log"

logger = logging.getLogger("leadhunter.scraper")
logger.setLevel(logging.INFO)


def _build_file_handler() -> Optional[RotatingFileHandler]:
    """Create the rotating file handler, or return None if the file is unusable.

    Opening the log file must never prevent the module from being imported
    (e.g. on a machine where /home/h3r7 does not exist), so OS-level errors
    are turned into a None result and the caller falls back to console only.
    """
    try:
        handler = RotatingFileHandler(
            _LOG_FILE,
            maxBytes=5 * 1024 * 1024,  # 5 MB
            backupCount=3,
            # Log messages contain non-ASCII characters (accents, dashes).
            encoding="utf-8",
        )
    except OSError:
        return None
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
    )
    return handler


# File handler attached to ``logger`` (None when it could not be created or
# when the logger was already configured by a previous import).
_handler: Optional[RotatingFileHandler] = None

if not logger.handlers:
    _handler = _build_file_handler()
    if _handler is not None:
        logger.addHandler(_handler)
    logger.addHandler(logging.StreamHandler())
    if _handler is None:
        logger.warning(
            f"Fichier de log indisponible ({_LOG_FILE}) — sortie console uniquement."
        )

# ─── Configuration ───────────────────────────────────────────────────────────

GOOGLE_PLACES_API_KEY = os.environ.get("GOOGLE_PLACES_API_KEY")

# Daily quota for the Google Places Free Tier
DAILY_QUOTA_FILE = "/home/h3r7/leadhunter_quota.json"
DAILY_QUOTA_LIMIT = 900  # safety margin vs. the theoretical 950

# Delay between Places requests to avoid rate limiting
PLACES_SLEEP_S = 0.5

# Search circle for the MEL (Métropole Européenne de Lille)
MEL_CENTER_LAT = 50.6292
MEL_CENTER_LNG = 3.0573
MEL_RADIUS_M = 20000  # 20 km around Lille

# Targeted place types
TARGET_TYPES = ["restaurant", "cafe", "bar", "bakery", "food"]

# Overpass API endpoint
OVERPASS_URL = "https://overpass-api.de/api/interpreter"

# Overpass query for the MEL — direct bounding box (50.4,2.8,50.8,3.3)
# covering the MEL.
# Fix HRT-72: resolving area["name"=...] fails silently on the public
# Overpass API.
OVERPASS_MEL_QUERY = """
[out:json][timeout:60];
(
  node["amenity"~"^(restaurant|cafe|bar|fast_food|bakery)$"][!"website"](50.4,2.8,50.8,3.3);
  way["amenity"~"^(restaurant|cafe|bar|fast_food|bakery)$"][!"website"](50.4,2.8,50.8,3.3);
);
out center 200;
"""


# ─── Quota Manager ───────────────────────────────────────────────────────────

def _load_quota() -> dict:
    """Load today's request counter from the JSON quota file.

    Returns:
        A dict ``{"date": <ISO date>, "count": <int>}``. A fresh counter
        (count 0) is returned when the file is missing, unreadable, belongs
        to another day, or does not hold a valid non-negative integer count.
    """
    today = str(date.today())
    fresh = {"date": today, "count": 0}

    try:
        with open(DAILY_QUOTA_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run of the day/machine: nothing to warn about.
        return fresh
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning(f"Impossible de lire le fichier quota : {e}")
        return fresh

    if not isinstance(data, dict):
        logger.warning("Impossible de lire le fichier quota : contenu invalide")
        return fresh
    if data.get("date") != today:
        return fresh

    count = data.get("count")
    # bool is a subclass of int; reject it explicitly along with negatives.
    if not isinstance(count, int) or isinstance(count, bool) or count < 0:
        logger.warning("Impossible de lire le fichier quota : compteur invalide")
        return fresh
    return data


def _save_quota(data: dict) -> None:
    """Persist the daily counter; failures are logged and otherwise ignored."""
    try:
        with open(DAILY_QUOTA_FILE, "w", encoding="utf-8") as f:
            json.dump(data, f)
    except (OSError, TypeError, ValueError) as e:
        logger.warning(f"Impossible d'écrire le fichier quota : {e}")


def _increment_quota(n: int = 1) -> int:
    """Add ``n`` to today's counter, persist it, and return the new total."""
    quota = _load_quota()
    quota["count"] += n
    _save_quota(quota)
    return quota["count"]


def _quota_remaining() -> int:
    """Return how many requests are still allowed today (never negative)."""
    quota = _load_quota()
    return max(0, DAILY_QUOTA_LIMIT - quota["count"])


# ─── Google Places Scraper ────────────────────────────────────────────────────

class GooglePlacesScraper:
    """
    Scraping through the Google Places API (Nearby Search + Place Details).

    Places that expose a website in their Details result are filtered out,
    so only places without a website are returned as leads.
    """

    BASE_URL = "https://maps.googleapis.com/maps/api/place"

    # API-level statuses for which the response payload is usable.
    _USABLE_STATUSES = frozenset({"OK", "ZERO_RESULTS"})

    def __init__(self):
        """Bind the API key read from the environment.

        Raises:
            EnvironmentError: if GOOGLE_PLACES_API_KEY is not set.
        """
        if not GOOGLE_PLACES_API_KEY:
            raise EnvironmentError(
                "GOOGLE_PLACES_API_KEY non définie. "
                "Ajouter dans /home/h3r7/.env et relancer."
            )
        self.api_key = GOOGLE_PLACES_API_KEY

    def _redact(self, text) -> str:
        """Return ``str(text)`` with the API key masked.

        requests' error messages embed the full request URL, which carries
        the ``key=`` query parameter; it must not reach the log file.
        """
        message = str(text)
        if self.api_key:
            message = message.replace(self.api_key, "***")
        return message

    def _call_api(self, endpoint: str, params: dict, context: str) -> dict:
        """Issue one counted GET request against a Places endpoint.

        Args:
            endpoint: path segment under BASE_URL ("nearbysearch", "details").
            params: query parameters, including the API key.
            context: prefix used for warning messages.

        Returns:
            The decoded JSON payload, or ``{}`` on transport error, invalid
            JSON, or an API-level status other than OK / ZERO_RESULTS.
        """
        # Count the request before sending it: a failed call may still be
        # billed, so the local counter stays on the conservative side.
        _increment_quota()
        time.sleep(PLACES_SLEEP_S)
        try:
            resp = requests.get(
                f"{self.BASE_URL}/{endpoint}/json",
                params=params,
                timeout=10,
            )
            resp.raise_for_status()
            payload = resp.json()
        except (requests.RequestException, ValueError) as e:
            logger.warning(f"{context}: {self._redact(e)}")
            return {}

        if not isinstance(payload, dict):
            logger.warning(f"{context}: unexpected JSON payload")
            return {}

        # The API reports most failures (REQUEST_DENIED, OVER_QUERY_LIMIT,
        # INVALID_REQUEST, ...) with HTTP 200 and a "status" field. A payload
        # without any status is passed through unchanged.
        status = payload.get("status")
        if status is not None and status not in self._USABLE_STATUSES:
            detail = payload.get("error_message") or "no error_message"
            logger.warning(f"{context}: status={status} — {self._redact(detail)}")
            return {}
        return payload

    def _nearby_search(
        self, place_type: str, page_token: Optional[str] = None
    ) -> dict:
        """Run one Nearby Search call (counts as 1 request).

        Args:
            place_type: Places type to search for around the MEL centre.
            page_token: ``next_page_token`` of the previous page, if any.

        Returns:
            The response payload, or ``{}`` if the call failed.
        """
        params = {
            "key": self.api_key,
            "location": f"{MEL_CENTER_LAT},{MEL_CENTER_LNG}",
            "radius": MEL_RADIUS_M,
            "type": place_type,
        }
        if page_token:
            params["pagetoken"] = page_token

        return self._call_api(
            "nearbysearch", params, f"NearbySearch error (type={place_type})"
        )

    def _place_details(self, place_id: str) -> dict:
        """Fetch website, phone, rating, etc. for a place (counts as 1 request).

        Returns:
            The ``result`` object of the Details response, or ``{}`` when the
            lookup failed or returned no result.
        """
        params = {
            "key": self.api_key,
            "place_id": place_id,
            "fields": "name,formatted_address,formatted_phone_number,website,rating,user_ratings_total",
        }
        payload = self._call_api(
            "details", params, f"PlaceDetails error (place_id={place_id})"
        )
        result = payload.get("result")
        return result if isinstance(result, dict) else {}

    @staticmethod
    def _to_lead(place: dict, details: dict) -> dict:
        """Build a normalized lead, preferring Details over Nearby Search data."""
        return {
            "source": "google_places",
            "name": details.get("name") or place.get("name", ""),
            "address": details.get("formatted_address") or place.get("vicinity", ""),
            "phone": details.get("formatted_phone_number", ""),
            "rating": details.get("rating") or place.get("rating"),
            "reviews_count": details.get("user_ratings_total")
            or place.get("user_ratings_total"),
            "website": "",
            "rgpd_ok": True,  # Public Google Places data only
        }

    def scrape(self, max_leads: int = 50) -> list[dict]:
        """
        Scrape MEL restaurants/cafés/bars that have no website.

        Args:
            max_leads: maximum number of leads to return; no request is sent
                once this many leads have been collected.

        Returns:
            A list of normalized dicts compatible with the LeadHunter CRM:
            source, name, address, phone, rating, reviews_count, website,
            rgpd_ok.
        """
        leads = []
        seen_ids = set()

        for place_type in TARGET_TYPES:
            # Stop before spending a Nearby Search on a type we cannot use.
            if len(leads) >= max_leads:
                break
            if _quota_remaining() < 10:
                logger.warning(
                    "Quota journalier presque épuisé — arrêt scraping Google Places."
                )
                break

            logger.info(f"Scraping Google Places — type={place_type}")
            page_token = None

            while True:
                if _quota_remaining() < 5:
                    logger.warning("Quota insuffisant pour continuer la pagination.")
                    break

                data = self._nearby_search(place_type, page_token)
                results = data.get("results", [])

                for place in results:
                    if len(leads) >= max_leads:
                        break

                    place_id = place.get("place_id", "")
                    if not place_id or place_id in seen_ids:
                        continue

                    if _quota_remaining() < 2:
                        logger.warning("Quota épuisé avant details.")
                        break
                    seen_ids.add(place_id)

                    details = self._place_details(place_id)
                    if not details:
                        # The lookup failed: we cannot tell whether the place
                        # has a website, so it must not be reported as a lead.
                        continue

                    # Filter: keep only places WITHOUT a website
                    if details.get("website"):
                        continue

                    lead = self._to_lead(place, details)
                    leads.append(lead)
                    logger.info(f"Lead trouvé (Google Places) : {lead['name']}")

                if len(leads) >= max_leads:
                    break

                page_token = data.get("next_page_token")
                if not page_token:
                    break
                # The Places API needs a short delay before next_page_token
                # becomes valid.
                time.sleep(2)

        logger.info(f"Google Places : {len(leads)} leads collectés.")
        return leads


# ─── Overpass / OSM Fallback ──────────────────────────────────────────────────

class OverpassScraper:
    """
    OSM fallback through the Overpass API.

    Targets nodes/ways inside the MEL bounding box that carry no 'website'
    tag. Public ODbL data — flagged as GDPR-compliant (``rgpd_ok``).
    """

    def scrape(self, max_leads: int = 100) -> list[dict]:
        """
        Scrape through the Overpass API and return normalized leads.

        Args:
            max_leads: maximum number of leads to return. Elements without a
                name are skipped and do not count towards this limit.

        Returns:
            A list of normalized lead dicts; empty if the request failed.
        """
        logger.info("Scraping Overpass OSM — boundary MEL")
        leads = []

        try:
            resp = requests.post(
                OVERPASS_URL,
                data={"data": OVERPASS_MEL_QUERY},
                headers={
                    # Fix HRT-72 Bug2
                    "Content-Type": "application/x-www-form-urlencoded",
                    # Fix HRT-72 Bug3: overpass-api.de blocks the default
                    # python-requests User-Agent
                    "User-Agent": "H3R7Tech-LeadHunter/1.0 (contact@h3r7tech.fr)",
                },
                timeout=90,
            )
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            logger.warning(f"Overpass API error : {e}")
            return []

        elements = data.get("elements", []) if isinstance(data, dict) else []
        logger.info(f"Overpass : {len(elements)} éléments bruts reçus.")

        for el in elements:
            # Apply the cap to kept leads, not to raw elements, so that
            # nameless elements do not eat into the requested count.
            if len(leads) >= max_leads:
                break

            tags = el.get("tags") or {}
            name = tags.get("name", "")
            if not name:
                continue  # Skip places without a name

            # Coordinates (for ways, Overpass returns them under 'center')
            center = el.get("center") or {}
            lat = el.get("lat")
            if lat is None:
                lat = center.get("lat")
            lon = el.get("lon")
            if lon is None:
                lon = center.get("lon")

            addr_parts = [
                tags.get("addr:housenumber", ""),
                tags.get("addr:street", ""),
                tags.get("addr:city", ""),
                tags.get("addr:postcode", ""),
            ]
            address = " ".join(p for p in addr_parts if p).strip()
            if not address and lat is not None and lon is not None:
                # No postal address tagged: fall back to the coordinates.
                address = f"{lat:.4f},{lon:.4f}"

            lead = {
                "source": "osm",
                "name": name,
                "address": address,
                "phone": tags.get("phone") or tags.get("contact:phone", ""),
                "rating": None,
                "reviews_count": None,
                "website": "",
                "rgpd_ok": True,  # Public ODbL data
            }
            leads.append(lead)
            logger.info(f"Lead trouvé (OSM) : {lead['name']}")

        logger.info(f"Overpass : {len(leads)} leads collectés.")
        return leads


# ─── Orchestrator ─────────────────────────────────────────────────────────────

def run_scraping(
    max_leads: int = 100, use_google: bool = True, use_osm: bool = True
) -> list[dict]:
    """
    Run the Google Places scraping followed by the OSM fallback.

    Args:
        max_leads: maximum total number of leads to collect.
        use_google: enable Google Places (requires GOOGLE_PLACES_API_KEY).
        use_osm: enable the Overpass OSM fallback.

    Returns:
        List of normalized leads, de-duplicated on name + address.
    """
    all_leads = []
    seen_keys = set()

    def _dedup_key(lead: dict) -> str:
        # Only the first 40 characters of the address take part in the key.
        name = (lead.get("name") or "").lower().strip()
        address = (lead.get("address") or "").lower().strip()[:40]
        return f"{name}|{address}"

    def _add_unique(candidates: list[dict]) -> None:
        for lead in candidates:
            k = _dedup_key(lead)
            if k not in seen_keys:
                seen_keys.add(k)
                all_leads.append(lead)

    if use_google:
        try:
            scraper = GooglePlacesScraper()
            _add_unique(scraper.scrape(max_leads=max_leads))
        except EnvironmentError as e:
            # Missing API key: continue with OSM only.
            logger.warning(f"Google Places désactivé : {e}")
            use_google = False

    remaining = max_leads - len(all_leads)
    if use_osm and remaining > 0:
        _add_unique(OverpassScraper().scrape(max_leads=remaining))

    logger.info(
        f"run_scraping terminé — {len(all_leads)} leads uniques "
        f"(Google={use_google}, OSM={use_osm}). "
        f"Quota restant aujourd'hui : {_quota_remaining()}"
    )
    return all_leads


# ─── CLI (debug) ─────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Explicit check instead of ``assert``: asserts are stripped under -O.
    if not GOOGLE_PLACES_API_KEY:
        raise SystemExit(
            "GOOGLE_PLACES_API_KEY manquante — "
            "ajouter 'export GOOGLE_PLACES_API_KEY=xxx' dans /home/h3r7/.env"
        )
    leads = run_scraping(max_leads=10)
    for i, lead in enumerate(leads, 1):
        print(f"{i:02d}. [{lead['source']}] {lead['name']} — {lead['address']}")