- leadhunter_scraper.py : Google Places Nearby Search + Place Details avec compteur quota daily_quota.json (limite 900/jour), sleep(0.5) entre requêtes, fallback Overpass OSM boundary MEL, filtre website absent, déduplication, rgpd_ok=True - leadhunter_scorer.py : moteur de scoring 0-8 pts critère n°1 = site web absent (+3), avis ≥50 (+2), note ≥4.0 (+2), téléphone (+1), note <3.0 (-1) - leadhunter_crm.py : CRM SQLite schéma validé CTO (id, source, name, address, phone, rating, reviews_count, website, score, rgpd_ok, scraped_at, status) CRUD : insert_lead, get_leads, update_lead_status, get_stats, export_csv - leadhunter_api.py : Flask service port 8769 GET /api/leads, POST /api/leads/scrape, GET /api/leads/stats, GET /api/leads/export, PATCH /api/leads/<id>/status, GET /health assert GOOGLE_PLACES_API_KEY au démarrage scraping asynchrone (thread) avec status endpoint - infra/turf-saas-leadhunter.service : service systemd EnvironmentFile=/home/h3r7/.env pour GOOGLE_PLACES_API_KEY Tests : py_compile OK, scorer testé, CRM SQLite testé Co-Authored-By: Paperclip <noreply@paperclip.ing>
394 lines
14 KiB
Python
394 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
H3R7Tech — LeadHunter Scraper
|
|
================================
|
|
Agent de scraping pour la détection de restaurants sans site web
|
|
dans la MEL (Métropole Européenne de Lille).
|
|
|
|
Sources :
|
|
- Google Places API (primary)
|
|
- OpenStreetMap / Overpass API (fallback)
|
|
|
|
Quota Google Places Free Tier :
|
|
- 28 500 requêtes/mois ≈ 950/jour
|
|
- Compteur persistant dans /home/h3r7/leadhunter_quota.json
|
|
|
|
Auteur: H3R7Tech Backend Engineer
|
|
Issue: HRT-66
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import time
|
|
import logging
|
|
import requests
|
|
from datetime import date, datetime
|
|
from logging.handlers import RotatingFileHandler
|
|
|
|
# ─── Logging ────────────────────────────────────────────────────────────────
|
|
# Module-wide logger: INFO level, rotating file output plus console output.
logger = logging.getLogger("leadhunter.scraper")
logger.setLevel(logging.INFO)

# Rotating file handler; stays None when handlers were already configured
# or when the log file cannot be opened.
_handler = None

# Only configure handlers once, so a re-import does not duplicate output
# or open the log file a second time.
if not logger.handlers:
    _log_file_error = None
    try:
        _handler = RotatingFileHandler(
            "/home/h3r7/leadhunter.log",
            maxBytes=5 * 1024 * 1024,  # 5 MB
            backupCount=3,
        )
    except OSError as exc:
        # Missing directory / no write permission: keep the module importable
        # and fall back to console-only logging.
        _log_file_error = exc
    else:
        _handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
        )
        logger.addHandler(_handler)

    logger.addHandler(logging.StreamHandler())

    if _log_file_error is not None:
        logger.warning(
            f"Fichier de log indisponible, sortie console uniquement : {_log_file_error}"
        )
|
|
|
|
# ─── Configuration ───────────────────────────────────────────────────────────
|
|
# Google Places API key, read from the environment (None when the variable is unset).
GOOGLE_PLACES_API_KEY = os.environ.get("GOOGLE_PLACES_API_KEY")

# Daily quota for the Google Places free tier
DAILY_QUOTA_FILE = "/home/h3r7/leadhunter_quota.json"
DAILY_QUOTA_LIMIT = 900  # safety margin vs. the theoretical 950

# Delay between Places requests to avoid rate limiting
PLACES_SLEEP_S = 0.5

# Search area for the MEL (Métropole Européenne de Lille): centre point + radius
MEL_CENTER_LAT = 50.6292
MEL_CENTER_LNG = 3.0573
MEL_RADIUS_M = 20000  # 20 km around Lille

# Targeted place types
TARGET_TYPES = ["restaurant", "cafe", "bar", "bakery", "food"]

# Overpass API endpoint
OVERPASS_URL = "https://overpass-api.de/api/interpreter"

# Overpass query for the MEL — administrative boundary named
# "Métropole Européenne de Lille". Selects nodes and ways whose amenity is one
# of restaurant/cafe/bar/fast_food/bakery and that have no non-empty "website"
# tag; output is capped at 200 elements, with a centre point for ways.
OVERPASS_MEL_QUERY = """
[out:json][timeout:60];
area["name"="Métropole Européenne de Lille"]["boundary"="administrative"]->.mel;
(
node["amenity"~"^(restaurant|cafe|bar|fast_food|bakery)$"]["website"!~".+"](area.mel);
way["amenity"~"^(restaurant|cafe|bar|fast_food|bakery)$"]["website"!~".+"](area.mel);
);
out center 200;
"""
|
|
|
|
|
|
# ─── Quota Manager ───────────────────────────────────────────────────────────
|
|
|
|
|
|
def _load_quota() -> dict:
    """Load today's request counter from the JSON quota file.

    Returns:
        The stored dict when it belongs to today and carries a valid,
        non-negative integer ``count``; otherwise a fresh
        ``{"date": <today>, "count": 0}`` dict (file missing, unreadable,
        malformed, or left over from a previous day).
    """
    today = str(date.today())
    fresh = {"date": today, "count": 0}

    # EAFP: open directly instead of exists()+open(), which is race-prone.
    try:
        with open(DAILY_QUOTA_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run of the day/installation: nothing to report.
        return fresh
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        logger.warning(f"Impossible de lire le fichier quota : {e}")
        return fresh

    if not isinstance(data, dict):
        logger.warning("Impossible de lire le fichier quota : format inattendu")
        return fresh

    if data.get("date") != today:
        # Counter from a previous day: start over.
        return fresh

    count = data.get("count")
    # Callers do arithmetic on "count"; reject anything that is not a plain,
    # non-negative integer (bool is an int subclass, hence the explicit check).
    if isinstance(count, bool) or not isinstance(count, int) or count < 0:
        logger.warning(
            f"Impossible de lire le fichier quota : compteur invalide ({count!r})"
        )
        return fresh

    return data
|
|
|
|
|
|
def _save_quota(data: dict) -> None:
    """Persist the daily counter to the quota file (best effort).

    The payload is written to a sibling temporary file and then swapped in
    with ``os.replace``. Opening the real file with mode "w" would truncate
    it first, so an interrupted write or a concurrent read could see an empty
    file and the counter would silently restart from zero.

    Args:
        data: JSON-serialisable quota dict (``date`` and ``count`` keys).

    Failures are logged as warnings and never raised.
    """
    tmp_path = f"{DAILY_QUOTA_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f)
        os.replace(tmp_path, DAILY_QUOTA_FILE)
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError: payload that json cannot serialise.
        logger.warning(f"Impossible d'écrire le fichier quota : {e}")
|
|
|
|
|
|
def _increment_quota(n: int = 1) -> int:
    """Add ``n`` to today's counter, persist it, and return the new total."""
    state = _load_quota()
    new_total = state["count"] + n
    state["count"] = new_total
    _save_quota(state)
    return new_total
|
|
|
|
|
|
def _quota_remaining() -> int:
    """Return how many requests are still allowed today (never negative)."""
    used = _load_quota()["count"]
    left = DAILY_QUOTA_LIMIT - used
    return left if left > 0 else 0
|
|
|
|
|
|
# ─── Google Places Scraper ────────────────────────────────────────────────────
|
|
|
|
|
|
class GooglePlacesScraper:
    """
    Scraper backed by the Google Places API (Nearby Search + Place Details).

    Nearby Search lists candidate places around the MEL centre; Place Details
    is then fetched for each candidate so that only places without a website
    are kept. Every API call increments the persistent daily quota counter.
    """

    BASE_URL = "https://maps.googleapis.com/maps/api/place"

    # Values of the "status" response field that do not indicate an error.
    _OK_STATUSES = ("OK", "ZERO_RESULTS")

    def __init__(self):
        """Store the API key.

        Raises:
            EnvironmentError: if GOOGLE_PLACES_API_KEY is not set.
        """
        if not GOOGLE_PLACES_API_KEY:
            raise EnvironmentError(
                "GOOGLE_PLACES_API_KEY non définie. "
                "Ajouter dans /home/h3r7/.env et relancer."
            )
        self.api_key = GOOGLE_PLACES_API_KEY

    def _call(self, endpoint: str, params: dict, label: str, context: str) -> dict:
        """Perform one quota-counted GET on a Places endpoint.

        Args:
            endpoint: path segment under BASE_URL ("nearbysearch", "details").
            params: query parameters, including the API key.
            label: request name used in log messages.
            context: short description of the request used in log messages.

        Returns:
            The decoded JSON object, or ``{}`` on transport error, HTTP error,
            invalid JSON, or an API-level error status.
        """
        # Count the request before sending it: a failed call is still assumed
        # to consume quota, which keeps the counter on the safe side.
        _increment_quota()
        time.sleep(PLACES_SLEEP_S)

        try:
            resp = requests.get(
                f"{self.BASE_URL}/{endpoint}/json",
                params=params,
                timeout=10,
            )
            resp.raise_for_status()
            payload = resp.json()
        except (requests.RequestException, ValueError) as e:
            logger.warning(f"{label} error ({context}): {e}")
            return {}

        if not isinstance(payload, dict):
            logger.warning(f"{label} error ({context}): réponse JSON inattendue")
            return {}

        # The Places API reports most failures (denied key, quota exceeded,
        # invalid request) with HTTP 200 and an error value in "status".
        status = payload.get("status")
        if status is not None and status not in self._OK_STATUSES:
            detail = payload.get("error_message", "")
            logger.warning(
                f"{label} error ({context}): status={status} {detail}".rstrip()
            )
            return {}

        return payload

    def _nearby_search(self, place_type: str, page_token: str = None) -> dict:
        """Run one Nearby Search request (counts as 1 request).

        Args:
            place_type: Places "type" filter.
            page_token: token of the next result page, if paginating.

        Returns:
            The response dict ("results", optional "next_page_token"),
            or ``{}`` on failure.
        """
        params = {
            "key": self.api_key,
            "location": f"{MEL_CENTER_LAT},{MEL_CENTER_LNG}",
            "radius": MEL_RADIUS_M,
            "type": place_type,
        }
        if page_token:
            params["pagetoken"] = page_token

        return self._call(
            "nearbysearch", params, "NearbySearch", f"type={place_type}"
        )

    def _place_details(self, place_id: str) -> dict:
        """Fetch website, phone, rating, etc. for one place (counts as 1 request).

        Returns:
            The "result" object of the response, or ``{}`` on failure.
        """
        params = {
            "key": self.api_key,
            "place_id": place_id,
            "fields": "name,formatted_address,formatted_phone_number,website,rating,user_ratings_total",
        }

        payload = self._call(
            "details", params, "PlaceDetails", f"place_id={place_id}"
        )
        return payload.get("result") or {}

    @staticmethod
    def _build_lead(place: dict, details: dict) -> dict:
        """Merge a Nearby Search entry and its Place Details into a lead dict."""
        return {
            "source": "google_places",
            "name": details.get("name") or place.get("name", ""),
            "address": details.get("formatted_address")
            or place.get("vicinity", ""),
            "phone": details.get("formatted_phone_number", ""),
            "rating": details.get("rating") or place.get("rating"),
            "reviews_count": details.get("user_ratings_total")
            or place.get("user_ratings_total"),
            "website": "",
            "rgpd_ok": True,  # Public Google Places data only
        }

    def scrape(self, max_leads: int = 50) -> list[dict]:
        """
        Scrape MEL restaurants/cafés/bars that have no website.

        Args:
            max_leads: maximum number of leads to return.

        Returns:
            A list of normalised dicts compatible with the LeadHunter CRM:
            source, name, address, phone, rating, reviews_count, website, rgpd_ok
        """
        leads = []
        seen_ids = set()

        for place_type in TARGET_TYPES:
            # Stop before spending a Nearby Search on a type we cannot use
            # (target reached, or max_leads <= 0).
            if len(leads) >= max_leads:
                break

            if _quota_remaining() < 10:
                logger.warning(
                    "Quota journalier presque épuisé — arrêt scraping Google Places."
                )
                break

            logger.info(f"Scraping Google Places — type={place_type}")
            page_token = None

            while True:
                if _quota_remaining() < 5:
                    logger.warning("Quota insuffisant pour continuer la pagination.")
                    break

                data = self._nearby_search(place_type, page_token)

                for place in data.get("results", []):
                    if len(leads) >= max_leads:
                        break

                    place_id = place.get("place_id", "")
                    if not place_id or place_id in seen_ids:
                        continue
                    seen_ids.add(place_id)

                    if _quota_remaining() < 2:
                        logger.warning("Quota épuisé avant details.")
                        break

                    details = self._place_details(place_id)

                    # A failed details call tells us nothing about the website;
                    # skip rather than report a false "no website" lead.
                    if not details:
                        continue

                    # Filter: keep only places WITHOUT a website
                    if details.get("website"):
                        continue

                    lead = self._build_lead(place, details)
                    leads.append(lead)
                    logger.info(f"Lead trouvé (Google Places) : {lead['name']}")

                if len(leads) >= max_leads:
                    break

                page_token = data.get("next_page_token")
                if not page_token:
                    break

                # The Places API needs a short delay before next_page_token is usable
                time.sleep(2)

        logger.info(f"Google Places : {len(leads)} leads collectés.")
        return leads
|
|
|
|
|
|
# ─── Overpass / OSM Fallback ──────────────────────────────────────────────────
|
|
|
|
|
|
class OverpassScraper:
    """
    OSM fallback via the Overpass API.

    Targets nodes/ways inside the MEL boundary that carry no website tag.
    Public ODbL data — flagged rgpd_ok.
    """

    def scrape(self, max_leads: int = 100) -> list[dict]:
        """
        Scrape via the Overpass API and return normalised leads.

        Args:
            max_leads: maximum number of leads to return; a value <= 0
                yields an empty list without contacting the API.

        Returns:
            A list of lead dicts (source, name, address, phone, rating,
            reviews_count, website, rgpd_ok); empty on API failure.
        """
        logger.info("Scraping Overpass OSM — boundary MEL")
        leads = []

        if max_leads <= 0:
            return leads

        try:
            resp = requests.post(
                OVERPASS_URL,
                data={"data": OVERPASS_MEL_QUERY},
                timeout=90,
            )
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            logger.warning(f"Overpass API error : {e}")
            return []

        elements = data.get("elements", [])
        logger.info(f"Overpass : {len(elements)} éléments bruts reçus.")

        for el in elements:
            # Cap on kept leads, not on raw elements: slicing the raw list
            # first would let skipped elements eat into the budget.
            if len(leads) >= max_leads:
                break

            tags = el.get("tags", {})

            name = tags.get("name", "")
            if not name:
                continue  # Ignore unnamed places

            # The query only excludes the "website" key; a site may also be
            # recorded under "contact:website".
            if tags.get("website") or tags.get("contact:website"):
                continue

            # Coordinates (for ways, Overpass returns them under 'center')
            center = el.get("center") or {}
            lat = el.get("lat")
            if lat is None:
                lat = center.get("lat")
            lon = el.get("lon")
            if lon is None:
                lon = center.get("lon")

            addr_parts = [
                tags.get("addr:housenumber", ""),
                tags.get("addr:street", ""),
                tags.get("addr:city", ""),
                tags.get("addr:postcode", ""),
            ]
            address = " ".join(p for p in addr_parts if p).strip()
            if not address and lat is not None and lon is not None:
                # No postal address tagged: fall back to the coordinates.
                address = f"{lat:.4f},{lon:.4f}"

            lead = {
                "source": "osm",
                "name": name,
                "address": address,
                "phone": tags.get("phone") or tags.get("contact:phone", ""),
                "rating": None,
                "reviews_count": None,
                "website": "",
                "rgpd_ok": True,  # Public ODbL data
            }
            leads.append(lead)
            logger.info(f"Lead trouvé (OSM) : {lead['name']}")

        logger.info(f"Overpass : {len(leads)} leads collectés.")
        return leads
|
|
|
|
|
|
# ─── Orchestrateur ────────────────────────────────────────────────────────────
|
|
|
|
|
|
def run_scraping(
    max_leads: int = 100, use_google: bool = True, use_osm: bool = True
) -> list[dict]:
    """
    Run the Google Places scrape, then the OSM fallback for the remainder.

    Args:
        max_leads: maximum total number of leads to collect.
        use_google: enable Google Places (requires GOOGLE_PLACES_API_KEY).
        use_osm: enable the Overpass OSM fallback.

    Returns:
        List of normalised leads, de-duplicated on name + address.
    """
    unique_leads = []
    known_keys = set()

    def _dedup_key(lead: dict) -> str:
        # Lower-cased name plus the first 40 characters of the address.
        return f"{lead['name'].lower().strip()}|{lead['address'].lower().strip()[:40]}"

    def _absorb(batch: list) -> None:
        # Append each lead of the batch whose key has not been seen yet.
        for candidate in batch:
            key = _dedup_key(candidate)
            if key in known_keys:
                continue
            known_keys.add(key)
            unique_leads.append(candidate)

    if use_google:
        try:
            _absorb(GooglePlacesScraper().scrape(max_leads=max_leads))
        except EnvironmentError as e:
            logger.warning(f"Google Places désactivé : {e}")
            # Reflected in the summary log below.
            use_google = False

    budget_left = max_leads - len(unique_leads)
    if use_osm and budget_left > 0:
        _absorb(OverpassScraper().scrape(max_leads=budget_left))

    logger.info(
        f"run_scraping terminé — {len(unique_leads)} leads uniques "
        f"(Google={use_google}, OSM={use_osm}). "
        f"Quota restant aujourd'hui : {_quota_remaining()}"
    )
    return unique_leads
|
|
|
|
|
|
# ─── CLI (debug) ─────────────────────────────────────────────────────────────
|
|
|
|
if __name__ == "__main__":
    # Debug entry point: scrape up to 10 leads and print them.
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not GOOGLE_PLACES_API_KEY:
        raise SystemExit(
            "GOOGLE_PLACES_API_KEY manquante — "
            "ajouter 'export GOOGLE_PLACES_API_KEY=xxx' dans /home/h3r7/.env"
        )
    leads = run_scraping(max_leads=10)
    for i, lead in enumerate(leads, 1):
        print(f"{i:02d}. [{lead['source']}] {lead['name']} — {lead['address']}")
|