Merge pull request '[HRT-66] LeadHunter S1 — Core scraping, scoring, CRM SQLite et API Flask' (#8) from feature/HRT-66-leadhunter-core into master
This commit was merged in pull request #8.
This commit is contained in:
21
infra/turf-saas-leadhunter.service
Normal file
21
infra/turf-saas-leadhunter.service
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=H3R7Tech LeadHunter API (Port 8775)
|
||||||
|
Documentation=https://portal-kolifee.duckdns.org
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=h3r7
|
||||||
|
WorkingDirectory=/home/h3r7/turf_saas
|
||||||
|
|
||||||
|
# Charger les variables d'environnement depuis /home/h3r7/.env
|
||||||
|
# (notamment GOOGLE_PLACES_API_KEY)
|
||||||
|
EnvironmentFile=/home/h3r7/.env
|
||||||
|
|
||||||
|
ExecStart=/home/h3r7/turf_saas/venv/bin/python3 /home/h3r7/turf_saas/leadhunter_api.py
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
Environment=PYTHONPATH=/home/h3r7/turf_saas
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
303
leadhunter_api.py
Normal file
303
leadhunter_api.py
Normal file
@@ -0,0 +1,303 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
H3R7Tech — LeadHunter API
|
||||||
|
===========================
|
||||||
|
Service Flask sur port 8775 exposant les endpoints LeadHunter.
|
||||||
|
|
||||||
|
Endpoints :
|
||||||
|
GET /api/leads — Liste les leads (filtres: status, limit, offset)
|
||||||
|
POST /api/leads/scrape — Lance un job de scraping asynchrone
|
||||||
|
GET /api/leads/stats — Statistiques globales du CRM
|
||||||
|
GET /api/leads/export — Export CSV des leads
|
||||||
|
PATCH /api/leads/<id>/status — Met à jour le statut d'un lead
|
||||||
|
|
||||||
|
Port : 8775 (8769 occupé par depenses_trello/app.py, 8770 occupé par turf_scraper/crm_api.py — corrigé HRT-66)
|
||||||
|
|
||||||
|
Auteur: H3R7Tech Backend Engineer
|
||||||
|
Issue: HRT-66
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
import logging
|
||||||
|
from logging.handlers import RotatingFileHandler
|
||||||
|
from flask import Flask, jsonify, request, Response
|
||||||
|
from flask_cors import CORS
|
||||||
|
|
||||||
|
# Import des modules LeadHunter
|
||||||
|
from leadhunter_crm import (
|
||||||
|
init_db,
|
||||||
|
insert_leads,
|
||||||
|
get_leads,
|
||||||
|
get_lead_by_id,
|
||||||
|
update_lead_status,
|
||||||
|
get_stats,
|
||||||
|
export_csv,
|
||||||
|
VALID_STATUSES,
|
||||||
|
DB_PATH,
|
||||||
|
)
|
||||||
|
from leadhunter_scraper import run_scraping, GOOGLE_PLACES_API_KEY
|
||||||
|
from leadhunter_scorer import LeadScorer
|
||||||
|
|
||||||
|
# ─── Assertions au démarrage ─────────────────────────────────────────────────
|
||||||
|
# Vérification obligatoire : la clé API doit être présente au démarrage
|
||||||
|
assert os.environ.get("GOOGLE_PLACES_API_KEY"), (
|
||||||
|
"GOOGLE_PLACES_API_KEY manquante. "
|
||||||
|
"Ajouter dans /home/h3r7/.env : export GOOGLE_PLACES_API_KEY=xxx"
|
||||||
|
)
|
||||||
|
|
||||||
|
# ─── Logging ────────────────────────────────────────────────────────────────
|
||||||
|
logger = logging.getLogger("leadhunter.api")
|
||||||
|
|
||||||
|
_handler = RotatingFileHandler(
|
||||||
|
"/home/h3r7/leadhunter.log",
|
||||||
|
maxBytes=5 * 1024 * 1024,
|
||||||
|
backupCount=3,
|
||||||
|
)
|
||||||
|
_handler.setFormatter(
|
||||||
|
logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
|
||||||
|
)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
if not logger.handlers:
|
||||||
|
logger.addHandler(_handler)
|
||||||
|
logger.addHandler(logging.StreamHandler())
|
||||||
|
|
||||||
|
# ─── App Flask ───────────────────────────────────────────────────────────────
|
||||||
|
app = Flask(__name__)
|
||||||
|
CORS(app)
|
||||||
|
|
||||||
|
# Scorer singleton
|
||||||
|
scorer = LeadScorer()
|
||||||
|
|
||||||
|
# État global du job de scraping (simple flag — pas de celery nécessaire pour le POC)
|
||||||
|
_scrape_job = {
|
||||||
|
"running": False,
|
||||||
|
"last_run": None,
|
||||||
|
"last_count": 0,
|
||||||
|
"last_error": None,
|
||||||
|
}
|
||||||
|
_scrape_lock = threading.Lock()
|
||||||
|
|
||||||
|
# ─── Init DB ─────────────────────────────────────────────────────────────────
|
||||||
|
init_db(DB_PATH)
|
||||||
|
logger.info("LeadHunter API démarrée — DB initialisée.")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _run_scrape_job(max_leads: int, use_google: bool, use_osm: bool) -> None:
|
||||||
|
"""Job de scraping exécuté dans un thread séparé."""
|
||||||
|
with _scrape_lock:
|
||||||
|
_scrape_job["running"] = True
|
||||||
|
_scrape_job["last_error"] = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
leads_raw = run_scraping(
|
||||||
|
max_leads=max_leads,
|
||||||
|
use_google=use_google,
|
||||||
|
use_osm=use_osm,
|
||||||
|
)
|
||||||
|
leads_scored = scorer.score_leads(leads_raw)
|
||||||
|
inserted_ids = insert_leads(leads_scored)
|
||||||
|
|
||||||
|
with _scrape_lock:
|
||||||
|
_scrape_job["last_count"] = len(inserted_ids)
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
_scrape_job["last_run"] = datetime.utcnow().isoformat() + "Z"
|
||||||
|
|
||||||
|
logger.info(f"Scrape job terminé : {len(inserted_ids)} leads insérés.")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Scrape job erreur : {e}")
|
||||||
|
with _scrape_lock:
|
||||||
|
_scrape_job["last_error"] = str(e)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
with _scrape_lock:
|
||||||
|
_scrape_job["running"] = False
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Routes ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/leads", methods=["GET"])
|
||||||
|
def api_get_leads():
|
||||||
|
"""
|
||||||
|
Liste les leads du CRM.
|
||||||
|
|
||||||
|
Query params :
|
||||||
|
- status (str, optional) : filtre sur new/contacted/closed/rejected
|
||||||
|
- limit (int, default=50) : pagination
|
||||||
|
- offset (int, default=0) : pagination
|
||||||
|
"""
|
||||||
|
status = request.args.get("status")
|
||||||
|
try:
|
||||||
|
limit = int(request.args.get("limit", 50))
|
||||||
|
offset = int(request.args.get("offset", 0))
|
||||||
|
except ValueError:
|
||||||
|
return jsonify({"error": "limit et offset doivent être des entiers"}), 400
|
||||||
|
|
||||||
|
if status and status not in VALID_STATUSES:
|
||||||
|
return jsonify(
|
||||||
|
{"error": f"status invalide. Valeurs acceptées : {VALID_STATUSES}"}
|
||||||
|
), 400
|
||||||
|
|
||||||
|
leads = get_leads(status=status, limit=limit, offset=offset)
|
||||||
|
return jsonify(
|
||||||
|
{
|
||||||
|
"leads": leads,
|
||||||
|
"count": len(leads),
|
||||||
|
"limit": limit,
|
||||||
|
"offset": offset,
|
||||||
|
"status_filter": status,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/leads/scrape", methods=["POST"])
|
||||||
|
def api_scrape():
|
||||||
|
"""
|
||||||
|
Lance un job de scraping asynchrone.
|
||||||
|
|
||||||
|
Body JSON (optionnel) :
|
||||||
|
- max_leads (int, default=100)
|
||||||
|
- use_google (bool, default=true)
|
||||||
|
- use_osm (bool, default=true)
|
||||||
|
|
||||||
|
Retourne immédiatement avec le statut du job.
|
||||||
|
"""
|
||||||
|
with _scrape_lock:
|
||||||
|
if _scrape_job["running"]:
|
||||||
|
return jsonify(
|
||||||
|
{
|
||||||
|
"status": "already_running",
|
||||||
|
"message": "Un job de scraping est déjà en cours.",
|
||||||
|
}
|
||||||
|
), 409
|
||||||
|
|
||||||
|
body = request.get_json(silent=True) or {}
|
||||||
|
max_leads = int(body.get("max_leads", 100))
|
||||||
|
use_google = bool(body.get("use_google", True))
|
||||||
|
use_osm = bool(body.get("use_osm", True))
|
||||||
|
|
||||||
|
thread = threading.Thread(
|
||||||
|
target=_run_scrape_job,
|
||||||
|
args=(max_leads, use_google, use_osm),
|
||||||
|
daemon=True,
|
||||||
|
)
|
||||||
|
thread.start()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Job de scraping lancé (max_leads={max_leads}, "
|
||||||
|
f"use_google={use_google}, use_osm={use_osm})"
|
||||||
|
)
|
||||||
|
|
||||||
|
return jsonify(
|
||||||
|
{
|
||||||
|
"status": "started",
|
||||||
|
"message": "Job de scraping démarré en arrière-plan.",
|
||||||
|
"params": {
|
||||||
|
"max_leads": max_leads,
|
||||||
|
"use_google": use_google,
|
||||||
|
"use_osm": use_osm,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
), 202
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/leads/scrape/status", methods=["GET"])
|
||||||
|
def api_scrape_status():
|
||||||
|
"""Retourne l'état courant du job de scraping."""
|
||||||
|
with _scrape_lock:
|
||||||
|
return jsonify(dict(_scrape_job))
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/leads/stats", methods=["GET"])
|
||||||
|
def api_stats():
|
||||||
|
"""
|
||||||
|
Statistiques globales du CRM LeadHunter.
|
||||||
|
|
||||||
|
Retourne : total, by_status, by_source, avg_score, top_leads_count
|
||||||
|
"""
|
||||||
|
stats = get_stats()
|
||||||
|
if not stats:
|
||||||
|
return jsonify({"error": "Impossible de calculer les statistiques"}), 500
|
||||||
|
return jsonify(stats)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/leads/export", methods=["GET"])
|
||||||
|
def api_export():
|
||||||
|
"""
|
||||||
|
Export CSV de tous les leads (ou filtrés par status).
|
||||||
|
|
||||||
|
Query params :
|
||||||
|
- status (str, optional)
|
||||||
|
"""
|
||||||
|
status = request.args.get("status")
|
||||||
|
if status and status not in VALID_STATUSES:
|
||||||
|
return jsonify({"error": f"status invalide : {VALID_STATUSES}"}), 400
|
||||||
|
|
||||||
|
csv_content = export_csv(status=status)
|
||||||
|
filename = f"leadhunter_leads{'_' + status if status else ''}.csv"
|
||||||
|
|
||||||
|
return Response(
|
||||||
|
csv_content,
|
||||||
|
mimetype="text/csv",
|
||||||
|
headers={
|
||||||
|
"Content-Disposition": f"attachment; filename={filename}",
|
||||||
|
"Content-Type": "text/csv; charset=utf-8",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/leads/<int:lead_id>/status", methods=["PATCH"])
|
||||||
|
def api_update_status(lead_id: int):
|
||||||
|
"""
|
||||||
|
Met à jour le statut d'un lead.
|
||||||
|
|
||||||
|
Body JSON :
|
||||||
|
- status (str) : new | contacted | closed | rejected
|
||||||
|
"""
|
||||||
|
body = request.get_json(silent=True)
|
||||||
|
if not body or "status" not in body:
|
||||||
|
return jsonify({"error": "Body JSON requis avec le champ 'status'"}), 400
|
||||||
|
|
||||||
|
new_status = body["status"]
|
||||||
|
if new_status not in VALID_STATUSES:
|
||||||
|
return jsonify({"error": f"status invalide. Valeurs : {VALID_STATUSES}"}), 400
|
||||||
|
|
||||||
|
lead = get_lead_by_id(lead_id)
|
||||||
|
if not lead:
|
||||||
|
return jsonify({"error": f"Lead id={lead_id} introuvable"}), 404
|
||||||
|
|
||||||
|
success = update_lead_status(lead_id, new_status)
|
||||||
|
if not success:
|
||||||
|
return jsonify({"error": "Mise à jour échouée"}), 500
|
||||||
|
|
||||||
|
return jsonify(
|
||||||
|
{
|
||||||
|
"success": True,
|
||||||
|
"lead_id": lead_id,
|
||||||
|
"new_status": new_status,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/health", methods=["GET"])
|
||||||
|
def health():
|
||||||
|
"""Healthcheck pour systemd / monitoring."""
|
||||||
|
return jsonify(
|
||||||
|
{
|
||||||
|
"status": "ok",
|
||||||
|
"service": "leadhunter-api",
|
||||||
|
"port": 8775,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Entrypoint ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
app.run(host="0.0.0.0", port=8775, debug=False)
|
||||||
349
leadhunter_crm.py
Normal file
349
leadhunter_crm.py
Normal file
@@ -0,0 +1,349 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
H3R7Tech — LeadHunter CRM (SQLite)
|
||||||
|
=====================================
|
||||||
|
Couche de persistance SQLite pour les leads LeadHunter.
|
||||||
|
|
||||||
|
Schéma validé CTO (HRT-66) :
|
||||||
|
CREATE TABLE leads (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
source TEXT NOT NULL, -- 'google_places' ou 'osm'
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
address TEXT,
|
||||||
|
phone TEXT,
|
||||||
|
rating REAL,
|
||||||
|
reviews_count INTEGER,
|
||||||
|
website TEXT,
|
||||||
|
score INTEGER,
|
||||||
|
rgpd_ok BOOLEAN DEFAULT 1,
|
||||||
|
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
status TEXT DEFAULT 'new' -- new, contacted, closed, rejected
|
||||||
|
);
|
||||||
|
|
||||||
|
Auteur: H3R7Tech Backend Engineer
|
||||||
|
Issue: HRT-66
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import logging
|
||||||
|
import csv
|
||||||
|
import io
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from datetime import datetime
|
||||||
|
from logging.handlers import RotatingFileHandler
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
# ─── Logging ────────────────────────────────────────────────────────────────
|
||||||
|
logger = logging.getLogger("leadhunter.crm")
|
||||||
|
|
||||||
|
_handler = RotatingFileHandler(
|
||||||
|
"/home/h3r7/leadhunter.log",
|
||||||
|
maxBytes=5 * 1024 * 1024,
|
||||||
|
backupCount=3,
|
||||||
|
)
|
||||||
|
_handler.setFormatter(
|
||||||
|
logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
|
||||||
|
)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
if not logger.handlers:
|
||||||
|
logger.addHandler(_handler)
|
||||||
|
logger.addHandler(logging.StreamHandler())
|
||||||
|
|
||||||
|
# ─── Chemin DB ───────────────────────────────────────────────────────────────
|
||||||
|
DB_PATH = "/home/h3r7/leadhunter.db"
|
||||||
|
|
||||||
|
# Statuts valides pour un lead
|
||||||
|
VALID_STATUSES = {"new", "contacted", "closed", "rejected"}
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Initialisation ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def init_db(db_path: str = DB_PATH) -> None:
|
||||||
|
"""
|
||||||
|
Crée la base SQLite et la table leads si elle n'existe pas.
|
||||||
|
Idempotent — peut être appelé au démarrage de l'API.
|
||||||
|
"""
|
||||||
|
with sqlite3.connect(db_path) as conn:
|
||||||
|
conn.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS leads (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
source TEXT NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
address TEXT,
|
||||||
|
phone TEXT,
|
||||||
|
rating REAL,
|
||||||
|
reviews_count INTEGER,
|
||||||
|
website TEXT,
|
||||||
|
score INTEGER,
|
||||||
|
rgpd_ok BOOLEAN DEFAULT 1,
|
||||||
|
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
status TEXT DEFAULT 'new'
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
conn.commit()
|
||||||
|
logger.info(f"DB initialisée : {db_path}")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Context manager ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _get_conn(db_path: str = DB_PATH):
|
||||||
|
"""Fournit une connexion SQLite avec row_factory."""
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
try:
|
||||||
|
yield conn
|
||||||
|
conn.commit()
|
||||||
|
except Exception as e:
|
||||||
|
conn.rollback()
|
||||||
|
logger.warning(f"DB transaction rollback : {e}")
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── CRUD ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def insert_lead(lead: dict, db_path: str = DB_PATH) -> Optional[int]:
|
||||||
|
"""
|
||||||
|
Insère un lead normalisé dans la DB.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lead: dict avec les champs normalisés (source, name, address, ...)
|
||||||
|
db_path: chemin vers la DB SQLite.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
L'id SQLite du lead inséré, ou None en cas d'erreur.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with _get_conn(db_path) as conn:
|
||||||
|
cursor = conn.execute(
|
||||||
|
"""
|
||||||
|
INSERT INTO leads
|
||||||
|
(source, name, address, phone, rating, reviews_count,
|
||||||
|
website, score, rgpd_ok, status)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""",
|
||||||
|
(
|
||||||
|
lead.get("source", "unknown"),
|
||||||
|
lead.get("name", ""),
|
||||||
|
lead.get("address", ""),
|
||||||
|
lead.get("phone", ""),
|
||||||
|
lead.get("rating"),
|
||||||
|
lead.get("reviews_count"),
|
||||||
|
lead.get("website", ""),
|
||||||
|
lead.get("score"),
|
||||||
|
1 if lead.get("rgpd_ok", True) else 0,
|
||||||
|
lead.get("status", "new"),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
lead_id = cursor.lastrowid
|
||||||
|
logger.info(f"Lead inséré id={lead_id} : {lead.get('name')}")
|
||||||
|
return lead_id
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"insert_lead error : {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def insert_leads(leads: list[dict], db_path: str = DB_PATH) -> list[int]:
|
||||||
|
"""
|
||||||
|
Insère une liste de leads en batch.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Liste des ids insérés.
|
||||||
|
"""
|
||||||
|
ids = []
|
||||||
|
for lead in leads:
|
||||||
|
lead_id = insert_lead(lead, db_path)
|
||||||
|
if lead_id is not None:
|
||||||
|
ids.append(lead_id)
|
||||||
|
logger.info(f"insert_leads : {len(ids)}/{len(leads)} insérés.")
|
||||||
|
return ids
|
||||||
|
|
||||||
|
|
||||||
|
def get_leads(
|
||||||
|
status: Optional[str] = None,
|
||||||
|
limit: int = 100,
|
||||||
|
offset: int = 0,
|
||||||
|
db_path: str = DB_PATH,
|
||||||
|
) -> list[dict]:
|
||||||
|
"""
|
||||||
|
Récupère les leads avec filtre optionnel sur le statut.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
status: filtre sur le champ 'status' (new, contacted, closed, rejected).
|
||||||
|
limit: pagination — nombre de résultats max.
|
||||||
|
offset: pagination — décalage.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Liste de dicts (tous les champs de la table leads).
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with _get_conn(db_path) as conn:
|
||||||
|
if status:
|
||||||
|
rows = conn.execute(
|
||||||
|
"SELECT * FROM leads WHERE status = ? ORDER BY score DESC, scraped_at DESC LIMIT ? OFFSET ?",
|
||||||
|
(status, limit, offset),
|
||||||
|
).fetchall()
|
||||||
|
else:
|
||||||
|
rows = conn.execute(
|
||||||
|
"SELECT * FROM leads ORDER BY score DESC, scraped_at DESC LIMIT ? OFFSET ?",
|
||||||
|
(limit, offset),
|
||||||
|
).fetchall()
|
||||||
|
return [dict(r) for r in rows]
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"get_leads error : {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def get_lead_by_id(lead_id: int, db_path: str = DB_PATH) -> Optional[dict]:
|
||||||
|
"""Récupère un lead par son id."""
|
||||||
|
try:
|
||||||
|
with _get_conn(db_path) as conn:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT * FROM leads WHERE id = ?", (lead_id,)
|
||||||
|
).fetchone()
|
||||||
|
return dict(row) if row else None
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"get_lead_by_id error : {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def update_lead_status(lead_id: int, status: str, db_path: str = DB_PATH) -> bool:
|
||||||
|
"""
|
||||||
|
Met à jour le statut d'un lead.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lead_id: id du lead.
|
||||||
|
status: nouveau statut ('new', 'contacted', 'closed', 'rejected').
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True si mise à jour réussie, False sinon.
|
||||||
|
"""
|
||||||
|
if status not in VALID_STATUSES:
|
||||||
|
logger.warning(f"update_lead_status : statut invalide '{status}'")
|
||||||
|
return False
|
||||||
|
try:
|
||||||
|
with _get_conn(db_path) as conn:
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE leads SET status = ? WHERE id = ?",
|
||||||
|
(status, lead_id),
|
||||||
|
)
|
||||||
|
logger.info(f"Lead id={lead_id} statut → {status}")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"update_lead_status error : {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_stats(db_path: str = DB_PATH) -> dict:
|
||||||
|
"""
|
||||||
|
Retourne les statistiques globales du CRM.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict avec total, by_status, by_source, avg_score, top_leads_count
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with _get_conn(db_path) as conn:
|
||||||
|
total = conn.execute("SELECT COUNT(*) FROM leads").fetchone()[0]
|
||||||
|
|
||||||
|
by_status_rows = conn.execute(
|
||||||
|
"SELECT status, COUNT(*) as cnt FROM leads GROUP BY status"
|
||||||
|
).fetchall()
|
||||||
|
by_status = {r["status"]: r["cnt"] for r in by_status_rows}
|
||||||
|
|
||||||
|
by_source_rows = conn.execute(
|
||||||
|
"SELECT source, COUNT(*) as cnt FROM leads GROUP BY source"
|
||||||
|
).fetchall()
|
||||||
|
by_source = {r["source"]: r["cnt"] for r in by_source_rows}
|
||||||
|
|
||||||
|
avg_score_row = conn.execute(
|
||||||
|
"SELECT AVG(score) FROM leads WHERE score IS NOT NULL"
|
||||||
|
).fetchone()
|
||||||
|
avg_score = round(avg_score_row[0] or 0, 2)
|
||||||
|
|
||||||
|
# Leads "chauds" = score ≥ 5
|
||||||
|
top_count = conn.execute(
|
||||||
|
"SELECT COUNT(*) FROM leads WHERE score >= 5"
|
||||||
|
).fetchone()[0]
|
||||||
|
|
||||||
|
return {
|
||||||
|
"total": total,
|
||||||
|
"by_status": by_status,
|
||||||
|
"by_source": by_source,
|
||||||
|
"avg_score": avg_score,
|
||||||
|
"top_leads_count": top_count,
|
||||||
|
"generated_at": datetime.utcnow().isoformat() + "Z",
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"get_stats error : {e}")
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def export_csv(
|
||||||
|
status: Optional[str] = None,
|
||||||
|
db_path: str = DB_PATH,
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Exporte les leads en CSV (string).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
status: filtre optionnel sur le statut.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Contenu CSV en string UTF-8.
|
||||||
|
"""
|
||||||
|
leads = get_leads(status=status, limit=10000, db_path=db_path)
|
||||||
|
|
||||||
|
output = io.StringIO()
|
||||||
|
fieldnames = [
|
||||||
|
"id",
|
||||||
|
"source",
|
||||||
|
"name",
|
||||||
|
"address",
|
||||||
|
"phone",
|
||||||
|
"rating",
|
||||||
|
"reviews_count",
|
||||||
|
"website",
|
||||||
|
"score",
|
||||||
|
"rgpd_ok",
|
||||||
|
"scraped_at",
|
||||||
|
"status",
|
||||||
|
]
|
||||||
|
writer = csv.DictWriter(output, fieldnames=fieldnames, extrasaction="ignore")
|
||||||
|
writer.writeheader()
|
||||||
|
writer.writerows(leads)
|
||||||
|
|
||||||
|
logger.info(f"export_csv : {len(leads)} leads exportés.")
|
||||||
|
return output.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── CLI (debug) ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
init_db()
|
||||||
|
|
||||||
|
# Test insertion
|
||||||
|
test_lead = {
|
||||||
|
"source": "google_places",
|
||||||
|
"name": "Restaurant Test",
|
||||||
|
"address": "10 rue de la Paix, 59000 Lille",
|
||||||
|
"phone": "+33 3 20 00 00 01",
|
||||||
|
"rating": 4.5,
|
||||||
|
"reviews_count": 120,
|
||||||
|
"website": "",
|
||||||
|
"score": 8,
|
||||||
|
"rgpd_ok": True,
|
||||||
|
"status": "new",
|
||||||
|
}
|
||||||
|
lead_id = insert_lead(test_lead)
|
||||||
|
print(f"Lead inséré : id={lead_id}")
|
||||||
|
|
||||||
|
leads = get_leads()
|
||||||
|
print(f"Leads en DB : {len(leads)}")
|
||||||
|
|
||||||
|
stats = get_stats()
|
||||||
|
print(f"Stats : {stats}")
|
||||||
193
leadhunter_scorer.py
Normal file
193
leadhunter_scorer.py
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
H3R7Tech — LeadHunter Scorer
|
||||||
|
================================
|
||||||
|
Moteur de scoring des leads restaurants MEL.
|
||||||
|
|
||||||
|
Critères (ordre de priorité métier) :
|
||||||
|
1. [+3] Site web absent ← CRITIQUE : raison d'être du produit
|
||||||
|
2. [+2] Nombre d'avis élevé (≥ 50) : forte activité = bon prospect de vente
|
||||||
|
3. [+2] Note Google élevée (≥ 4.0) : établissement sérieux
|
||||||
|
4. [+1] Téléphone présent : facilite la prise de contact
|
||||||
|
5. [-1] Note faible (< 3.0) : risque reputationnel pour la prestation web
|
||||||
|
|
||||||
|
Score maximum théorique : 8
|
||||||
|
Score minimum : 0 (leads avec site web ne doivent pas passer ici)
|
||||||
|
|
||||||
|
Auteur: H3R7Tech Backend Engineer
|
||||||
|
Issue: HRT-66
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from logging.handlers import RotatingFileHandler
|
||||||
|
|
||||||
|
# ─── Logging ────────────────────────────────────────────────────────────────
|
||||||
|
logger = logging.getLogger("leadhunter.scorer")
|
||||||
|
|
||||||
|
_handler = RotatingFileHandler(
|
||||||
|
"/home/h3r7/leadhunter.log",
|
||||||
|
maxBytes=5 * 1024 * 1024,
|
||||||
|
backupCount=3,
|
||||||
|
)
|
||||||
|
_handler.setFormatter(
|
||||||
|
logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
|
||||||
|
)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
if not logger.handlers:
|
||||||
|
logger.addHandler(_handler)
|
||||||
|
logger.addHandler(logging.StreamHandler())
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Scorer ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class LeadScorer:
|
||||||
|
"""
|
||||||
|
Calcule le score de priorité d'un lead.
|
||||||
|
|
||||||
|
Le score sert à trier les leads dans le CRM :
|
||||||
|
- Score élevé = prospect chaud (sans site + actif + bien noté)
|
||||||
|
- Score faible = prospect froid (peut être ignoré ou traité en dernier)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _calculate_score(self, lead: dict) -> int:
|
||||||
|
"""
|
||||||
|
Calcule le score d'un lead.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lead: dict avec les champs normalisés du scraper
|
||||||
|
(name, website, rating, reviews_count, phone, ...)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Score entier (0–8)
|
||||||
|
"""
|
||||||
|
score = 0
|
||||||
|
|
||||||
|
# ── Critère 1 : site web absent [CRITIQUE — logique métier centrale] ──
|
||||||
|
# C'est le critère n°1 : on cherche des restaurants SANS site web
|
||||||
|
# pour leur proposer une création de site à 800–1500€.
|
||||||
|
website = lead.get("website", "")
|
||||||
|
if not website or not website.strip():
|
||||||
|
score += 3
|
||||||
|
logger.debug(f"{lead.get('name')}: +3 (site web absent)")
|
||||||
|
else:
|
||||||
|
# Si le lead a un site web, score = 0 immédiatement.
|
||||||
|
# Ce cas ne devrait pas se produire (filtre scraper),
|
||||||
|
# mais on reste défensif.
|
||||||
|
logger.warning(
|
||||||
|
f"{lead.get('name')}: site web présent ({website}), "
|
||||||
|
"lead ignoré pour scoring."
|
||||||
|
)
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# ── Critère 2 : nombre d'avis élevé (≥ 50) ──────────────────────────
|
||||||
|
reviews = lead.get("reviews_count")
|
||||||
|
if reviews is not None:
|
||||||
|
try:
|
||||||
|
reviews = int(reviews)
|
||||||
|
if reviews >= 50:
|
||||||
|
score += 2
|
||||||
|
logger.debug(f"{lead.get('name')}: +2 (avis ≥ 50 : {reviews})")
|
||||||
|
except (TypeError, ValueError) as e:
|
||||||
|
logger.warning(f"reviews_count invalide pour {lead.get('name')}: {e}")
|
||||||
|
|
||||||
|
# ── Critère 3 : bonne note Google (≥ 4.0) ───────────────────────────
|
||||||
|
rating = lead.get("rating")
|
||||||
|
if rating is not None:
|
||||||
|
try:
|
||||||
|
rating = float(rating)
|
||||||
|
if rating >= 4.0:
|
||||||
|
score += 2
|
||||||
|
logger.debug(f"{lead.get('name')}: +2 (note ≥ 4.0 : {rating})")
|
||||||
|
elif rating < 3.0:
|
||||||
|
score -= 1
|
||||||
|
logger.debug(f"{lead.get('name')}: -1 (note < 3.0 : {rating})")
|
||||||
|
except (TypeError, ValueError) as e:
|
||||||
|
logger.warning(f"rating invalide pour {lead.get('name')}: {e}")
|
||||||
|
|
||||||
|
# ── Critère 4 : téléphone présent ────────────────────────────────────
|
||||||
|
phone = lead.get("phone", "")
|
||||||
|
if phone and phone.strip():
|
||||||
|
score += 1
|
||||||
|
logger.debug(f"{lead.get('name')}: +1 (téléphone présent)")
|
||||||
|
|
||||||
|
# Plancher à 0
|
||||||
|
score = max(0, score)
|
||||||
|
logger.info(f"Score calculé pour '{lead.get('name')}' : {score}/8")
|
||||||
|
return score
|
||||||
|
|
||||||
|
def score_lead(self, lead: dict) -> dict:
|
||||||
|
"""
|
||||||
|
Enrichit un lead avec son score.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lead: dict normalisé du scraper.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Même dict avec le champ 'score' ajouté/mis à jour.
|
||||||
|
"""
|
||||||
|
lead = dict(lead) # copie défensive
|
||||||
|
lead["score"] = self._calculate_score(lead)
|
||||||
|
return lead
|
||||||
|
|
||||||
|
def score_leads(self, leads: list[dict]) -> list[dict]:
|
||||||
|
"""
|
||||||
|
Score et trie une liste de leads (score décroissant).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
leads: liste de dicts normalisés.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Liste triée par score décroissant.
|
||||||
|
"""
|
||||||
|
scored = [self.score_lead(lead) for lead in leads]
|
||||||
|
scored.sort(key=lambda l: l.get("score", 0), reverse=True)
|
||||||
|
logger.info(
|
||||||
|
f"score_leads terminé : {len(scored)} leads scorés. "
|
||||||
|
f"Score max = {scored[0]['score'] if scored else 0}, "
|
||||||
|
f"Score min = {scored[-1]['score'] if scored else 0}"
|
||||||
|
)
|
||||||
|
return scored
|
||||||
|
|
||||||
|
|
||||||
|
# ─── CLI (debug) ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Exemple de test rapide sans appel API
|
||||||
|
test_leads = [
|
||||||
|
{
|
||||||
|
"name": "Restaurant A",
|
||||||
|
"website": "",
|
||||||
|
"rating": 4.5,
|
||||||
|
"reviews_count": 120,
|
||||||
|
"phone": "+33 3 20 00 00 01",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Restaurant B",
|
||||||
|
"website": "",
|
||||||
|
"rating": 3.8,
|
||||||
|
"reviews_count": 30,
|
||||||
|
"phone": "",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Café C",
|
||||||
|
"website": "",
|
||||||
|
"rating": 2.5,
|
||||||
|
"reviews_count": 5,
|
||||||
|
"phone": "+33 3 20 00 00 03",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Bar D avec site",
|
||||||
|
"website": "https://bar-d.fr",
|
||||||
|
"rating": 4.2,
|
||||||
|
"reviews_count": 80,
|
||||||
|
"phone": "+33 3 20 00 00 04",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
scorer = LeadScorer()
|
||||||
|
results = scorer.score_leads(test_leads)
|
||||||
|
|
||||||
|
print("\n=== Résultats scoring ===")
|
||||||
|
for r in results:
|
||||||
|
print(f" [{r['score']:2d}/8] {r['name']}")
|
||||||
393
leadhunter_scraper.py
Normal file
393
leadhunter_scraper.py
Normal file
@@ -0,0 +1,393 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
H3R7Tech — LeadHunter Scraper
|
||||||
|
================================
|
||||||
|
Agent de scraping pour la détection de restaurants sans site web
|
||||||
|
dans la MEL (Métropole Européenne de Lille).
|
||||||
|
|
||||||
|
Sources :
|
||||||
|
- Google Places API (primary)
|
||||||
|
- OpenStreetMap / Overpass API (fallback)
|
||||||
|
|
||||||
|
Quota Google Places Free Tier :
|
||||||
|
- 28 500 requêtes/mois ≈ 950/jour
|
||||||
|
- Compteur persistent dans /home/h3r7/leadhunter_quota.json
|
||||||
|
|
||||||
|
Auteur: H3R7Tech Backend Engineer
|
||||||
|
Issue: HRT-66
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import requests
|
||||||
|
from datetime import date, datetime
|
||||||
|
from logging.handlers import RotatingFileHandler
|
||||||
|
|
||||||
|
# ─── Logging ────────────────────────────────────────────────────────────────
|
||||||
|
logger = logging.getLogger("leadhunter.scraper")
|
||||||
|
|
||||||
|
_handler = RotatingFileHandler(
|
||||||
|
"/home/h3r7/leadhunter.log",
|
||||||
|
maxBytes=5 * 1024 * 1024, # 5 MB
|
||||||
|
backupCount=3,
|
||||||
|
)
|
||||||
|
_handler.setFormatter(
|
||||||
|
logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
|
||||||
|
)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
if not logger.handlers:
|
||||||
|
logger.addHandler(_handler)
|
||||||
|
logger.addHandler(logging.StreamHandler())
|
||||||
|
|
||||||
|
# ─── Configuration ───────────────────────────────────────────────────────────
|
||||||
|
GOOGLE_PLACES_API_KEY = os.environ.get("GOOGLE_PLACES_API_KEY")
|
||||||
|
|
||||||
|
# Quota journalier Google Places Free Tier
|
||||||
|
DAILY_QUOTA_FILE = "/home/h3r7/leadhunter_quota.json"
|
||||||
|
DAILY_QUOTA_LIMIT = 900 # marge de sécurité vs les 950 théoriques
|
||||||
|
|
||||||
|
# Délai entre requêtes Places pour éviter rate-limiting
|
||||||
|
PLACES_SLEEP_S = 0.5
|
||||||
|
|
||||||
|
# Bounding box MEL (Métropole Européenne de Lille)
|
||||||
|
MEL_CENTER_LAT = 50.6292
|
||||||
|
MEL_CENTER_LNG = 3.0573
|
||||||
|
MEL_RADIUS_M = 20000 # 20 km autour de Lille
|
||||||
|
|
||||||
|
# Types de lieux ciblés
|
||||||
|
TARGET_TYPES = ["restaurant", "cafe", "bar", "bakery", "food"]
|
||||||
|
|
||||||
|
# Overpass API endpoint
|
||||||
|
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
|
||||||
|
|
||||||
|
# Requête Overpass MEL — boundary nommée "Métropole Européenne de Lille"
|
||||||
|
OVERPASS_MEL_QUERY = """
|
||||||
|
[out:json][timeout:60];
|
||||||
|
area["name"="Métropole Européenne de Lille"]["boundary"="administrative"]->.mel;
|
||||||
|
(
|
||||||
|
node["amenity"~"^(restaurant|cafe|bar|fast_food|bakery)$"]["website"!~".+"](area.mel);
|
||||||
|
way["amenity"~"^(restaurant|cafe|bar|fast_food|bakery)$"]["website"!~".+"](area.mel);
|
||||||
|
);
|
||||||
|
out center 200;
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Quota Manager ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _load_quota() -> dict:
|
||||||
|
"""Charge le compteur quotidien depuis le fichier JSON."""
|
||||||
|
today = str(date.today())
|
||||||
|
if os.path.exists(DAILY_QUOTA_FILE):
|
||||||
|
try:
|
||||||
|
with open(DAILY_QUOTA_FILE, "r") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
if data.get("date") == today:
|
||||||
|
return data
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Impossible de lire le fichier quota : {e}")
|
||||||
|
return {"date": today, "count": 0}
|
||||||
|
|
||||||
|
|
||||||
|
def _save_quota(data: dict) -> None:
|
||||||
|
"""Persiste le compteur quotidien."""
|
||||||
|
try:
|
||||||
|
with open(DAILY_QUOTA_FILE, "w") as f:
|
||||||
|
json.dump(data, f)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Impossible d'écrire le fichier quota : {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def _increment_quota(n: int = 1) -> int:
|
||||||
|
"""Incrémente le compteur et retourne le total du jour."""
|
||||||
|
quota = _load_quota()
|
||||||
|
quota["count"] += n
|
||||||
|
_save_quota(quota)
|
||||||
|
return quota["count"]
|
||||||
|
|
||||||
|
|
||||||
|
def _quota_remaining() -> int:
|
||||||
|
"""Retourne le nombre de requêtes restantes pour aujourd'hui."""
|
||||||
|
quota = _load_quota()
|
||||||
|
return max(0, DAILY_QUOTA_LIMIT - quota["count"])
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Google Places Scraper ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class GooglePlacesScraper:
|
||||||
|
"""
|
||||||
|
Scraping via Google Places API (Nearby Search + Place Details).
|
||||||
|
Filtre les lieux sans site web côté API.
|
||||||
|
"""
|
||||||
|
|
||||||
|
BASE_URL = "https://maps.googleapis.com/maps/api/place"
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
if not GOOGLE_PLACES_API_KEY:
|
||||||
|
raise EnvironmentError(
|
||||||
|
"GOOGLE_PLACES_API_KEY non définie. "
|
||||||
|
"Ajouter dans /home/h3r7/.env et relancer."
|
||||||
|
)
|
||||||
|
self.api_key = GOOGLE_PLACES_API_KEY
|
||||||
|
|
||||||
|
def _nearby_search(self, place_type: str, page_token: str = None) -> dict:
|
||||||
|
"""Appel Nearby Search — 1 requête comptabilisée."""
|
||||||
|
params = {
|
||||||
|
"key": self.api_key,
|
||||||
|
"location": f"{MEL_CENTER_LAT},{MEL_CENTER_LNG}",
|
||||||
|
"radius": MEL_RADIUS_M,
|
||||||
|
"type": place_type,
|
||||||
|
}
|
||||||
|
if page_token:
|
||||||
|
params["pagetoken"] = page_token
|
||||||
|
|
||||||
|
_increment_quota()
|
||||||
|
time.sleep(PLACES_SLEEP_S)
|
||||||
|
|
||||||
|
try:
|
||||||
|
resp = requests.get(
|
||||||
|
f"{self.BASE_URL}/nearbysearch/json",
|
||||||
|
params=params,
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"NearbySearch error (type={place_type}): {e}")
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def _place_details(self, place_id: str) -> dict:
|
||||||
|
"""Place Details pour récupérer website, phone, rating, etc. — 1 requête."""
|
||||||
|
params = {
|
||||||
|
"key": self.api_key,
|
||||||
|
"place_id": place_id,
|
||||||
|
"fields": "name,formatted_address,formatted_phone_number,website,rating,user_ratings_total",
|
||||||
|
}
|
||||||
|
|
||||||
|
_increment_quota()
|
||||||
|
time.sleep(PLACES_SLEEP_S)
|
||||||
|
|
||||||
|
try:
|
||||||
|
resp = requests.get(
|
||||||
|
f"{self.BASE_URL}/details/json",
|
||||||
|
params=params,
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json().get("result", {})
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"PlaceDetails error (place_id={place_id}): {e}")
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def scrape(self, max_leads: int = 50) -> list[dict]:
|
||||||
|
"""
|
||||||
|
Scrape les restaurants/cafés/bars MEL sans site web.
|
||||||
|
|
||||||
|
Retourne une liste de dicts normalisés compatibles LeadHunter CRM :
|
||||||
|
source, name, address, phone, rating, reviews_count, website, rgpd_ok
|
||||||
|
"""
|
||||||
|
leads = []
|
||||||
|
seen_ids = set()
|
||||||
|
|
||||||
|
for place_type in TARGET_TYPES:
|
||||||
|
if _quota_remaining() < 10:
|
||||||
|
logger.warning(
|
||||||
|
"Quota journalier presque épuisé — arrêt scraping Google Places."
|
||||||
|
)
|
||||||
|
break
|
||||||
|
|
||||||
|
logger.info(f"Scraping Google Places — type={place_type}")
|
||||||
|
page_token = None
|
||||||
|
|
||||||
|
while True:
|
||||||
|
if _quota_remaining() < 5:
|
||||||
|
logger.warning("Quota insuffisant pour continuer la pagination.")
|
||||||
|
break
|
||||||
|
|
||||||
|
data = self._nearby_search(place_type, page_token)
|
||||||
|
results = data.get("results", [])
|
||||||
|
|
||||||
|
for place in results:
|
||||||
|
if len(leads) >= max_leads:
|
||||||
|
break
|
||||||
|
|
||||||
|
place_id = place.get("place_id", "")
|
||||||
|
if not place_id or place_id in seen_ids:
|
||||||
|
continue
|
||||||
|
seen_ids.add(place_id)
|
||||||
|
|
||||||
|
if _quota_remaining() < 2:
|
||||||
|
logger.warning("Quota épuisé avant details.")
|
||||||
|
break
|
||||||
|
|
||||||
|
details = self._place_details(place_id)
|
||||||
|
|
||||||
|
# Filtre : on ne garde que les lieux SANS site web
|
||||||
|
if details.get("website"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
lead = {
|
||||||
|
"source": "google_places",
|
||||||
|
"name": details.get("name") or place.get("name", ""),
|
||||||
|
"address": details.get("formatted_address")
|
||||||
|
or place.get("vicinity", ""),
|
||||||
|
"phone": details.get("formatted_phone_number", ""),
|
||||||
|
"rating": details.get("rating") or place.get("rating"),
|
||||||
|
"reviews_count": details.get("user_ratings_total")
|
||||||
|
or place.get("user_ratings_total"),
|
||||||
|
"website": "",
|
||||||
|
"rgpd_ok": True, # Données publiques Google Places uniquement
|
||||||
|
}
|
||||||
|
leads.append(lead)
|
||||||
|
logger.info(f"Lead trouvé (Google Places) : {lead['name']}")
|
||||||
|
|
||||||
|
if len(leads) >= max_leads:
|
||||||
|
break
|
||||||
|
|
||||||
|
page_token = data.get("next_page_token")
|
||||||
|
if not page_token:
|
||||||
|
break
|
||||||
|
|
||||||
|
# L'API Google Places nécessite un délai avant d'utiliser next_page_token
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
logger.info(f"Google Places : {len(leads)} leads collectés.")
|
||||||
|
return leads
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Overpass / OSM Fallback ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class OverpassScraper:
|
||||||
|
"""
|
||||||
|
Fallback OSM via Overpass API.
|
||||||
|
Cible les nœuds/ways dans la boundary MEL sans attribut 'website'.
|
||||||
|
Données publiques ODbL — RGPD OK.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def scrape(self, max_leads: int = 100) -> list[dict]:
|
||||||
|
"""
|
||||||
|
Scrape via Overpass API — retourne des leads normalisés.
|
||||||
|
"""
|
||||||
|
logger.info("Scraping Overpass OSM — boundary MEL")
|
||||||
|
leads = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
resp = requests.post(
|
||||||
|
OVERPASS_URL,
|
||||||
|
data={"data": OVERPASS_MEL_QUERY},
|
||||||
|
timeout=90,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
data = resp.json()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Overpass API error : {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
elements = data.get("elements", [])
|
||||||
|
logger.info(f"Overpass : {len(elements)} éléments bruts reçus.")
|
||||||
|
|
||||||
|
for el in elements[:max_leads]:
|
||||||
|
tags = el.get("tags", {})
|
||||||
|
|
||||||
|
# Coordonnées (pour les ways, Overpass retourne 'center')
|
||||||
|
lat = el.get("lat") or (el.get("center") or {}).get("lat")
|
||||||
|
lon = el.get("lon") or (el.get("center") or {}).get("lon")
|
||||||
|
|
||||||
|
name = tags.get("name", "")
|
||||||
|
if not name:
|
||||||
|
continue # Ignorer les lieux sans nom
|
||||||
|
|
||||||
|
addr_parts = [
|
||||||
|
tags.get("addr:housenumber", ""),
|
||||||
|
tags.get("addr:street", ""),
|
||||||
|
tags.get("addr:city", ""),
|
||||||
|
tags.get("addr:postcode", ""),
|
||||||
|
]
|
||||||
|
address = " ".join(p for p in addr_parts if p).strip()
|
||||||
|
if not address and lat and lon:
|
||||||
|
address = f"{lat:.4f},{lon:.4f}"
|
||||||
|
|
||||||
|
lead = {
|
||||||
|
"source": "osm",
|
||||||
|
"name": name,
|
||||||
|
"address": address,
|
||||||
|
"phone": tags.get("phone", tags.get("contact:phone", "")),
|
||||||
|
"rating": None,
|
||||||
|
"reviews_count": None,
|
||||||
|
"website": "",
|
||||||
|
"rgpd_ok": True, # Données publiques ODbL
|
||||||
|
}
|
||||||
|
leads.append(lead)
|
||||||
|
logger.info(f"Lead trouvé (OSM) : {lead['name']}")
|
||||||
|
|
||||||
|
logger.info(f"Overpass : {len(leads)} leads collectés.")
|
||||||
|
return leads
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Orchestrateur ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def run_scraping(
|
||||||
|
max_leads: int = 100, use_google: bool = True, use_osm: bool = True
|
||||||
|
) -> list[dict]:
|
||||||
|
"""
|
||||||
|
Lance le scraping Google Places + fallback OSM.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
max_leads: nombre maximum de leads à collecter au total.
|
||||||
|
use_google: activer Google Places (nécessite GOOGLE_PLACES_API_KEY).
|
||||||
|
use_osm: activer le fallback Overpass OSM.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Liste de leads normalisés (dédupliqués par nom + adresse).
|
||||||
|
"""
|
||||||
|
all_leads = []
|
||||||
|
seen_keys = set()
|
||||||
|
|
||||||
|
def _dedup_key(lead: dict) -> str:
|
||||||
|
return f"{lead['name'].lower().strip()}|{lead['address'].lower().strip()[:40]}"
|
||||||
|
|
||||||
|
if use_google:
|
||||||
|
try:
|
||||||
|
scraper = GooglePlacesScraper()
|
||||||
|
google_leads = scraper.scrape(max_leads=max_leads)
|
||||||
|
for lead in google_leads:
|
||||||
|
k = _dedup_key(lead)
|
||||||
|
if k not in seen_keys:
|
||||||
|
seen_keys.add(k)
|
||||||
|
all_leads.append(lead)
|
||||||
|
except EnvironmentError as e:
|
||||||
|
logger.warning(f"Google Places désactivé : {e}")
|
||||||
|
use_google = False
|
||||||
|
|
||||||
|
remaining = max_leads - len(all_leads)
|
||||||
|
if use_osm and remaining > 0:
|
||||||
|
osm_leads = OverpassScraper().scrape(max_leads=remaining)
|
||||||
|
for lead in osm_leads:
|
||||||
|
k = _dedup_key(lead)
|
||||||
|
if k not in seen_keys:
|
||||||
|
seen_keys.add(k)
|
||||||
|
all_leads.append(lead)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"run_scraping terminé — {len(all_leads)} leads uniques "
|
||||||
|
f"(Google={use_google}, OSM={use_osm}). "
|
||||||
|
f"Quota restant aujourd'hui : {_quota_remaining()}"
|
||||||
|
)
|
||||||
|
return all_leads
|
||||||
|
|
||||||
|
|
||||||
|
# ─── CLI (debug) ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
assert GOOGLE_PLACES_API_KEY, (
|
||||||
|
"GOOGLE_PLACES_API_KEY manquante — "
|
||||||
|
"ajouter 'export GOOGLE_PLACES_API_KEY=xxx' dans /home/h3r7/.env"
|
||||||
|
)
|
||||||
|
leads = run_scraping(max_leads=10)
|
||||||
|
for i, l in enumerate(leads, 1):
|
||||||
|
print(f"{i:02d}. [{l['source']}] {l['name']} — {l['address']}")
|
||||||
Reference in New Issue
Block a user