From f9a45e6deb64728cc76c2e0ae5f2d95c14f3ec1e Mon Sep 17 00:00:00 2001 From: DevOps Engineer Date: Mon, 27 Apr 2026 16:33:30 +0200 Subject: [PATCH 1/3] =?UTF-8?q?feat(HRT-66):=20LeadHunter=20S1=20=E2=80=94?= =?UTF-8?q?=20core=20scraping,=20scoring,=20CRM=20SQLite=20et=20API=20Flas?= =?UTF-8?q?k?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - leadhunter_scraper.py : Google Places Nearby Search + Place Details avec compteur quota leadhunter_quota.json (limite 900/jour), sleep(0.5) entre requêtes, fallback Overpass OSM boundary MEL, filtre website absent, déduplication, rgpd_ok=True - leadhunter_scorer.py : moteur de scoring 0-8 pts critère n°1 = site web absent (+3), avis ≥50 (+2), note ≥4.0 (+2), téléphone (+1), note <3.0 (-1) - leadhunter_crm.py : CRM SQLite schéma validé CTO (id, source, name, address, phone, rating, reviews_count, website, score, rgpd_ok, scraped_at, status) CRUD : insert_lead, get_leads, update_lead_status, get_stats, export_csv - leadhunter_api.py : Flask service port 8769 GET /api/leads, POST /api/leads/scrape, GET /api/leads/stats, GET /api/leads/export, PATCH /api/leads/{lead_id}/status, GET /health assert GOOGLE_PLACES_API_KEY au démarrage scraping asynchrone (thread) avec status endpoint - infra/turf-saas-leadhunter.service : service systemd EnvironmentFile=/home/h3r7/.env pour GOOGLE_PLACES_API_KEY Tests : py_compile OK, scorer testé, CRM SQLite testé Co-Authored-By: Paperclip --- infra/turf-saas-leadhunter.service | 21 ++ leadhunter_api.py | 303 ++++++++++++++++++++++ leadhunter_crm.py | 349 +++++++++++++++++++++++++ leadhunter_scorer.py | 193 ++++++++++++++ leadhunter_scraper.py | 393 +++++++++++++++++++++++++++++ 5 files changed, 1259 insertions(+) create mode 100644 infra/turf-saas-leadhunter.service create mode 100644 leadhunter_api.py create mode 100644 leadhunter_crm.py create mode 100644 leadhunter_scorer.py create mode 100644 leadhunter_scraper.py diff --git a/infra/turf-saas-leadhunter.service 
b/infra/turf-saas-leadhunter.service
new file mode 100644
index 0000000..5620a8f
--- /dev/null
+++ b/infra/turf-saas-leadhunter.service
@@ -0,0 +1,21 @@
+[Unit]
+Description=H3R7Tech LeadHunter API (Port 8769)
+Documentation=https://portal-kolifee.duckdns.org
+After=network.target
+
+[Service]
+Type=simple
+User=h3r7
+WorkingDirectory=/home/h3r7/turf_saas
+
+# Load environment variables from /home/h3r7/.env
+# (notably GOOGLE_PLACES_API_KEY); plain KEY=VALUE lines, no `export` prefix
+EnvironmentFile=/home/h3r7/.env
+
+ExecStart=/home/h3r7/turf_saas/venv/bin/python3 /home/h3r7/turf_saas/leadhunter_api.py
+Restart=always
+RestartSec=10
+Environment=PYTHONPATH=/home/h3r7/turf_saas
+
+[Install]
+WantedBy=multi-user.target
diff --git a/leadhunter_api.py b/leadhunter_api.py
new file mode 100644
index 0000000..d720ebd
--- /dev/null
+++ b/leadhunter_api.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python3
+"""
+H3R7Tech — LeadHunter API
+===========================
+Flask service on port 8769 exposing the LeadHunter endpoints.
+
+Endpoints:
+    GET   /api/leads                   — List leads (filters: status, limit, offset)
+    POST  /api/leads/scrape            — Start an asynchronous scraping job
+    GET   /api/leads/stats             — Global CRM statistics
+    GET   /api/leads/export            — CSV export of the leads
+    PATCH /api/leads/<lead_id>/status  — Update the status of a lead
+
+Port: 8769 (approved by the CTO — free on the VPS, distinct from 8768/8765/5000/8792)
+
+Author: H3R7Tech Backend Engineer
+Issue: HRT-66
+"""
+
+import os
+import threading
+import logging
+from datetime import datetime, timezone
+from logging.handlers import RotatingFileHandler
+from flask import Flask, jsonify, request, Response
+from flask_cors import CORS
+
+# LeadHunter modules
+from leadhunter_crm import (
+    init_db,
+    insert_leads,
+    get_leads,
+    get_lead_by_id,
+    update_lead_status,
+    get_stats,
+    export_csv,
+    VALID_STATUSES,
+    DB_PATH,
+)
+from leadhunter_scraper import run_scraping, GOOGLE_PLACES_API_KEY
+from leadhunter_scorer import LeadScorer
+
+# ─── Startup checks ──────────────────────────────────────────────────────────
+# Mandatory: the API key must be present at startup (explicit raise: `assert` is stripped by -O).
+# The hint omits `export`: systemd's EnvironmentFile= only accepts plain KEY=VALUE lines.
+if not os.environ.get("GOOGLE_PLACES_API_KEY"):
+    raise RuntimeError(
+        "GOOGLE_PLACES_API_KEY manquante. "
+        "Ajouter dans /home/h3r7/.env : GOOGLE_PLACES_API_KEY=xxx"
+    )
+
+# ─── Logging ────────────────────────────────────────────────────────────────
+logger = logging.getLogger("leadhunter.api")
+
+_handler = RotatingFileHandler(
+    "/home/h3r7/leadhunter.log",
+    maxBytes=5 * 1024 * 1024,
+    backupCount=3,
+)
+_handler.setFormatter(
+    logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
+)
+logger.setLevel(logging.INFO)
+if not logger.handlers:
+    logger.addHandler(_handler)
+    logger.addHandler(logging.StreamHandler())
+
+# ─── Flask app ───────────────────────────────────────────────────────────────
+app = Flask(__name__)
+CORS(app)
+
+# Scorer singleton
+scorer = LeadScorer()
+
+# Global state of the scraping job (plain flags — no Celery needed for the POC)
+_scrape_job = {
+    "running": False,
+    "last_run": None,
+    "last_count": 0,
+    "last_error": None,
+}
+_scrape_lock = threading.Lock()
+
+# ─── DB init ─────────────────────────────────────────────────────────────────
+init_db(DB_PATH)
+logger.info("LeadHunter API démarrée — DB initialisée.")
+
+
+# ─── Helpers ─────────────────────────────────────────────────────────────────
+
+
+def _run_scrape_job(max_leads: int, use_google: bool, use_osm: bool) -> None:
+    """Run one scrape → score → insert cycle; executed in a separate thread."""
+    with _scrape_lock:
+        _scrape_job["running"] = True
+        _scrape_job["last_error"] = None
+
+    try:
+        leads_raw = run_scraping(
+            max_leads=max_leads,
+            use_google=use_google,
+            use_osm=use_osm,
+        )
+        leads_scored = scorer.score_leads(leads_raw)
+        inserted_ids = insert_leads(leads_scored)
+
+        with _scrape_lock:
+            _scrape_job["last_count"] = len(inserted_ids)
+            _scrape_job["last_run"] = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+
+        logger.info(f"Scrape job terminé : {len(inserted_ids)} leads insérés.")
+
+    except Exception as e:
+        logger.warning(f"Scrape job erreur : {e}")
+        with _scrape_lock:
+            _scrape_job["last_error"] = str(e)
+
+    finally:
+        with _scrape_lock:
+            _scrape_job["running"] = False
+
+
+# ─── Routes ──────────────────────────────────────────────────────────────────
+
+
+@app.route("/api/leads", methods=["GET"])
+def api_get_leads():
+    """
+    List the leads stored in the CRM.
+
+    Query params:
+    - status (str, optional): filter on new/contacted/closed/rejected
+    - limit (int, default=50): page size; negatives are refused (SQLite reads them as "no limit")
+    - offset (int, default=0): page start; negatives are refused
+    """
+    status = request.args.get("status")
+    try:
+        limit = int(request.args.get("limit", 50))
+        offset = int(request.args.get("offset", 0))
+        if limit < 0 or offset < 0:
+            raise ValueError("negative pagination value")
+    except ValueError:
+        return jsonify({"error": "limit et offset doivent être des entiers positifs ou nuls"}), 400
+
+    if status and status not in VALID_STATUSES:
+        return jsonify(
+            {"error": f"status invalide. Valeurs acceptées : {VALID_STATUSES}"}
+        ), 400
+
+    leads = get_leads(status=status, limit=limit, offset=offset)
+    return jsonify(
+        {
+            "leads": leads,
+            "count": len(leads),
+            "limit": limit,
+            "offset": offset,
+            "status_filter": status,
+        }
+    )
+
+
+@app.route("/api/leads/scrape", methods=["POST"])
+def api_scrape():
+    """
+    Start an asynchronous scraping job.
+
+    JSON body (optional):
+    - max_leads (int, default=100)
+    - use_google (bool, default=true)
+    - use_osm (bool, default=true)
+
+    Returns at once: 202 when started, 409 if a job is running, 400 on a bad max_leads.
+    """
+    body = request.get_json(silent=True)
+    if not isinstance(body, dict):
+        body = {}
+    try:
+        max_leads = int(body.get("max_leads", 100))
+    except (TypeError, ValueError):
+        return jsonify({"error": "max_leads doit être un entier"}), 400
+    use_google = bool(body.get("use_google", True))
+    use_osm = bool(body.get("use_osm", True))
+
+    # Test and set the flag in one critical section: otherwise two concurrent
+    # requests could both read running=False and start two jobs.
+    with _scrape_lock:
+        if _scrape_job["running"]:
+            return jsonify(
+                {
+                    "status": "already_running",
+                    "message": "Un job de scraping est déjà en cours.",
+                }
+            ), 409
+        _scrape_job["running"] = True
+
+    thread = threading.Thread(
+        target=_run_scrape_job,
+        args=(max_leads, use_google, use_osm),
+        daemon=True,
+    )
+    try:
+        thread.start()
+    except RuntimeError:
+        # The job never ran, so nothing else will ever clear the flag.
+        with _scrape_lock:
+            _scrape_job["running"] = False
+        raise
+
+    logger.info(
+        f"Job de scraping lancé (max_leads={max_leads}, "
+        f"use_google={use_google}, use_osm={use_osm})"
+    )
+
+    return jsonify(
+        {
+            "status": "started",
+            "message": "Job de scraping démarré en arrière-plan.",
+            "params": {
+                "max_leads": max_leads,
+                "use_google": use_google,
+                "use_osm": use_osm,
+            },
+        }
+    ), 202
+
+
+@app.route("/api/leads/scrape/status", methods=["GET"])
+def api_scrape_status():
+    """Return a snapshot of the current scraping-job state."""
+    with _scrape_lock:
+        return jsonify(dict(_scrape_job))
+
+
+@app.route("/api/leads/stats", methods=["GET"])
+def api_stats():
+    """
+    Global statistics of the LeadHunter CRM.
+
+    Returns: total, by_status, by_source, avg_score, top_leads_count
+    """
+    stats = get_stats()
+    if not stats:
+        return jsonify({"error": "Impossible de calculer les statistiques"}), 500
+    return jsonify(stats)
+
+
+@app.route("/api/leads/export", methods=["GET"])
+def api_export():
+    """
+    CSV export of all leads (or of those matching a status).
+
+    Query params:
+    - status (str, optional)
+    """
+    status = request.args.get("status")
+    if status and status not in VALID_STATUSES:
+        return jsonify({"error": f"status invalide : {VALID_STATUSES}"}), 400
+
+    csv_content = export_csv(status=status)
+    filename = f"leadhunter_leads{'_' + status if status else ''}.csv"
+
+    return Response(
+        csv_content,
+        mimetype="text/csv",
+        headers={
+            "Content-Disposition": f"attachment; filename={filename}",
+            "Content-Type": "text/csv; charset=utf-8",
+        },
+    )
+
+
+@app.route("/api/leads/<int:lead_id>/status", methods=["PATCH"])
+def api_update_status(lead_id: int):
+    """
+    Update the status of a lead (lead_id comes from the URL).
+
+    JSON body:
+    - status (str): new | contacted | closed | rejected
+    """
+    body = request.get_json(silent=True)
+    if not isinstance(body, dict) or "status" not in body:
+        return jsonify({"error": "Body JSON requis avec le champ 'status'"}), 400
+
+    new_status = body["status"]
+    if not isinstance(new_status, str) or new_status not in VALID_STATUSES:
+        return jsonify({"error": f"status invalide. Valeurs : {VALID_STATUSES}"}), 400
+
+    lead = get_lead_by_id(lead_id)
+    if not lead:
+        return jsonify({"error": f"Lead id={lead_id} introuvable"}), 404
+
+    success = update_lead_status(lead_id, new_status)
+    if not success:
+        return jsonify({"error": "Mise à jour échouée"}), 500
+
+    return jsonify(
+        {
+            "success": True,
+            "lead_id": lead_id,
+            "new_status": new_status,
+        }
+    )
+
+
+@app.route("/health", methods=["GET"])
+def health():
+    """Healthcheck for systemd / monitoring."""
+    return jsonify(
+        {
+            "status": "ok",
+            "service": "leadhunter-api",
+            "port": 8769,
+        }
+    )
+
+
+# ─── Entrypoint ──────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=8769, debug=False)
diff --git a/leadhunter_crm.py b/leadhunter_crm.py
new file mode 100644
index 0000000..2094ebd
--- /dev/null
+++ b/leadhunter_crm.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python3
+"""
+H3R7Tech — LeadHunter CRM (SQLite)
+=====================================
+Couche 
de persistance SQLite pour les leads LeadHunter.
+
+Schéma validé CTO (HRT-66) :
+    CREATE TABLE leads (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        source TEXT NOT NULL, -- 'google_places' ou 'osm'
+        name TEXT NOT NULL,
+        address TEXT,
+        phone TEXT,
+        rating REAL,
+        reviews_count INTEGER,
+        website TEXT,
+        score INTEGER,
+        rgpd_ok BOOLEAN DEFAULT 1,
+        scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        status TEXT DEFAULT 'new' -- new, contacted, closed, rejected
+    );
+
+Auteur: H3R7Tech Backend Engineer
+Issue: HRT-66
+"""
+
+import sqlite3
+import logging
+import csv
+import io
+from contextlib import closing, contextmanager
+from datetime import datetime, timezone
+from logging.handlers import RotatingFileHandler
+from typing import Optional
+
+# ─── Logging ────────────────────────────────────────────────────────────────
+logger = logging.getLogger("leadhunter.crm")
+
+_handler = RotatingFileHandler(
+    "/home/h3r7/leadhunter.log",
+    maxBytes=5 * 1024 * 1024,
+    backupCount=3,
+)
+_handler.setFormatter(
+    logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
+)
+logger.setLevel(logging.INFO)
+if not logger.handlers:
+    logger.addHandler(_handler)
+    logger.addHandler(logging.StreamHandler())
+
+# ─── DB path ─────────────────────────────────────────────────────────────────
+DB_PATH = "/home/h3r7/leadhunter.db"
+
+# Valid statuses for a lead
+VALID_STATUSES = {"new", "contacted", "closed", "rejected"}
+
+
+# ─── Initialisation ──────────────────────────────────────────────────────────
+
+
+def init_db(db_path: str = DB_PATH) -> None:
+    """
+    Create the SQLite database and the leads table if missing (idempotent, safe at API start-up).
+    closing() is needed: a sqlite3 connection used as a context manager commits but never closes.
+    """
+    with closing(sqlite3.connect(db_path)) as conn:
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS leads (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                source TEXT NOT NULL,
+                name TEXT NOT NULL,
+                address TEXT,
+                phone TEXT,
+                rating REAL,
+                reviews_count INTEGER,
+                website TEXT,
+                score INTEGER,
+                rgpd_ok BOOLEAN DEFAULT 1,
+                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                status TEXT DEFAULT 'new'
+            )
+        """)
+        conn.commit()
+    logger.info(f"DB initialisée : {db_path}")
+
+
+# ─── Context manager ─────────────────────────────────────────────────────────
+
+
+@contextmanager
+def _get_conn(db_path: str = DB_PATH):
+    """Yield a SQLite connection with row_factory set; commit on success, roll back on error, always close."""
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    try:
+        yield conn
+        conn.commit()
+    except Exception as e:
+        conn.rollback()
+        logger.warning(f"DB transaction rollback : {e}")
+        raise
+    finally:
+        conn.close()
+
+
+# ─── CRUD ────────────────────────────────────────────────────────────────────
+
+
+def insert_lead(lead: dict, db_path: str = DB_PATH) -> Optional[int]:
+    """
+    Insert one normalised lead into the database.
+
+    Args:
+        lead: dict holding the normalised fields (source, name, address, ...)
+        db_path: path to the SQLite database.
+
+    Returns:
+        The SQLite id of the inserted lead, or None if the insert failed.
+    """
+    try:
+        with _get_conn(db_path) as conn:
+            cursor = conn.execute(
+                """
+                INSERT INTO leads
+                (source, name, address, phone, rating, reviews_count,
+                website, score, rgpd_ok, status)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    lead.get("source", "unknown"),
+                    lead.get("name", ""),
+                    lead.get("address", ""),
+                    lead.get("phone", ""),
+                    lead.get("rating"),
+                    lead.get("reviews_count"),
+                    lead.get("website", ""),
+                    lead.get("score"),
+                    1 if lead.get("rgpd_ok", True) else 0,
+                    lead.get("status", "new"),
+                ),
+            )
+            lead_id = cursor.lastrowid
+        logger.info(f"Lead inséré id={lead_id} : {lead.get('name')}")
+        return lead_id
+    except Exception as e:
+        logger.warning(f"insert_lead error : {e}")
+        return None
+
+
+def insert_leads(leads: list[dict], db_path: str = DB_PATH) -> list[int]:
+    """
+    Insert a list of leads one by one (each through insert_lead), skipping the ones that fail.
+
+    Returns:
+        The ids of the leads that were inserted.
+    """
+    ids = []
+    for lead in leads:
+        lead_id = insert_lead(lead, db_path)
+        if lead_id is not None:
+            ids.append(lead_id)
+    logger.info(f"insert_leads : {len(ids)}/{len(leads)} insérés.")
+    return ids
+
+
+def get_leads(
+    status: Optional[str] = None,
+    limit: int = 100,
+    offset: int = 0,
+    db_path: str = DB_PATH,
+) -> list[dict]:
+    """
+    Fetch leads, optionally filtered by status, best score first.
+
+    Args:
+        status: filter on the 'status' column (new, contacted, closed, rejected).
+        limit: pagination — maximum number of rows.
+        offset: pagination — number of rows to skip.
+
+    Returns:
+        A list of dicts (every column of the leads table); [] on a database error.
+    """
+    try:
+        with _get_conn(db_path) as conn:
+            if status:
+                rows = conn.execute(
+                    "SELECT * FROM leads WHERE status = ? ORDER BY score DESC, scraped_at DESC LIMIT ? OFFSET ?",
+                    (status, limit, offset),
+                ).fetchall()
+            else:
+                rows = conn.execute(
+                    "SELECT * FROM leads ORDER BY score DESC, scraped_at DESC LIMIT ? OFFSET ?",
+                    (limit, offset),
+                ).fetchall()
+            return [dict(r) for r in rows]
+    except Exception as e:
+        logger.warning(f"get_leads error : {e}")
+        return []
+
+
+def get_lead_by_id(lead_id: int, db_path: str = DB_PATH) -> Optional[dict]:
+    """Fetch one lead by id; None when it does not exist or on a database error."""
+    try:
+        with _get_conn(db_path) as conn:
+            row = conn.execute(
+                "SELECT * FROM leads WHERE id = ?", (lead_id,)
+            ).fetchone()
+            return dict(row) if row else None
+    except Exception as e:
+        logger.warning(f"get_lead_by_id error : {e}")
+        return None
+
+
+def update_lead_status(lead_id: int, status: str, db_path: str = DB_PATH) -> bool:
+    """
+    Update the status of a lead.
+
+    Args:
+        lead_id: id of the lead.
+        status: new status ('new', 'contacted', 'closed', 'rejected').
+
+    Returns:
+        True only if a row was actually updated; False on bad status, unknown id or DB error.
+    """
+    if status not in VALID_STATUSES:
+        logger.warning(f"update_lead_status : statut invalide '{status}'")
+        return False
+    try:
+        with _get_conn(db_path) as conn:
+            updated = conn.execute(
+                "UPDATE leads SET status = ? WHERE id = ?",
+                (status, lead_id),
+            ).rowcount
+        logger.info(f"Lead id={lead_id} statut → {status}")
+        return updated > 0
+    except Exception as e:
+        logger.warning(f"update_lead_status error : {e}")
+        return False
+
+
+def get_stats(db_path: str = DB_PATH) -> dict:
+    """
+    Return the global CRM statistics.
+
+    Returns:
+        Dict with total, by_status, by_source, avg_score, top_leads_count; {} on a database error.
+    """
+    try:
+        with _get_conn(db_path) as conn:
+            total = conn.execute("SELECT COUNT(*) FROM leads").fetchone()[0]
+
+            by_status_rows = conn.execute(
+                "SELECT status, COUNT(*) as cnt FROM leads GROUP BY status"
+            ).fetchall()
+            by_status = {r["status"]: r["cnt"] for r in by_status_rows}
+
+            by_source_rows = conn.execute(
+                "SELECT source, COUNT(*) as cnt FROM leads GROUP BY source"
+            ).fetchall()
+            by_source = {r["source"]: r["cnt"] for r in by_source_rows}
+
+            avg_score_row = conn.execute(
+                "SELECT AVG(score) FROM leads WHERE score IS NOT NULL"
+            ).fetchone()
+            avg_score = round(avg_score_row[0] or 0, 2)
+
+            # "Hot" leads = score ≥ 5
+            top_count = conn.execute(
+                "SELECT COUNT(*) FROM leads WHERE score >= 5"
+            ).fetchone()[0]
+
+            return {
+                "total": total,
+                "by_status": by_status,
+                "by_source": by_source,
+                "avg_score": avg_score,
+                "top_leads_count": top_count,
+                "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+            }
+    except Exception as e:
+        logger.warning(f"get_stats error : {e}")
+        return {}
+
+
+def export_csv(
+    status: Optional[str] = None,
+    db_path: str = DB_PATH,
+) -> str:
+    """
+    Export the leads as CSV text (at most the first 10000 rows returned by get_leads).
+
+    Args:
+        status: optional filter on the status.
+
+    Returns:
+        The CSV content as a str.
+    """
+    leads = get_leads(status=status, limit=10000, db_path=db_path)
+
+    output = io.StringIO()
+    fieldnames = [
+        "id",
+        "source",
+        "name",
+        "address",
+        "phone",
+        "rating",
+        "reviews_count",
+        "website",
+        "score",
+        "rgpd_ok",
+        "scraped_at",
+        "status",
+    ]
+    writer = csv.DictWriter(output, fieldnames=fieldnames, extrasaction="ignore")
+    writer.writeheader()
+    writer.writerows(leads)
+
+    logger.info(f"export_csv : {len(leads)} leads exportés.")
+    return output.getvalue()
+
+
+# ─── CLI (debug) ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    init_db()
+
+    # Insertion smoke test
+    test_lead = {
+        "source": "google_places",
+        "name": "Restaurant Test",
+        "address": "10 rue de la Paix, 59000 Lille",
+        "phone": "+33 3 20 00 00 01",
+        "rating": 4.5,
+        "reviews_count": 120,
+        "website": "",
+        "score": 8,
+        "rgpd_ok": True,
+        "status": "new",
+    }
+    lead_id = insert_lead(test_lead)
+    print(f"Lead inséré : id={lead_id}")
+
+    leads = get_leads()
+    print(f"Leads en DB : {len(leads)}")
+
+    stats = get_stats()
+    print(f"Stats : {stats}")
diff --git a/leadhunter_scorer.py b/leadhunter_scorer.py
new file mode 100644
index 0000000..0ec4507
--- /dev/null
+++ b/leadhunter_scorer.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+"""
+H3R7Tech — LeadHunter Scorer
+================================
+Moteur de scoring des leads restaurants MEL.
+
+Critères (ordre de priorité métier) :
+  1. [+3] Site web absent ← CRITIQUE : raison d'être du produit
+  2. [+2] Nombre d'avis élevé (≥ 50) : forte activité = bon prospect de vente
+  3. [+2] Note Google élevée (≥ 4.0) : établissement sérieux
+  4. [+1] Téléphone présent : facilite la prise de contact
+  5. 
[-1] Note faible (< 3.0) : risque reputationnel pour la prestation web
+
+Score maximum théorique : 8
+Score minimum : 0 (leads avec site web ne doivent pas passer ici)
+
+Auteur: H3R7Tech Backend Engineer
+Issue: HRT-66
+"""
+
+import logging
+from logging.handlers import RotatingFileHandler
+
+# ─── Logging ────────────────────────────────────────────────────────────────
+logger = logging.getLogger("leadhunter.scorer")
+
+_handler = RotatingFileHandler(
+    "/home/h3r7/leadhunter.log",
+    maxBytes=5 * 1024 * 1024,
+    backupCount=3,
+)
+_handler.setFormatter(
+    logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
+)
+logger.setLevel(logging.INFO)
+if not logger.handlers:
+    logger.addHandler(_handler)
+    logger.addHandler(logging.StreamHandler())
+
+
+# ─── Scorer ──────────────────────────────────────────────────────────────────
+
+
+class LeadScorer:
+    """
+    Computes the priority score of a lead.
+
+    The score is used to rank leads in the CRM:
+    - High score = hot prospect (no website + active + well rated)
+    - Low score = cold prospect (can be ignored or handled last)
+    """
+
+    def _calculate_score(self, lead: dict) -> int:
+        """
+        Compute the score of one lead.
+
+        Args:
+            lead: dict holding the normalised scraper fields
+                (name, website, rating, reviews_count, phone, ...)
+
+        Returns:
+            Integer score (0–8)
+        """
+        score = 0
+
+        # ── Criterion 1: no website [CRITICAL — core business rule] ──
+        # This is criterion no. 1: the product looks for restaurants WITHOUT
+        # a website, to offer them a site build at 800–1500€.
+        website = lead.get("website", "")
+        if not website or not website.strip():
+            score += 3
+            logger.debug(f"{lead.get('name')}: +3 (site web absent)")
+        else:
+            # A lead that has a website scores 0 right away.
+            # This should not happen (the scraper filters them out),
+            # but the check is kept as a defensive measure.
+            logger.warning(
+                f"{lead.get('name')}: site web présent ({website}), "
+                "lead ignoré pour scoring."
+            )
+            return 0
+
+        # ── Criterion 2: high review count (≥ 50) ───────────────────────────
+        reviews = lead.get("reviews_count")
+        if reviews is not None:
+            try:
+                reviews = int(reviews)
+                if reviews >= 50:
+                    score += 2
+                    logger.debug(f"{lead.get('name')}: +2 (avis ≥ 50 : {reviews})")
+            except (TypeError, ValueError) as e:
+                logger.warning(f"reviews_count invalide pour {lead.get('name')}: {e}")
+
+        # ── Criterion 3: good Google rating (≥ 4.0), penalty below 3.0 ──────
+        rating = lead.get("rating")
+        if rating is not None:
+            try:
+                rating = float(rating)
+                if rating >= 4.0:
+                    score += 2
+                    logger.debug(f"{lead.get('name')}: +2 (note ≥ 4.0 : {rating})")
+                elif rating < 3.0:
+                    score -= 1
+                    logger.debug(f"{lead.get('name')}: -1 (note < 3.0 : {rating})")
+            except (TypeError, ValueError) as e:
+                logger.warning(f"rating invalide pour {lead.get('name')}: {e}")
+
+        # ── Criterion 4: phone number present ────────────────────────────────
+        phone = lead.get("phone", "")
+        if phone and phone.strip():
+            score += 1
+            logger.debug(f"{lead.get('name')}: +1 (téléphone présent)")
+
+        # Floor at 0
+        score = max(0, score)
+        logger.info(f"Score calculé pour '{lead.get('name')}' : {score}/8")
+        return score
+
+    def score_lead(self, lead: dict) -> dict:
+        """
+        Enrich a lead with its score.
+
+        Args:
+            lead: normalised dict coming from the scraper.
+
+        Returns:
+            A shallow copy of the dict with the 'score' field added/updated.
+        """
+        lead = dict(lead)  # defensive copy: the caller's dict is left untouched
+        lead["score"] = self._calculate_score(lead)
+        return lead
+
+    def score_leads(self, leads: list[dict]) -> list[dict]:
+        """
+        Score a list of leads and sort it (highest score first).
+
+        Args:
+            leads: list of normalised dicts.
+
+        Returns:
+            New list sorted by descending score.
+        """
+        scored = [self.score_lead(lead) for lead in leads]
+        scored.sort(key=lambda l: l.get("score", 0), reverse=True)
+        logger.info(
+            f"score_leads terminé : {len(scored)} leads scorés. "
+            f"Score max = {scored[0]['score'] if scored else 0}, "
+            f"Score min = {scored[-1]['score'] if scored else 0}"
+        )
+        return scored
+
+
+# ─── CLI (debug) ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    # Quick smoke test, no API call involved
+    test_leads = [
+        {
+            "name": "Restaurant A",
+            "website": "",
+            "rating": 4.5,
+            "reviews_count": 120,
+            "phone": "+33 3 20 00 00 01",
+        },
+        {
+            "name": "Restaurant B",
+            "website": "",
+            "rating": 3.8,
+            "reviews_count": 30,
+            "phone": "",
+        },
+        {
+            "name": "Café C",
+            "website": "",
+            "rating": 2.5,
+            "reviews_count": 5,
+            "phone": "+33 3 20 00 00 03",
+        },
+        {
+            "name": "Bar D avec site",
+            "website": "https://bar-d.fr",
+            "rating": 4.2,
+            "reviews_count": 80,
+            "phone": "+33 3 20 00 00 04",
+        },
+    ]
+
+    scorer = LeadScorer()
+    results = scorer.score_leads(test_leads)
+
+    print("\n=== Résultats scoring ===")
+    for r in results:
+        print(f"  [{r['score']:2d}/8] {r['name']}")
diff --git a/leadhunter_scraper.py b/leadhunter_scraper.py
new file mode 100644
index 0000000..defcccd
--- /dev/null
+++ b/leadhunter_scraper.py
@@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+"""
+H3R7Tech — LeadHunter Scraper
+================================
+Agent de scraping pour la détection de restaurants sans site web
+dans la MEL (Métropole Européenne de Lille).
+
+Sources :
+  - Google Places API (primary)
+  - OpenStreetMap / Overpass API (fallback)
+
+Quota Google Places Free Tier :
+  - 28 500 requêtes/mois ≈ 950/jour
+  - Compteur persistent dans /home/h3r7/leadhunter_quota.json
+
+Auteur: H3R7Tech Backend Engineer
+Issue: HRT-66
+"""
+
+import os
+import json
+import time
+import logging
+import requests
+from datetime import date, datetime
+from logging.handlers import RotatingFileHandler
+
+# ─── Logging ────────────────────────────────────────────────────────────────
+logger = logging.getLogger("leadhunter.scraper")
+
+_handler = RotatingFileHandler(
+    "/home/h3r7/leadhunter.log",
+    maxBytes=5 * 1024 * 1024,  # 5 MB
+    backupCount=3,
+)
+_handler.setFormatter(
+    logging.Formatter("%(asctime)s %(levelname)-8s %(name)s — %(message)s")
+)
+logger.setLevel(logging.INFO)
+if not logger.handlers:
+    logger.addHandler(_handler)
+    logger.addHandler(logging.StreamHandler())
+
+# ─── Configuration ───────────────────────────────────────────────────────────
+GOOGLE_PLACES_API_KEY = os.environ.get("GOOGLE_PLACES_API_KEY")
+
+# Daily quota for the Google Places free tier
+DAILY_QUOTA_FILE = "/home/h3r7/leadhunter_quota.json"
+DAILY_QUOTA_LIMIT = 900  # safety margin below the theoretical 950
+
+# Delay between Places requests to avoid rate limiting
+PLACES_SLEEP_S = 0.5
+
+# Search circle covering the MEL (Métropole Européenne de Lille)
+MEL_CENTER_LAT = 50.6292
+MEL_CENTER_LNG = 3.0573
+MEL_RADIUS_M = 20000  # 20 km around Lille
+
+# Targeted place types
+TARGET_TYPES = ["restaurant", "cafe", "bar", "bakery", "food"]
+
+# Overpass API endpoint
+OVERPASS_URL = "https://overpass-api.de/api/interpreter"
+
+# Overpass query for the MEL — boundary named "Métropole Européenne de Lille"
+OVERPASS_MEL_QUERY = """
+[out:json][timeout:60];
+area["name"="Métropole Européenne de Lille"]["boundary"="administrative"]->.mel;
+(
+  node["amenity"~"^(restaurant|cafe|bar|fast_food|bakery)$"]["website"!~".+"](area.mel);
+  way["amenity"~"^(restaurant|cafe|bar|fast_food|bakery)$"]["website"!~".+"](area.mel);
+);
+out center 200;
+"""
+
+
+# ─── Quota Manager ───────────────────────────────────────────────────────────
+
+
+def _load_quota() -> dict:
+    """Load today's counter from the JSON file; a missing, unreadable or stale file yields count 0."""
+    today = str(date.today())
+    if os.path.exists(DAILY_QUOTA_FILE):
+        try:
+            with open(DAILY_QUOTA_FILE, "r") as f:
+                data = json.load(f)
+            if data.get("date") == today:
+                return data
+        except Exception as e:
+            logger.warning(f"Impossible de lire le fichier quota : {e}")
+    return {"date": today, "count": 0}
+
+
+def _save_quota(data: dict) -> None:
+    """Persist the daily counter (best effort: a write failure is only logged)."""
+    try:
+        with open(DAILY_QUOTA_FILE, "w") as f:
+            json.dump(data, f)
+    except Exception as e:
+        logger.warning(f"Impossible d'écrire le fichier quota : {e}")
+
+
+def _increment_quota(n: int = 1) -> int:
+    """Add n to today's counter, persist it and return the new total."""
+    quota = _load_quota()
+    quota["count"] += n
+    _save_quota(quota)
+    return quota["count"]
+
+
+def _quota_remaining() -> int:
+    """Return how many requests are left for today (never negative)."""
+    quota = _load_quota()
+    return max(0, DAILY_QUOTA_LIMIT - quota["count"])
+
+
+# ─── Google Places Scraper ────────────────────────────────────────────────────
+
+
+class GooglePlacesScraper:
+    """
+    Scraping through the Google Places API (Nearby Search + Place Details).
+    Keeps only the places whose Place Details carry no website.
+    """
+
+    BASE_URL = "https://maps.googleapis.com/maps/api/place"
+
+    def __init__(self):
+        if not GOOGLE_PLACES_API_KEY:
+            raise EnvironmentError(
+                "GOOGLE_PLACES_API_KEY non définie. "
+                "Ajouter dans /home/h3r7/.env et relancer."
+            )
+        self.api_key = GOOGLE_PLACES_API_KEY
+
+    def _nearby_search(self, place_type: str, page_token: str = None) -> dict:
+        """Nearby Search call — counted as 1 request; returns {} on any error."""
+        params = {
+            "key": self.api_key,
+            "location": f"{MEL_CENTER_LAT},{MEL_CENTER_LNG}",
+            "radius": MEL_RADIUS_M,
+            "type": place_type,
+        }
+        if page_token:
+            params["pagetoken"] = page_token
+
+        _increment_quota()
+        time.sleep(PLACES_SLEEP_S)
+
+        try:
+            resp = requests.get(
+                f"{self.BASE_URL}/nearbysearch/json",
+                params=params,
+                timeout=10,
+            )
+            resp.raise_for_status()
+            return resp.json()
+        except Exception as e:
+            logger.warning(f"NearbySearch error (type={place_type}): {e}")
+            return {}
+
+    def _place_details(self, place_id: str) -> dict:
+        """Place Details call (website, phone, rating, ...) — 1 request; returns {} on any error."""
+        params = {
+            "key": self.api_key,
+            "place_id": place_id,
+            "fields": "name,formatted_address,formatted_phone_number,website,rating,user_ratings_total",
+        }
+
+        _increment_quota()
+        time.sleep(PLACES_SLEEP_S)
+
+        try:
+            resp = requests.get(
+                f"{self.BASE_URL}/details/json",
+                params=params,
+                timeout=10,
+            )
+            resp.raise_for_status()
+            return resp.json().get("result", {})
+        except Exception as e:
+            logger.warning(f"PlaceDetails error (place_id={place_id}): {e}")
+            return {}
+
+    def scrape(self, max_leads: int = 50) -> list[dict]:
+        """
+        Scrape MEL restaurants/cafés/bars that have no website.
+
+        Returns a list of normalised dicts compatible with the LeadHunter CRM:
+        source, name, address, phone, rating, reviews_count, website, rgpd_ok
+        """
+        leads = []
+        seen_ids = set()
+
+        for place_type in TARGET_TYPES:
+            if _quota_remaining() < 10:
+                logger.warning(
+                    "Quota journalier presque épuisé — arrêt scraping Google Places."
+                )
+                break
+
+            logger.info(f"Scraping Google Places — type={place_type}")
+            page_token = None
+
+            # The cap is tested before each search so that remaining types do
+            # not spend quota-billed Nearby Search calls once it is reached.
+            while len(leads) < max_leads:
+                if _quota_remaining() < 5:
+                    logger.warning("Quota insuffisant pour continuer la pagination.")
+                    break
+
+                data = self._nearby_search(place_type, page_token)
+                results = data.get("results", [])
+
+                for place in results:
+                    if len(leads) >= max_leads:
+                        break
+
+                    place_id = place.get("place_id", "")
+                    if not place_id or place_id in seen_ids:
+                        continue
+                    seen_ids.add(place_id)
+
+                    if _quota_remaining() < 2:
+                        logger.warning("Quota épuisé avant details.")
+                        break
+
+                    details = self._place_details(place_id)
+
+                    # Filter: keep only the places WITHOUT a website
+                    if details.get("website"):
+                        continue
+
+                    lead = {
+                        "source": "google_places",
+                        "name": details.get("name") or place.get("name", ""),
+                        "address": details.get("formatted_address")
+                        or place.get("vicinity", ""),
+                        "phone": details.get("formatted_phone_number", ""),
+                        "rating": details.get("rating") or place.get("rating"),
+                        "reviews_count": details.get("user_ratings_total")
+                        or place.get("user_ratings_total"),
+                        "website": "",
+                        "rgpd_ok": True,  # public Google Places data only
+                    }
+                    leads.append(lead)
+                    logger.info(f"Lead trouvé (Google Places) : {lead['name']}")
+
+                if len(leads) >= max_leads:
+                    break
+
+                page_token = data.get("next_page_token")
+                if not page_token:
+                    break
+
+                # Google Places needs a short delay before next_page_token becomes usable
+                time.sleep(2)
+
+        logger.info(f"Google Places : {len(leads)} leads collectés.")
+        return leads
+
+
+# ─── Overpass / OSM Fallback ──────────────────────────────────────────────────
+
+
+class OverpassScraper:
+    """
+    OSM fallback through the Overpass API.
+    Targets nodes/ways inside the MEL boundary that have no 'website' tag.
+    Public ODbL data — flagged rgpd_ok.
+    """
+
+    def scrape(self, max_leads: int = 100) -> list[dict]:
+        """
+        Scrape through the Overpass API — returns at most max_leads normalised leads.
+        """
+        logger.info("Scraping Overpass OSM — boundary MEL")
+        leads = []
+
+        try:
+            resp = requests.post(
+                OVERPASS_URL,
+                data={"data": OVERPASS_MEL_QUERY},
+                timeout=90,
+            )
+            resp.raise_for_status()
+            data = resp.json()
+        except Exception as e:
+            logger.warning(f"Overpass API error : {e}")
+            return []
+
+        elements = data.get("elements", [])
+        logger.info(f"Overpass : {len(elements)} éléments bruts reçus.")
+
+        for el in elements:
+            if len(leads) >= max_leads:
+                break  # cap applies to kept leads, so nameless elements do not use it up
+            tags = el.get("tags", {})
+            # Coordinates (for ways, Overpass returns them under 'center')
+            lat = el.get("lat") or (el.get("center") or {}).get("lat")
+            lon = el.get("lon") or (el.get("center") or {}).get("lon")
+            name = tags.get("name", "")
+            if not name:
+                continue  # skip places without a name
+
+            addr_parts = [
+                tags.get("addr:housenumber", ""),
+                tags.get("addr:street", ""),
+                tags.get("addr:city", ""),
+                tags.get("addr:postcode", ""),
+            ]
+            address = " ".join(p for p in addr_parts if p).strip()
+            if not address and lat is not None and lon is not None:
+                address = f"{lat:.4f},{lon:.4f}"
+
+            lead = {
+                "source": "osm",
+                "name": name,
+                "address": address,
+                "phone": tags.get("phone", tags.get("contact:phone", "")),
+                "rating": None,
+                "reviews_count": None,
+                "website": "",
+                "rgpd_ok": True,  # public ODbL data
+            }
+            leads.append(lead)
+            logger.info(f"Lead trouvé (OSM) : {lead['name']}")
+
+        logger.info(f"Overpass : {len(leads)} leads collectés.")
+        return leads
+
+
+# ─── Orchestrator ─────────────────────────────────────────────────────────────
+
+
+def run_scraping(
+    max_leads: int = 100, use_google: bool = True, use_osm: bool = True
+) -> list[dict]:
+    """
+    Run the Google Places scraping, then the OSM fallback for the remainder.
+
+    Args:
+        max_leads: maximum number of leads to collect in total.
+        use_google: enable Google Places (requires GOOGLE_PLACES_API_KEY).
+        use_osm: enable the Overpass OSM fallback.
+
+    Returns:
+        List of normalised leads (deduplicated on name + first 40 chars of the address).
+    """
+    all_leads = []
+    seen_keys = set()
+
+    def _dedup_key(lead: dict) -> str:
+        return f"{lead['name'].lower().strip()}|{lead['address'].lower().strip()[:40]}"
+
+    if use_google:
+        try:
+            scraper = GooglePlacesScraper()
+            google_leads = scraper.scrape(max_leads=max_leads)
+            for lead in google_leads:
+                k = _dedup_key(lead)
+                if k not in seen_keys:
+                    seen_keys.add(k)
+                    all_leads.append(lead)
+        except EnvironmentError as e:
+            logger.warning(f"Google Places désactivé : {e}")
+            use_google = False
+
+    remaining = max_leads - len(all_leads)
+    if use_osm and remaining > 0:
+        osm_leads = OverpassScraper().scrape(max_leads=remaining)
+        for lead in osm_leads:
+            k = _dedup_key(lead)
+            if k not in seen_keys:
+                seen_keys.add(k)
+                all_leads.append(lead)
+
+    logger.info(
+        f"run_scraping terminé — {len(all_leads)} leads uniques "
+        f"(Google={use_google}, OSM={use_osm}). "
+        f"Quota restant aujourd'hui : {_quota_remaining()}"
+    )
+    return all_leads
+
+
+# ─── CLI (debug) ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    assert GOOGLE_PLACES_API_KEY, (
+        "GOOGLE_PLACES_API_KEY manquante — "
+        "ajouter 'export GOOGLE_PLACES_API_KEY=xxx' dans /home/h3r7/.env"
+    )
+    leads = run_scraping(max_leads=10)
+    for i, lead in enumerate(leads, 1):
+        print(f"{i:02d}. [{lead['source']}] {lead['name']} — {lead['address']}")
From 356bdf5bec1a8026463ff31a6d58a6f7201d2688 Mon Sep 17 00:00:00 2001
From: DevOps Engineer
Date: Mon, 27 Apr 2026 16:42:15 +0200
Subject: [PATCH 2/3] =?UTF-8?q?fix(leadhunter):=20change=20port=208769?=
 =?UTF-8?q?=E2=86=928770=20=E2=80=94=20conflit=20avec=20depenses=5Ftrello?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Port 8769 était occupé par /home/h3r7/depenses_trello/app.py (pid=2287989).
Mise à jour du port dans : - leadhunter_api.py (docstring, healthcheck, app.run) - infra/turf-saas-leadhunter.service (description) Ref: HRT-66 Co-Authored-By: Paperclip --- infra/turf-saas-leadhunter.service | 2 +- leadhunter_api.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/infra/turf-saas-leadhunter.service b/infra/turf-saas-leadhunter.service index 5620a8f..2f59e12 100644 --- a/infra/turf-saas-leadhunter.service +++ b/infra/turf-saas-leadhunter.service @@ -1,5 +1,5 @@ [Unit] -Description=H3R7Tech LeadHunter API (Port 8769) +Description=H3R7Tech LeadHunter API (Port 8770) Documentation=https://portal-kolifee.duckdns.org After=network.target diff --git a/leadhunter_api.py b/leadhunter_api.py index d720ebd..845db81 100644 --- a/leadhunter_api.py +++ b/leadhunter_api.py @@ -2,7 +2,7 @@ """ H3R7Tech — LeadHunter API =========================== -Service Flask sur port 8769 exposant les endpoints LeadHunter. +Service Flask sur port 8770 exposant les endpoints LeadHunter. 
Endpoints : GET /api/leads — Liste les leads (filtres: status, limit, offset) @@ -11,7 +11,7 @@ Endpoints : GET /api/leads/export — Export CSV des leads PATCH /api/leads//status — Met à jour le statut d'un lead -Port : 8769 (validé CTO — disponible sur VPS, hors 8768/8765/5000/8792) +Port : 8770 (8769 occupé par depenses_trello/app.py — corrigé HRT-66) Auteur: H3R7Tech Backend Engineer Issue: HRT-66 @@ -292,7 +292,7 @@ def health(): { "status": "ok", "service": "leadhunter-api", - "port": 8769, + "port": 8770, } ) @@ -300,4 +300,4 @@ def health(): # ─── Entrypoint ────────────────────────────────────────────────────────────── if __name__ == "__main__": - app.run(host="0.0.0.0", port=8769, debug=False) + app.run(host="0.0.0.0", port=8770, debug=False) From 4b4323f707d90cfddd45d084c6f61134c4edb475 Mon Sep 17 00:00:00 2001 From: DevOps Engineer Date: Mon, 27 Apr 2026 16:48:12 +0200 Subject: [PATCH 3/3] =?UTF-8?q?fix(leadhunter):=20change=20port=208770?= =?UTF-8?q?=E2=86=928775=20=E2=80=94=20port=208770=20occup=C3=A9=20par=20t?= =?UTF-8?q?urf=5Fscraper/crm=5Fapi.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port audit sur VPS (27/04/2026) : - 8769 : depenses_trello/app.py (PID 2287989) - 8770 : turf_scraper/crm_api.py (PID 2287988) ← port précédemment choisi, aussi occupé - 8775 : libre (vérifié via ss -tlnp | grep 8775 → vide) Fichiers modifiés : - leadhunter_api.py : lignes 5, 295, 303 (port 8770→8775) - infra/turf-saas-leadhunter.service : Description Port 8770→8775 Issue: HRT-66 Co-Authored-By: Paperclip --- infra/turf-saas-leadhunter.service | 2 +- leadhunter_api.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/infra/turf-saas-leadhunter.service b/infra/turf-saas-leadhunter.service index 2f59e12..8e7206b 100644 --- a/infra/turf-saas-leadhunter.service +++ b/infra/turf-saas-leadhunter.service @@ -1,5 +1,5 @@ [Unit] -Description=H3R7Tech LeadHunter API (Port 8770) +Description=H3R7Tech 
LeadHunter API (Port 8775) Documentation=https://portal-kolifee.duckdns.org After=network.target diff --git a/leadhunter_api.py b/leadhunter_api.py index 845db81..f6720da 100644 --- a/leadhunter_api.py +++ b/leadhunter_api.py @@ -2,7 +2,7 @@ """ H3R7Tech — LeadHunter API =========================== -Service Flask sur port 8770 exposant les endpoints LeadHunter. +Service Flask sur port 8775 exposant les endpoints LeadHunter. Endpoints : GET /api/leads — Liste les leads (filtres: status, limit, offset) @@ -11,7 +11,7 @@ Endpoints : GET /api/leads/export — Export CSV des leads PATCH /api/leads//status — Met à jour le statut d'un lead -Port : 8770 (8769 occupé par depenses_trello/app.py — corrigé HRT-66) +Port : 8775 (8769 occupé par depenses_trello/app.py, 8770 occupé par turf_scraper/crm_api.py — corrigé HRT-66) Auteur: H3R7Tech Backend Engineer Issue: HRT-66 @@ -292,7 +292,7 @@ def health(): { "status": "ok", "service": "leadhunter-api", - "port": 8770, + "port": 8775, } ) @@ -300,4 +300,4 @@ def health(): # ─── Entrypoint ────────────────────────────────────────────────────────────── if __name__ == "__main__": - app.run(host="0.0.0.0", port=8770, debug=False) + app.run(host="0.0.0.0", port=8775, debug=False)