#!/usr/bin/env python3
"""Archive v5_*.json scrape files dated before today.

Each matching file in TURF_DIR is moved to ARCHIVE_DIR/<YYYY>/<MM>/ and a
row describing it is recorded in the ``scraping_archives`` table of the
SQLite database at DB_PATH.

Usage: python3 archive_v5_files.py [--dry-run]
"""

import argparse
import os
import json
import sqlite3
import shutil
from datetime import datetime, date
from pathlib import Path
from typing import Optional

TURF_DIR = Path("/home/h3r7/turf_scraper")
ARCHIVE_DIR = TURF_DIR / "archive"
DB_PATH = TURF_DIR / "turf.db"
LOG_FILE = Path("/home/h3r7/logs/archive_v5.log")


def log(message: str):
    """Print a timestamped message and append it to LOG_FILE.

    The log directory is created on demand. The file is written as UTF-8
    explicitly because messages contain non-ASCII characters.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}"
    print(log_entry)
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(log_entry + "\n")


def init_archive_table():
    """Create the ``scraping_archives`` table if it does not exist yet."""
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS scraping_archives (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT UNIQUE NOT NULL,
                execution_date TEXT NOT NULL,
                runtime_sec REAL,
                total_pages INTEGER,
                file_size_kb INTEGER,
                archive_path TEXT,
                pages_success INTEGER DEFAULT 0,
                pages_error INTEGER DEFAULT 0,
                archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                archived_by TEXT DEFAULT 'cron_6h'
            )
        """)
        conn.commit()
    finally:
        # Close even if the DDL fails so the handle is never leaked.
        conn.close()


def extract_metadata(filepath: Path) -> dict:
    """Read summary fields from a scrape result file.

    Args:
        filepath: Path of the JSON file to inspect.

    Returns:
        A dict with ``runtime_sec``, ``total_pages``, ``pages_success`` and
        ``pages_error``. An empty dict is returned (and the problem logged)
        when the file cannot be read, is not valid JSON, or its top-level
        value is not a JSON object.
    """
    try:
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        log(f"Erreur lecture {filepath}: {e}")
        return {}

    if not isinstance(data, dict):
        log(
            f"Erreur lecture {filepath}: "
            f"racine JSON inattendue ({type(data).__name__})"
        )
        return {}

    # A missing, null or malformed "pages" entry must not discard the other
    # fields, so it is simply treated as "no pages".
    pages = data.get("pages")
    if not isinstance(pages, list):
        pages = []
    statuses = [p.get("status") for p in pages if isinstance(p, dict)]

    return {
        "runtime_sec": data.get("runtime_sec"),
        "total_pages": data.get("total_pages"),
        "pages_success": statuses.count("success"),
        "pages_error": statuses.count("error"),
    }


def _date_token(filename: str) -> Optional[str]:
    """Return the second ``_``-separated token of *filename* if it is dated.

    The token is taken from the file stem, so the ``.json`` extension never
    ends up inside it. It is accepted only when its first eight characters
    are ASCII digits forming a real calendar date (YYYYMMDD); otherwise
    ``None`` is returned.
    """
    parts = Path(filename).stem.split("_")
    if len(parts) < 2:
        return None
    token = parts[1]
    day = token[:8]
    if len(day) != 8 or not (day.isascii() and day.isdigit()):
        return None
    try:
        datetime.strptime(day, "%Y%m%d")
    except ValueError:
        return None
    return token


def _archive_one(conn, file: Path, date_token: str, target_path: Path):
    """Record *file* in the database and move it to *target_path*.

    The row is committed only after the move succeeded; the caller is
    expected to roll back when this function raises.
    """
    metadata = extract_metadata(file)
    file_size_kb = file.stat().st_size // 1024
    target_path.parent.mkdir(parents=True, exist_ok=True)
    conn.execute(
        """
        INSERT OR IGNORE INTO scraping_archives
            (filename, execution_date, runtime_sec, total_pages,
             file_size_kb, archive_path, pages_success, pages_error)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """,
        (
            file.name,
            date_token,
            metadata.get("runtime_sec"),
            metadata.get("total_pages"),
            file_size_kb,
            str(target_path),
            metadata.get("pages_success", 0),
            metadata.get("pages_error", 0),
        ),
    )
    shutil.move(str(file), str(target_path))
    conn.commit()


def archive_files(dry_run: bool = False):
    """Archive every ``v5_*.json`` file in TURF_DIR dated before today.

    Files whose name carries no valid YYYYMMDD date after the first
    underscore are left untouched. A file that cannot be archived is logged
    and skipped so that it does not block the remaining files.

    Args:
        dry_run: When true, only log what would be archived; no file is
            moved and the database is neither created nor modified.

    Returns:
        The number of files archived (or, in dry-run mode, the number that
        would have been archived).
    """
    today_str = date.today().strftime("%Y%m%d")

    conn = None
    if not dry_run:
        init_archive_table()
        conn = sqlite3.connect(DB_PATH)

    log("=== Début archivage v5 ===")
    archived_count = 0

    try:
        for file in sorted(TURF_DIR.glob("v5_*.json")):
            filename = file.name
            token = _date_token(filename)
            # Both sides are 8 ASCII digits, so string order is date order.
            if token is None or token[:8] >= today_str:
                continue

            target_path = ARCHIVE_DIR / token[:4] / token[4:6] / filename

            if dry_run:
                log(f"[dry-run] À archiver: {filename} → {target_path}")
                archived_count += 1
                continue

            try:
                _archive_one(conn, file, token, target_path)
            except (OSError, sqlite3.Error) as e:
                # Drop the uncommitted row so the DB never references a
                # file that was not actually moved.
                conn.rollback()
                log(f"Erreur archivage {filename}: {e}")
                continue

            log(f"Archivé: {filename} → {target_path}")
            archived_count += 1
    finally:
        if conn is not None:
            conn.close()

    log(f"=== Fin archivage: {archived_count} fichiers ===")
    return archived_count


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Simulation sans déplacement")
    args = parser.parse_args()
    archive_files(dry_run=args.dry_run)