# turf_saas/archive_v5_files.py

#!/usr/bin/env python3
"""
Archive les fichiers v5_*.json antérieurs à aujourd'hui
Usage: python3 archive_v5_files.py [--dry-run]
"""

import os
import json
import sqlite3
import shutil
from datetime import datetime, date
from pathlib import Path

# Directory scanned for v5_*.json files by archive_files().
TURF_DIR = Path("/home/h3r7/turf_scraper")
# Root of the archive tree; files are placed in <year>/<month> subdirectories.
ARCHIVE_DIR = TURF_DIR.joinpath("archive")
# SQLite database that holds the scraping_archives table.
DB_PATH = TURF_DIR.joinpath("turf.db")
# File that log() appends every message to.
LOG_FILE = Path("/home/h3r7/logs/archive_v5.log")

def log(message: str):
    """Print a timestamped message and append it to LOG_FILE.

    Args:
        message: Text to record. It is prefixed with the local time
            formatted as ``[YYYY-MM-DD HH:MM:SS]``.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}"
    print(log_entry)
    # Create the log directory on first use.
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Messages contain non-ASCII characters (accents, arrows); pin the
    # encoding so the write does not depend on the process locale.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(log_entry + "\n")

def init_archive_table():
    """Create the ``scraping_archives`` table in DB_PATH if it is missing.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this
    function repeatedly is harmless.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement
            fails. The connection is closed in every case.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS scraping_archives (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT UNIQUE NOT NULL,
            execution_date TEXT NOT NULL,
            runtime_sec REAL,
            total_pages INTEGER,
            file_size_kb INTEGER,
            archive_path TEXT,
            pages_success INTEGER DEFAULT 0,
            pages_error INTEGER DEFAULT 0,
            archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            archived_by TEXT DEFAULT 'cron_6h'
        )
    """)
        conn.commit()
    finally:
        # Release the database handle even when the DDL or commit fails.
        conn.close()

def extract_metadata(filepath: Path) -> dict:
    """Read run statistics from a v5 JSON result file.

    Args:
        filepath: Path of the JSON file to inspect.

    Returns:
        A dict with the keys ``runtime_sec`` and ``total_pages`` (copied
        from the file, ``None`` when absent) and ``pages_success`` /
        ``pages_error`` (number of entries of ``pages`` whose ``status``
        is ``"success"`` / ``"error"``). An empty dict is returned, and
        the error is logged, when the file cannot be read or parsed.
    """
    try:
        # JSON files are read as UTF-8 regardless of the process locale.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
        pages_success = 0
        pages_error = 0
        # A missing or null "pages" entry counts as an empty list so the
        # remaining metadata is still returned.
        for page in data.get("pages") or []:
            status = page.get("status")
            if status == "success":
                pages_success += 1
            elif status == "error":
                pages_error += 1
        return {
            "runtime_sec": data.get("runtime_sec"),
            "total_pages": data.get("total_pages"),
            "pages_success": pages_success,
            "pages_error": pages_error
        }
    except Exception as e:
        # Best effort: an unreadable file must not abort the caller.
        log(f"Erreur lecture {filepath}: {e}")
        return {}

def _parse_file_date(filename: str):
    """Return the ``YYYYMMDD`` date embedded in a ``v5_<date>...`` name, or None."""
    # Work on the stem so "v5_20240101.json" yields "20240101", not "20240101.json".
    parts = Path(filename).stem.split("_")
    if len(parts) < 2:
        return None
    candidate = parts[1][:8]
    if len(candidate) != 8 or not (candidate.isascii() and candidate.isdigit()):
        return None
    try:
        # Reject impossible dates such as 20241340.
        datetime.strptime(candidate, "%Y%m%d")
    except ValueError:
        return None
    return candidate


def archive_files(dry_run: bool = False):
    """Move v5_*.json files dated before today into the archive tree.

    Each matching file in TURF_DIR is moved to
    ``ARCHIVE_DIR/<year>/<month>/<filename>`` and recorded in the
    ``scraping_archives`` table. The row is committed right after the
    file has been moved, so a failure on a later file cannot discard the
    records of files that were already archived.

    Args:
        dry_run: When true, only log what would be archived; no directory
            is created, no row is inserted and no file is moved.

    Returns:
        The number of files archived (or that would be archived in a
        dry run).

    Raises:
        OSError, sqlite3.Error: If inserting the row or moving a file
            fails. The failure is logged, the pending row is rolled back
            and the exception is re-raised.
    """
    today_str = date.today().strftime("%Y%m%d")

    init_archive_table()
    log("=== Début archivage v5 ===")

    conn = sqlite3.connect(DB_PATH)
    archived_count = 0

    try:
        for file in sorted(TURF_DIR.glob("v5_*.json")):
            filename = file.name
            file_date_str = _parse_file_date(filename)
            if file_date_str is None:
                # Without a valid date neither the age check nor the
                # year/month target directory can be computed.
                log(f"Ignoré (date invalide): {filename}")
                continue

            # YYYYMMDD strings sort chronologically; keep today's files
            # and anything dated in the future.
            if file_date_str >= today_str:
                continue

            metadata = extract_metadata(file)
            file_size_kb = file.stat().st_size // 1024

            target_dir = ARCHIVE_DIR / file_date_str[:4] / file_date_str[4:6]
            target_path = target_dir / filename

            if not dry_run:
                target_dir.mkdir(parents=True, exist_ok=True)
                try:
                    conn.execute("""
                        INSERT OR IGNORE INTO scraping_archives
                        (filename, execution_date, runtime_sec, total_pages,
                         file_size_kb, archive_path, pages_success, pages_error)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    """, (
                        filename, file_date_str, metadata.get("runtime_sec"),
                        metadata.get("total_pages"), file_size_kb,
                        str(target_path), metadata.get("pages_success", 0),
                        metadata.get("pages_error", 0)
                    ))
                    shutil.move(str(file), str(target_path))
                except Exception as e:
                    # Drop the row of the file that could not be moved,
                    # then let the caller see the failure.
                    conn.rollback()
                    log(f"Erreur archivage {filename}: {e}")
                    raise
                # Persist the row as soon as the file is in place.
                conn.commit()

            log(f"Archivé: {filename} → {target_path}")
            archived_count += 1
    finally:
        conn.close()

    log(f"=== Fin archivage: {archived_count} fichiers ===")
    return archived_count

if __name__ == "__main__":
    import argparse

    # Command-line entry point: the only option is --dry-run, which is
    # forwarded to archive_files().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Simulation sans déplacement",
    )
    options = cli.parse_args()

    archive_files(dry_run=options.dry_run)