Files
turf_saas/archive_v5_files.py
2026-04-25 17:18:43 +02:00

128 lines
3.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Archive les fichiers v5_*.json antérieurs à aujourd'hui
Usage: python3 archive_v5_files.py [--dry-run]
"""
import os
import json
import sqlite3
import shutil
from datetime import datetime, date
from pathlib import Path
# Directory that is scanned for v5_*.json files to archive.
TURF_DIR = Path("/home/h3r7/turf_scraper")
# Root of the archive tree; files are filed under <year>/<month> below it.
ARCHIVE_DIR = TURF_DIR.joinpath("archive")
# SQLite database that holds the scraping_archives table.
DB_PATH = TURF_DIR.joinpath("turf.db")
# Text log that log() appends to.
LOG_FILE = Path("/home/h3r7/logs/archive_v5.log")
def log(message: str) -> None:
    """Print a timestamped message and append it to LOG_FILE.

    The parent directory of LOG_FILE is created on demand.

    Args:
        message: Text to record; it is prefixed with the local time formatted
            as ``[YYYY-MM-DD HH:MM:SS]``.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}"
    print(log_entry)
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Messages contain accented characters; pin the encoding so the write
    # does not depend on the locale of the process running the script.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(log_entry + "\n")
def init_archive_table() -> None:
    """Create the ``scraping_archives`` table in DB_PATH if it is missing.

    The table holds one row per archived file; ``filename`` is UNIQUE.
    The connection is closed even when the statement or the commit fails.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS scraping_archives (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT UNIQUE NOT NULL,
                execution_date TEXT NOT NULL,
                runtime_sec REAL,
                total_pages INTEGER,
                file_size_kb INTEGER,
                archive_path TEXT,
                pages_success INTEGER DEFAULT 0,
                pages_error INTEGER DEFAULT 0,
                archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                archived_by TEXT DEFAULT 'cron_6h'
            )
        """)
        conn.commit()
    finally:
        # Release the database handle on both the success and error paths.
        conn.close()
def extract_metadata(filepath: Path) -> dict:
    """Read a JSON result file and summarise it for the archive table.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A dict with ``runtime_sec`` and ``total_pages`` (copied from the
        top-level object, ``None`` when absent) plus ``pages_success`` and
        ``pages_error`` (the number of ``pages`` entries whose ``status`` is
        ``"success"`` / ``"error"``). An empty dict is returned, after
        logging, when the file cannot be read or parsed, or when its
        top-level value is not a JSON object.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the process locale.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log(f"Erreur lecture {filepath}: {e}")
        return {}

    if not isinstance(data, dict):
        log(
            f"Erreur lecture {filepath}: objet JSON attendu, "
            f"{type(data).__name__} reçu"
        )
        return {}

    pages = data.get("pages")
    if not isinstance(pages, list):
        # A missing or null "pages" must not discard the other fields.
        pages = []
    # Single pass over the pages; entries that are not objects are ignored.
    statuses = [page.get("status") for page in pages if isinstance(page, dict)]

    return {
        "runtime_sec": data.get("runtime_sec"),
        "total_pages": data.get("total_pages"),
        "pages_success": statuses.count("success"),
        "pages_error": statuses.count("error"),
    }
def _parse_file_date(filename: str):
    """Return the YYYYMMDD stamp of a ``v5_<YYYYMMDD>...`` filename, or None.

    The stamp is the first eight characters of the second ``_``-separated
    component. None is returned when those characters are not eight ASCII
    digits forming a valid calendar date.
    """
    parts = filename.split("_")
    if len(parts) < 2:
        return None
    stamp = parts[1][:8]
    if len(stamp) != 8 or not (stamp.isascii() and stamp.isdigit()):
        return None
    try:
        datetime.strptime(stamp, "%Y%m%d")
    except ValueError:
        return None
    return stamp


def archive_files(dry_run: bool = False) -> int:
    """Move v5_*.json files dated before today into the archive tree.

    Each matching file in TURF_DIR is moved to
    ``ARCHIVE_DIR/<YYYY>/<MM>/<filename>`` and recorded in the
    ``scraping_archives`` table. Files whose name carries no valid
    YYYYMMDD stamp, or a stamp of today or later, are left in place.

    Args:
        dry_run: When True, nothing is moved and no row is inserted; the
            files that would be archived are only logged and counted.

    Returns:
        The number of files archived (or that would be, in dry-run mode).
    """
    today_str = date.today().strftime("%Y%m%d")
    init_archive_table()
    log("=== Début archivage v5 ===")
    archived_count = 0
    conn = sqlite3.connect(DB_PATH)
    try:
        c = conn.cursor()
        for file in sorted(TURF_DIR.glob("v5_*.json")):
            filename = file.name
            file_date_str = _parse_file_date(filename)
            # Fixed-width digit strings sort chronologically, so a plain
            # string comparison is enough once the stamp is validated.
            if file_date_str is None or file_date_str >= today_str:
                continue
            metadata = extract_metadata(file)
            file_size_kb = file.stat().st_size // 1024
            target_dir = ARCHIVE_DIR / file_date_str[:4] / file_date_str[4:6]
            target_path = target_dir / filename
            if not dry_run:
                target_dir.mkdir(parents=True, exist_ok=True)
                c.execute("""
                    INSERT OR IGNORE INTO scraping_archives
                    (filename, execution_date, runtime_sec, total_pages,
                     file_size_kb, archive_path, pages_success, pages_error)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    filename, file_date_str, metadata.get("runtime_sec"),
                    metadata.get("total_pages"), file_size_kb,
                    str(target_path), metadata.get("pages_success", 0),
                    metadata.get("pages_error", 0),
                ))
                shutil.move(str(file), str(target_path))
                # Commit per file so the table always matches what has
                # actually been moved. If the move raises, the pending row
                # is never committed and the exception propagates.
                conn.commit()
            log(f"Archivé: {filename} -> {target_path}")
            archived_count += 1
    finally:
        conn.close()
    log(f"=== Fin archivage: {archived_count} fichiers ===")
    return archived_count
if __name__ == "__main__":
    import argparse

    # Command-line entry point: --dry-run is the only supported option.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Simulation sans déplacement",
    )
    options = cli.parse_args()
    archive_files(dry_run=options.dry_run)