#!/usr/bin/env python3
"""
Archive the v5_*.json files dated before today.

Usage: python3 archive_v5_files.py [--dry-run]
"""

import json
import os
import shutil
import sqlite3
from contextlib import closing
from datetime import date, datetime
from pathlib import Path

# Directory scanned for v5_*.json files.
TURF_DIR = Path("/home/h3r7/turf_scraper")
# Root of the archive tree; files are filed under <year>/<month> below it.
ARCHIVE_DIR = TURF_DIR / "archive"
# SQLite database that holds the scraping_archives index table.
DB_PATH = TURF_DIR / "turf.db"
# Log file that log() appends to.
LOG_FILE = Path("/home/h3r7/logs/archive_v5.log")

def log(message: str) -> None:
    """Print a timestamped message and append it to LOG_FILE.

    The parent directory of LOG_FILE is created if it is missing. The file is
    opened as UTF-8 explicitly: the messages logged by this script contain
    non-ASCII characters, which would raise UnicodeEncodeError if the locale's
    default encoding were not UTF-8.

    Args:
        message: Text to record. It is prefixed with "[YYYY-MM-DD HH:MM:SS] ".
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}"
    print(log_entry)
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(log_entry + "\n")

def init_archive_table() -> None:
    """Create the scraping_archives table in DB_PATH if it does not exist.

    Calling it again is harmless because the statement uses
    CREATE TABLE IF NOT EXISTS. The connection is closed even when the
    statement or the commit fails.
    """
    # closing() guarantees conn.close(); sqlite3's own context manager only
    # commits or rolls back the transaction and leaves the connection open.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS scraping_archives (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT UNIQUE NOT NULL,
                execution_date TEXT NOT NULL,
                runtime_sec REAL,
                total_pages INTEGER,
                file_size_kb INTEGER,
                archive_path TEXT,
                pages_success INTEGER DEFAULT 0,
                pages_error INTEGER DEFAULT 0,
                archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                archived_by TEXT DEFAULT 'cron_6h'
            )
        """)
        conn.commit()

def extract_metadata(filepath: Path) -> dict:
    """Read a JSON result file and summarise its run statistics.

    Args:
        filepath: Path to a JSON file whose top-level value is an object.
            The keys read are "runtime_sec", "total_pages" and "pages";
            each entry of "pages" is expected to be an object with a
            "status" key.

    Returns:
        A dict with the keys "runtime_sec", "total_pages", "pages_success"
        and "pages_error". If the file cannot be read or parsed, or does not
        have the expected shape, the error is logged and an empty dict is
        returned.
    """
    try:
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        # A missing or null "pages" entry simply means nothing to count;
        # it must not discard runtime_sec / total_pages.
        pages = data.get("pages") or []
        pages_success = 0
        pages_error = 0
        for page in pages:
            status = page.get("status")
            if status == "success":
                pages_success += 1
            elif status == "error":
                pages_error += 1

        return {
            "runtime_sec": data.get("runtime_sec"),
            "total_pages": data.get("total_pages"),
            "pages_success": pages_success,
            "pages_error": pages_error,
        }
    except Exception as e:
        # Deliberately best-effort: one unreadable or malformed file must not
        # abort the whole run, so the failure is logged and the caller gets
        # an empty dict.
        log(f"Erreur lecture {filepath}: {e}")
        return {}

def _leading_date(token: str):
    """Return the leading YYYYMMDD of *token*, or None if it is not a real date."""
    day = token[:8]
    if len(day) != 8 or not all(ch in "0123456789" for ch in day):
        return None
    try:
        # strptime rejects impossible dates such as month 13 or day 32.
        datetime.strptime(day, "%Y%m%d")
    except ValueError:
        return None
    return day


def archive_files(dry_run: bool = False) -> int:
    """Move v5_*.json files dated before today into ARCHIVE_DIR and index them.

    The date is taken from the second "_"-separated part of the file name,
    which must start with YYYYMMDD. Files dated today or later are left in
    place. Files whose name carries no valid date are logged and skipped.
    Each archived file is moved to ARCHIVE_DIR/<year>/<month>/ and recorded in
    the scraping_archives table.

    Args:
        dry_run: When True, nothing is moved and no row is inserted; the
            files that would be archived are only logged and counted.

    Returns:
        The number of files archived, or that would be archived in a dry run.
    """
    today_str = date.today().strftime("%Y%m%d")

    init_archive_table()
    log("=== Début archivage v5 ===")

    # A dry run never moves anything, so its log lines must say so.
    prefix = "[DRY-RUN] " if dry_run else ""
    archived_count = 0

    # closing() releases the connection even if a move or an insert raises.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        for file in sorted(TURF_DIR.glob("v5_*.json")):
            filename = file.name
            # The glob pattern guarantees the "v5_" prefix, so index 1 exists.
            file_date_str = filename.split("_")[1]

            day = _leading_date(file_date_str)
            if day is None:
                # Without a real date the year/month target cannot be built.
                log(f"Ignoré (date invalide): {filename}")
                continue
            if day >= today_str:
                continue

            metadata = extract_metadata(file)
            file_size_kb = file.stat().st_size // 1024

            target_dir = ARCHIVE_DIR / day[:4] / day[4:6]
            target_path = target_dir / filename

            if not dry_run:
                target_dir.mkdir(parents=True, exist_ok=True)

                conn.execute("""
                    INSERT OR IGNORE INTO scraping_archives
                    (filename, execution_date, runtime_sec, total_pages,
                     file_size_kb, archive_path, pages_success, pages_error)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    filename, file_date_str, metadata.get("runtime_sec"),
                    metadata.get("total_pages"), file_size_kb,
                    str(target_path), metadata.get("pages_success", 0),
                    metadata.get("pages_error", 0),
                ))

                shutil.move(str(file), str(target_path))
                # Commit per file: if a later file fails, the rows for the
                # files already moved are kept. If the move itself fails, the
                # uncommitted row is dropped when the connection closes.
                conn.commit()

            log(f"{prefix}Archivé: {filename} → {target_path}")
            archived_count += 1

    log(f"=== Fin archivage: {archived_count} fichiers ===")
    return archived_count

if __name__ == "__main__":
    import argparse

    # Command-line entry point: the only option is --dry-run, which is
    # forwarded to archive_files().
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true", help="Simulation sans déplacement")
    args = parser.parse_args()

    archive_files(dry_run=args.dry_run)