# turf_saas/llm_cache.py

#!/usr/bin/env python3
"""
Cache LLM - Turf Scraper
Réduction des appels API par mise en cache des réponses
"""
import json
import hashlib
import os
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Any


class LLMCache:
    """File-backed cache for LLM responses with time-based expiration.

    Each entry is stored as one JSON file inside ``cache_dir``; the file name
    is the SHA-256 hex digest of the entry key followed by ``.json``. Only
    files following that naming scheme are treated as cache entries, so other
    JSON files living in the same directory are never counted or deleted.
    """

    # Characters allowed in the stem of an entry file (a SHA-256 hex digest).
    _HEX_DIGITS = frozenset("0123456789abcdef")

    def __init__(self, cache_dir: Optional[str] = None, ttl_hours: int = 24):
        """Create the cache directory if needed and record the TTL.

        Args:
            cache_dir: Directory holding the cache files
                (default: ``~/.cache/turf_llm``).
            ttl_hours: Time-to-live of an entry, in hours (default: 24).
        """
        if cache_dir is None:
            cache_dir = os.path.expanduser("~/.cache/turf_llm")

        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.ttl = timedelta(hours=ttl_hours)

    def _hash_key(self, key: str) -> str:
        """Return the SHA-256 hex digest of ``key``."""
        return hashlib.sha256(key.encode()).hexdigest()

    def _get_cache_path(self, key: str) -> Path:
        """Return the path of the cache file that stores ``key``."""
        return self.cache_dir / f"{self._hash_key(key)}.json"

    def _entry_paths(self) -> list:
        """Return the paths of all entry files currently in the cache directory.

        The result is materialized so callers can delete files while looping.
        """
        return [
            path
            for path in self.cache_dir.glob("*.json")
            if len(path.stem) == 64 and self._HEX_DIGITS.issuperset(path.stem)
        ]

    @staticmethod
    def _read_entry(path: Path) -> dict:
        """Load one cache file and return its payload.

        Raises:
            OSError: The file is missing or cannot be read.
            ValueError: The content is not valid UTF-8/JSON, or is not a
                JSON object.
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, dict):
            raise ValueError("cache entry is not a JSON object")
        return data

    def _is_expired(self, data: dict) -> bool:
        """Tell whether the entry payload ``data`` is older than the TTL.

        Raises:
            ValueError: The payload has no parseable ISO-format timestamp.
        """
        timestamp = data.get('timestamp')
        if not isinstance(timestamp, str):
            raise ValueError("cache entry has no usable timestamp")
        cached_time = datetime.fromisoformat(timestamp)
        # Take "now" in the timestamp's own timezone (naive local time when it
        # has none) so that an aware timestamp cannot raise TypeError.
        return datetime.now(cached_time.tzinfo) - cached_time > self.ttl

    @staticmethod
    def _discard(path: Path) -> None:
        """Remove ``path`` if possible; failures are ignored (best effort)."""
        try:
            path.unlink()
        except OSError:
            pass

    def get(self, key: str) -> Optional[dict]:
        """Fetch an entry from the cache.

        An expired entry is deleted from disk as a side effect.

        Args:
            key: Lookup key.

        Returns:
            The stored dict (with at least 'response' and 'timestamp'), or
            None when the entry is absent, expired, unreadable or malformed.
        """
        cache_path = self._get_cache_path(key)

        try:
            data = self._read_entry(cache_path)
            if self._is_expired(data):
                cache_path.unlink()
                return None
        except (OSError, ValueError):
            # Missing file, I/O failure, invalid JSON or invalid timestamp.
            return None

        return data

    def set(self, key: str, response: Any, metadata: dict = None) -> bool:
        """Store a response in the cache.

        Args:
            key: Lookup key.
            response: Response to store; must be JSON-serializable.
            metadata: Optional additional metadata.

        Returns:
            True on success, False when the file could not be written.

        Raises:
            TypeError: ``response`` or ``metadata`` is not JSON-serializable.
                An already stored entry for ``key`` is left untouched.
        """
        cache_path = self._get_cache_path(key)

        data = {
            'key': key,
            'response': response,
            'timestamp': datetime.now().isoformat(),
            'metadata': metadata or {}
        }

        # Serialize and encode before touching the disk: a failure here must
        # not truncate or corrupt an entry that is already stored.
        payload = json.dumps(data, indent=2, ensure_ascii=False).encode('utf-8')

        # Write to a private temporary file, then rename it over the target so
        # readers never observe a half-written entry. The ".tmp" suffix keeps
        # it out of the "*.json" entry listing.
        tmp_path = cache_path.with_name(
            f"{cache_path.name}.{os.getpid()}.tmp"
        )
        try:
            with open(tmp_path, 'wb') as f:
                f.write(payload)
            os.replace(tmp_path, cache_path)
        except OSError:
            self._discard(tmp_path)
            return False
        return True

    def delete(self, key: str) -> bool:
        """Delete one entry.

        Returns:
            True if a file was removed, False if it was absent or could not
            be removed.
        """
        try:
            self._get_cache_path(key).unlink()
        except OSError:
            return False
        return True

    def clear(self) -> int:
        """Delete every cache entry and return how many files were removed."""
        count = 0
        for path in self._entry_paths():
            try:
                path.unlink()
                count += 1
            except OSError:
                pass
        return count

    def clear_expired(self) -> int:
        """Delete expired entries and return how many files were removed.

        Unreadable or malformed entry files are left in place.
        """
        count = 0
        for path in self._entry_paths():
            try:
                if self._is_expired(self._read_entry(path)):
                    path.unlink()
                    count += 1
            except (OSError, ValueError):
                continue
        return count

    def get_stats(self) -> dict:
        """Return statistics about the cache.

        Returns:
            dict with 'total_entries', 'total_size_bytes', 'expired_entries'
            and 'active_entries'. Entries that cannot be read or parsed are
            counted as active.
        """
        paths = self._entry_paths()
        total_size = 0
        expired = 0

        for path in paths:
            try:
                total_size += path.stat().st_size
                if self._is_expired(self._read_entry(path)):
                    expired += 1
            except (OSError, ValueError):
                # File vanished, is unreadable or malformed: not expired.
                continue

        return {
            'total_entries': len(paths),
            'total_size_bytes': total_size,
            'expired_entries': expired,
            'active_entries': len(paths) - expired
        }


class QuestionCache:
    """Persistent question -> SQL cache with approximate lookup.

    Entries are held in memory in ``self.cache``, a dict keyed by the
    normalized (lower-cased, stripped) question, and the whole dict is written
    back to a single JSON file on every ``set_sql`` call.
    """

    # A stored question matches approximately when its word-set Jaccard
    # similarity with the asked question is strictly above this value.
    SIMILARITY_THRESHOLD = 0.7

    def __init__(self, db_path: Optional[str] = None):
        """Load the cache file if it exists.

        Args:
            db_path: Path of the JSON cache file
                (default: ``~/.cache/turf_llm/sql_cache.json``).
        """
        if db_path is None:
            db_path = os.path.expanduser("~/.cache/turf_llm/sql_cache.json")

        self.cache_file = Path(db_path)
        self.cache = self._load()

    def _load(self) -> dict:
        """Read the cache file and return its entries.

        Returns:
            The stored mapping, or an empty dict when the file is missing,
            unreadable, not valid JSON, or not a JSON object. Entries whose
            value is not an object are dropped because the other methods
            could not use them.
        """
        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file or invalid JSON/UTF-8: start empty.
            return {}

        if not isinstance(data, dict):
            return {}
        return {
            question: entry
            for question, entry in data.items()
            if isinstance(entry, dict)
        }

    def _save(self):
        """Write the cache to disk (best effort: failures are ignored)."""
        tmp_path = self.cache_file.with_name(
            f"{self.cache_file.name}.{os.getpid()}.tmp"
        )
        try:
            # Serialize first so a failure cannot truncate the existing file.
            payload = json.dumps(self.cache, indent=2)
            self.cache_file.parent.mkdir(parents=True, exist_ok=True)
            with open(tmp_path, 'w', encoding='utf-8') as f:
                f.write(payload)
            # Rename over the target so readers never see a partial file.
            os.replace(tmp_path, self.cache_file)
        except (OSError, TypeError, ValueError):
            # Persistence is optional; the in-memory cache stays usable.
            try:
                tmp_path.unlink()
            except OSError:
                pass

    def get_sql(self, question: str) -> Optional[str]:
        """Return the SQL cached for ``question`` or for the closest question.

        An exact match on the normalized question wins. Otherwise the stored
        question with the highest similarity strictly above
        ``SIMILARITY_THRESHOLD`` is used; on a tie the earliest stored one.

        Returns:
            The cached SQL string, or None when nothing matches.
        """
        # NOTE(review): entries saved with success=False are returned too;
        # confirm that callers really want failed SQL served from the cache.
        normalized = question.lower().strip()

        exact = self.cache.get(normalized)
        if exact is not None:
            return exact.get('sql')

        best_sql = None
        best_score = self.SIMILARITY_THRESHOLD
        for key, value in self.cache.items():
            score = self._similarity(normalized, key)
            if score > best_score:
                best_score = score
                best_sql = value.get('sql')

        return best_sql

    def set_sql(self, question: str, sql: str, success: bool = True):
        """Store ``sql`` for ``question`` and persist the cache.

        The entry's 'count' is incremented each time the same normalized
        question is stored again.
        """
        normalized = question.lower().strip()
        previous_count = self.cache.get(normalized, {}).get('count', 0)

        self.cache[normalized] = {
            'sql': sql,
            'success': success,
            'timestamp': datetime.now().isoformat(),
            'count': previous_count + 1
        }
        self._save()

    def _similarity(self, s1: str, s2: str) -> float:
        """Return the Jaccard similarity of the whitespace-split word sets.

        Returns 0.0 when either string contains no word.
        """
        words1 = set(s1.split())
        words2 = set(s2.split())

        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    def get_frequent_questions(self, limit: int = 10) -> list:
        """Return up to ``limit`` stored questions, highest 'count' first."""
        ranked = sorted(
            self.cache.items(),
            key=lambda item: item[1].get('count', 0),
            reverse=True
        )
        return [question for question, _ in ranked[:limit]]


# Module-level singleton slots, created lazily on first use:
# _global_cache is filled by get_llm_cache(), _sql_cache by get_sql_cache().
_global_cache = None
_sql_cache = None


def get_llm_cache() -> LLMCache:
    """Return the shared LLMCache, creating it with defaults on first call."""
    global _global_cache
    if _global_cache is not None:
        return _global_cache
    _global_cache = LLMCache()
    return _global_cache


def get_sql_cache() -> QuestionCache:
    """Return the shared QuestionCache, creating it with defaults on first call."""
    global _sql_cache
    if _sql_cache is not None:
        return _sql_cache
    _sql_cache = QuestionCache()
    return _sql_cache