261 lines
7.7 KiB
Python
261 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Cache LLM - Turf Scraper
|
|
Réduction des appels API par mise en cache des réponses
|
|
"""
|
|
import json
|
|
import hashlib
|
|
import os
|
|
import time
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Optional, Any
|
|
|
|
|
|
class LLMCache:
    """File-backed cache for LLM responses with time-based expiry.

    Every entry lives in its own ``<sha256(key)>.json`` file inside
    ``cache_dir`` and stores the original key, the response, the creation
    timestamp (naive local time in ISO-8601 form) and optional metadata.
    """

    # Entry files are named after a SHA-256 hex digest. Any other ``*.json``
    # file found in the directory (for example a ``sql_cache.json`` stored
    # alongside) does not belong to this cache and is never counted or deleted.
    _DIGEST_CHARS = frozenset("0123456789abcdef")
    _DIGEST_LENGTH = 64

    def __init__(self, cache_dir: Optional[str] = None, ttl_hours: int = 24):
        """
        Args:
            cache_dir: Directory holding the cache files
                (default: ~/.cache/turf_llm/). Created if it does not exist.
            ttl_hours: Time-to-live of an entry in hours (default: 24).
        """
        if cache_dir is None:
            cache_dir = os.path.expanduser("~/.cache/turf_llm")

        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.ttl = timedelta(hours=ttl_hours)

    def _hash_key(self, key: str) -> str:
        """Return the SHA-256 hex digest of ``key``."""
        return hashlib.sha256(key.encode()).hexdigest()

    def _get_cache_path(self, key: str) -> Path:
        """Return the path of the file that stores the entry for ``key``."""
        return self.cache_dir / f"{self._hash_key(key)}.json"

    def _entry_files(self) -> list:
        """Return the entry files (digest-named ``*.json``) in ``cache_dir``."""
        return [
            path
            for path in self.cache_dir.glob("*.json")
            if len(path.stem) == self._DIGEST_LENGTH
            and self._DIGEST_CHARS.issuperset(path.stem)
        ]

    @staticmethod
    def _read_entry(path: Path) -> Optional[tuple]:
        """Load one entry file.

        Returns:
            ``(data, cached_time)`` for a well-formed entry, or None when the
            file is missing, unreadable, not valid JSON, not a JSON object, or
            lacks a parseable ISO-8601 ``timestamp`` string.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return None

        # Valid JSON is not necessarily a valid entry: check the shape
        # explicitly instead of letting AttributeError/TypeError escape.
        if not isinstance(data, dict):
            return None
        stamp = data.get('timestamp')
        if not isinstance(stamp, str):
            return None

        try:
            return data, datetime.fromisoformat(stamp)
        except ValueError:
            return None

    def get(self, key: str) -> Optional[dict]:
        """
        Fetch a value from the cache.

        Args:
            key: Lookup key

        Returns:
            The stored dict (with 'key', 'response', 'timestamp' and
            'metadata') or None if the entry is absent, malformed or expired.
            An expired entry is removed from disk.
        """
        cache_path = self._get_cache_path(key)
        entry = self._read_entry(cache_path)
        if entry is None:
            return None

        data, cached_time = entry
        if datetime.now() - cached_time > self.ttl:
            try:
                cache_path.unlink()
            except OSError:
                pass  # best effort: the entry is reported as a miss anyway
            return None

        return data

    def set(self, key: str, response: Any, metadata: dict = None) -> bool:
        """
        Store a response in the cache.

        Args:
            key: Lookup key
            response: Response to store (must be JSON-serialisable)
            metadata: Additional metadata

        Returns:
            True on success, False if the response cannot be serialised to
            JSON or the file cannot be written.
        """
        cache_path = self._get_cache_path(key)

        data = {
            'key': key,
            'response': response,
            'timestamp': datetime.now().isoformat(),
            'metadata': metadata or {}
        }

        # Serialise before touching the file: opening in 'w' mode truncates
        # it, so a serialisation failure must not destroy an existing entry.
        try:
            payload = json.dumps(data, indent=2, ensure_ascii=False)
        except (TypeError, ValueError):
            return False

        try:
            with open(cache_path, 'w', encoding='utf-8') as f:
                f.write(payload)
        except OSError:
            return False
        return True

    def delete(self, key: str) -> bool:
        """Remove one entry; return True only if a file was actually deleted."""
        try:
            self._get_cache_path(key).unlink()
        except OSError:
            # Includes FileNotFoundError: nothing was cached for this key.
            return False
        return True

    def clear(self) -> int:
        """Remove every cache entry and return how many were deleted."""
        count = 0
        for path in self._entry_files():
            try:
                path.unlink()
            except OSError:
                continue
            count += 1
        return count

    def clear_expired(self) -> int:
        """Remove expired entries and return how many were deleted.

        Unreadable or malformed entry files are left untouched.
        """
        now = datetime.now()
        count = 0

        for path in self._entry_files():
            entry = self._read_entry(path)
            if entry is None or now - entry[1] <= self.ttl:
                continue
            try:
                path.unlink()
            except OSError:
                continue
            count += 1

        return count

    def get_stats(self) -> dict:
        """Return statistics about the cache.

        Returns:
            dict with 'total_entries', 'total_size_bytes', 'expired_entries'
            and 'active_entries'. Entries whose content cannot be parsed are
            counted as active, since only a readable timestamp can mark an
            entry as expired.
        """
        now = datetime.now()
        entries = 0
        total_size = 0
        expired = 0

        for path in self._entry_files():
            try:
                size = path.stat().st_size
            except OSError:
                continue  # file disappeared since the directory was listed

            entries += 1
            total_size += size

            entry = self._read_entry(path)
            if entry is not None and now - entry[1] > self.ttl:
                expired += 1

        return {
            'total_entries': entries,
            'total_size_bytes': total_size,
            'expired_entries': expired,
            'active_entries': entries - expired
        }
|
|
|
|
|
|
class QuestionCache:
    """Persistent question -> SQL cache with fuzzy lookup.

    The whole cache is a single JSON object held in memory (``self.cache``)
    and rewritten to ``cache_file`` after every update. Keys are the
    lower-cased, stripped question text; each value records the SQL, a
    success flag, the last-update timestamp and a usage counter.
    """

    def __init__(self, db_path: Optional[str] = None,
                 similarity_threshold: float = 0.7):
        """
        Args:
            db_path: Path of the JSON cache file
                (default: ~/.cache/turf_llm/sql_cache.json).
            similarity_threshold: Word-overlap score a stored question must
                exceed to be reused for a non-identical question
                (default: 0.7).
        """
        if db_path is None:
            db_path = os.path.expanduser("~/.cache/turf_llm/sql_cache.json")

        self.cache_file = Path(db_path)
        self.similarity_threshold = similarity_threshold
        self.cache = self._load()

    def _load(self) -> dict:
        """Load the cache from disk.

        Returns an empty dict when the file is missing, unreadable or does
        not contain a JSON object. Values that are not JSON objects are
        dropped so that later lookups can rely on the entry shape.
        """
        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return {}

        if not isinstance(loaded, dict):
            return {}
        return {
            question: entry
            for question, entry in loaded.items()
            if isinstance(entry, dict)
        }

    def _save(self) -> None:
        """Write the cache to disk (best effort: failures are ignored)."""
        # Serialise first: opening the file in 'w' mode truncates it, so a
        # serialisation failure must not wipe the previously saved cache.
        try:
            payload = json.dumps(self.cache, indent=2)
        except (TypeError, ValueError):
            return

        try:
            self.cache_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.cache_file, 'w', encoding='utf-8') as f:
                f.write(payload)
        except OSError:
            pass

    def get_sql(self, question: str) -> Optional[str]:
        """Return the cached SQL for the same or the most similar question.

        An exact match on the normalised question wins. Otherwise the stored
        question with the highest similarity score above
        ``similarity_threshold`` is used (ties go to the oldest entry).
        Returns None when nothing is similar enough.
        """
        # NOTE(review): entries stored with success=False are returned as
        # well — confirm this is what callers expect.
        normalized = question.lower().strip()

        if normalized in self.cache:
            return self.cache[normalized].get('sql')

        best_sql = None
        best_score = self.similarity_threshold
        for key, value in self.cache.items():
            score = self._similarity(normalized, key)
            if score > best_score:
                best_score = score
                best_sql = value.get('sql')

        return best_sql

    def set_sql(self, question: str, sql: str, success: bool = True):
        """Store the SQL for a question and persist the cache.

        Args:
            question: Natural-language question (normalised before storage).
            sql: SQL associated with the question.
            success: Flag stored alongside the SQL.
        """
        normalized = question.lower().strip()
        previous_count = self.cache.get(normalized, {}).get('count', 0)

        self.cache[normalized] = {
            'sql': sql,
            'success': success,
            'timestamp': datetime.now().isoformat(),
            'count': previous_count + 1
        }
        self._save()

    def _similarity(self, s1: str, s2: str) -> float:
        """Return the ratio of shared words to distinct words of two strings.

        Words are whitespace-separated tokens; the result is 0.0 when either
        string contains no word.
        """
        words1 = set(s1.split())
        words2 = set(s2.split())

        if not words1 or not words2:
            return 0.0

        # The union cannot be empty here because both sets are non-empty.
        return len(words1 & words2) / len(words1 | words2)

    def get_frequent_questions(self, limit: int = 10) -> list:
        """Return up to ``limit`` questions, most frequently stored first."""
        ranked = sorted(
            self.cache,
            key=lambda question: self.cache[question].get('count', 0),
            reverse=True
        )
        return ranked[:limit]
|
|
|
|
|
|
# Module-level singleton instances. Both start as None and are filled in on
# first use by get_llm_cache() and get_sql_cache() respectively.
_global_cache: Optional["LLMCache"] = None
_sql_cache: Optional["QuestionCache"] = None
|
|
|
|
|
|
def get_llm_cache() -> LLMCache:
    """Return the shared LLMCache instance, creating it on the first call."""
    global _global_cache
    if _global_cache is not None:
        return _global_cache
    _global_cache = LLMCache()
    return _global_cache
|
|
|
|
|
|
def get_sql_cache() -> QuestionCache:
    """Return the shared QuestionCache instance, creating it on the first call."""
    global _sql_cache
    if _sql_cache is not None:
        return _sql_cache
    _sql_cache = QuestionCache()
    return _sql_cache
|