Create app.py
app.py
ADDED
@@ -0,0 +1,591 @@
# app.py
from __future__ import annotations

import json
import os
import threading
import time
from dataclasses import dataclass, asdict
from typing import Any, Dict, Generator, List, Optional

from flask import Flask, Response, jsonify, request, stream_with_context
from flask_cors import CORS
from dotenv import load_dotenv

load_dotenv()  # load .env if present


# -------------------------------
# External dependencies
# -------------------------------
try:
    from llama_cpp import Llama
except Exception as e:  # pragma: no cover
    raise RuntimeError(
        "llama-cpp-python is required. Install with `pip install --upgrade llama-cpp-python`."
    ) from e

try:
    from huggingface_hub import hf_hub_download
    HAS_HF = True
except Exception:
    HAS_HF = False


# -------------------------------
# ENV helpers
# -------------------------------
def getenv_str(name: str, default: Optional[str] = None) -> Optional[str]:
    v = os.getenv(name, default)
    return v if (v is None or v.strip() != "") else default


def getenv_int(name: str, default: int) -> int:
    try:
        return int(os.getenv(name, default))
    except Exception:
        return default


def getenv_bool(name: str, default: bool = False) -> bool:
    v = os.getenv(name)
    if v is None:
        return default
    return v.strip().lower() in ("1", "true", "yes", "y", "on")


# -------------------------------
# Model & configuration
# -------------------------------
@dataclass
class ModelConfig:
    # Local path to a GGUF file (under MODELS_DIR)
    model_path: Optional[str] = getenv_str("MODEL_PATH")

    # Inference parameters
    n_ctx: int = getenv_int("N_CTX", 4096)
    n_threads: int = getenv_int("N_THREADS", os.cpu_count() or 4)
    n_batch: int = getenv_int("N_BATCH", 256)
    n_gpu_layers: int = getenv_int("N_GPU_LAYERS", 0)  # CPU by default
    verbose: bool = os.getenv("LLM_VERBOSE", "0") == "1"

    def to_display(self) -> Dict[str, Any]:
        d = asdict(self)
        if d.get("model_path"):
            d["model_path"] = os.path.basename(str(d["model_path"]))
        return d


class ModelManager:
    """Thread-safe model management (lazy load + reload)."""

    def __init__(self, config: Optional[ModelConfig] = None):
        self._lock = threading.RLock()
        self._llm: Optional[Llama] = None
        self._config = config or ModelConfig()

    @property
    def config(self) -> ModelConfig:
        return self._config

    def is_loaded(self) -> bool:
        with self._lock:
            return self._llm is not None

    def set_config(self, new_cfg: ModelConfig) -> None:
        """Update the config without loading anything into memory."""
        with self._lock:
            self._config = new_cfg
            # leave _llm untouched

    def get_llm(self) -> Llama:
        with self._lock:
            if self._llm is not None:
                return self._llm
            self._llm = self._load_model(self._config)
            return self._llm

    def _load_model(self, cfg: ModelConfig) -> Llama:
        if not cfg.model_path or not os.path.exists(cfg.model_path):
            raise RuntimeError(
                f"MODEL_PATH does not exist: {cfg.model_path!r}. Use /load to fetch a model or set MODEL_PATH."
            )
        t0 = time.time()
        llm = Llama(
            model_path=cfg.model_path,
            n_ctx=cfg.n_ctx,
            n_threads=cfg.n_threads,
            n_batch=cfg.n_batch,
            n_gpu_layers=cfg.n_gpu_layers,
            verbose=cfg.verbose,
        )
        dt = time.time() - t0
        print(f"[LLM] Loaded model in {dt:.2f}s | cfg={cfg.to_display()}")
        return llm

    def reload(self, new_cfg: Optional[ModelConfig] = None) -> Dict[str, Any]:
        with self._lock:
            self._llm = None
            if new_cfg is not None:
                self._config = new_cfg
            _ = self.get_llm()  # force a load to validate the config
            return {"status": "reloaded", "config": self._config.to_display()}


# -------------------------------
# Flask app
# -------------------------------
app = Flask(__name__)
CORS(app)

MODELS_DIR = getenv_str("MODELS_DIR", "/models") or "/models"
os.makedirs(MODELS_DIR, exist_ok=True)

# Auto-load options applied at startup
AUTO_REPO_ID = getenv_str("AUTO_REPO_ID")
AUTO_FILENAME = getenv_str("AUTO_FILENAME")  # e.g. "gemma-3-270m-it-Q4_K_M.gguf"
AUTO_IF_MISSING = getenv_bool("AUTO_IF_MISSING", True)  # only download when the file is missing
AUTO_PRELOAD = getenv_bool("AUTO_PRELOAD", False)  # load into memory at boot

manager = ModelManager()


def _bootstrap_autoload() -> Dict[str, Any]:
    """
    If MODEL_PATH does not exist and AUTO_REPO_ID/AUTO_FILENAME are provided,
    try to download the file into MODELS_DIR and point MODEL_PATH at it.
    If AUTO_PRELOAD=True, also load the model into memory (via reload()).
    """
    info: Dict[str, Any] = {
        "models_dir": MODELS_DIR,
        "auto_repo_id": AUTO_REPO_ID,
        "auto_filename": AUTO_FILENAME,
        "auto_if_missing": AUTO_IF_MISSING,
        "auto_preload": AUTO_PRELOAD,
        "performed": False,
        "path": None,
        "error": None,
    }

    cfg = manager.config
    target_path = cfg.model_path

    # If MODEL_PATH is not set but AUTO_FILENAME is, derive a target path.
    if (not target_path) and AUTO_FILENAME:
        target_path = os.path.join(MODELS_DIR, os.path.basename(AUTO_FILENAME))

    # If the file is already present, there is nothing to download.
    if target_path and os.path.exists(target_path):
        # Update the config (without preloading)
        manager.set_config(
            ModelConfig(
                model_path=target_path,
                n_ctx=cfg.n_ctx,
                n_threads=cfg.n_threads,
                n_batch=cfg.n_batch,
                n_gpu_layers=cfg.n_gpu_layers,
                verbose=cfg.verbose,
            )
        )
        if AUTO_PRELOAD:
            try:
                manager.reload()  # load into memory
            except Exception as e:
                info["error"] = f"preload_failed: {e}"
        info["performed"] = False
        info["path"] = target_path
        print(f"[BOOT] Model file already present at {target_path}")
        return info

    # At this point the file does not exist: attempt an auto-load if possible.
    if AUTO_IF_MISSING and AUTO_REPO_ID and AUTO_FILENAME and HAS_HF:
        try:
            print(f"[BOOT] Auto-load from HF: {AUTO_REPO_ID}/{AUTO_FILENAME}")
            local_cached = hf_hub_download(repo_id=AUTO_REPO_ID, filename=AUTO_FILENAME)
            target_path = os.path.join(MODELS_DIR, os.path.basename(AUTO_FILENAME))
            if os.path.abspath(local_cached) != os.path.abspath(target_path):
                # Chunked copy from the HF cache into MODELS_DIR
                with open(local_cached, "rb") as src, open(target_path, "wb") as dst:
                    while True:
                        chunk = src.read(1024 * 1024)
                        if not chunk:
                            break
                        dst.write(chunk)

            new_cfg = ModelConfig(
                model_path=target_path,
                n_ctx=cfg.n_ctx,
                n_threads=cfg.n_threads,
                n_batch=cfg.n_batch,
                n_gpu_layers=cfg.n_gpu_layers,
                verbose=cfg.verbose,
            )

            if AUTO_PRELOAD:
                manager.reload(new_cfg)  # load into memory right away
            else:
                manager.set_config(new_cfg)  # set the path; the model loads on the first request

            info["performed"] = True
            info["path"] = target_path
            print(f"[BOOT] Auto-loaded model to {target_path} (preload={AUTO_PRELOAD})")
            return info
        except Exception as e:
            info["error"] = str(e)
            print(f"[BOOT][ERROR] Auto-load failed: {e}")
            return info

    # No auto-load possible / enabled
    return info


# Perform the auto-load at startup (HTTP availability is not blocked by a failure here)
BOOT_INFO = _bootstrap_autoload()


# -------------------------------
# Info endpoints
# -------------------------------
@app.get("/health")
def health() -> Any:
    """
    Soft health check:
    - If MODEL_PATH is unset or the file is missing -> status="unloaded" (HTTP 200, no 500).
    - If the file is present but the model is not loaded yet -> status="available".
    - If the model is in memory -> status="ready".
    """
    try:
        cfg = manager.config
        path = cfg.model_path
        if not path or not os.path.exists(path):
            return jsonify(
                {
                    "status": "unloaded",
                    "reason": "MODEL_PATH is not set or file not found",
                    "model": cfg.to_display(),
                    "exists": bool(path and os.path.exists(path)),
                    "boot_info": BOOT_INFO,
                    "hint": 'Use /load then /reload with {"model_path": "/models/your.gguf"} '
                    'or set AUTO_REPO_ID/AUTO_FILENAME in .env for auto-load.',
                }
            )
        if not manager.is_loaded():
            # File is present, model not loaded yet
            return jsonify(
                {
                    "status": "available",
                    "model": cfg.to_display(),
                    "path": path,
                    "boot_info": BOOT_INFO,
                    "hint": 'Call /reload with {"model_path": "%s"}' % path,
                }
            )
        # Model already in memory
        return jsonify({"status": "ready", "model": cfg.to_display(), "path": path})
    except Exception as e:
        # Genuinely unexpected case
        return jsonify({"status": "error", "error": str(e)}), 500

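# Illustrative client check (not part of the original file): polling /health above
# and /models just below from a separate process. Assumes the server listens on the
# default HOST/PORT configured at the bottom of this file and that the `requests`
# package is installed.
#
#   import requests
#   base = "http://localhost:8000"
#   print(requests.get(f"{base}/health", timeout=10).json()["status"])
#   # -> "unloaded", "available" or "ready", matching the docstring above
#   print([m["name"] for m in requests.get(f"{base}/models", timeout=10).json()["available"]])
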
@app.get("/models")
def models() -> Any:
    """
    List the available GGUF files and the state of the active model.
    """
    cfg = manager.config

    available = []
    for f in os.listdir(MODELS_DIR):
        if f.lower().endswith(".gguf"):
            full = os.path.join(MODELS_DIR, f)
            try:
                size = os.path.getsize(full)
            except Exception:
                size = None
            available.append({"name": f, "path": full, "size_bytes": size})

    active_path = cfg.model_path
    active_exists = bool(active_path and os.path.exists(active_path))
    state = (
        "ready"
        if manager.is_loaded()
        else ("available" if active_exists else "unloaded")
    )

    return jsonify(
        {
            "object": "list",
            "active": {
                "path": active_path,
                "basename": os.path.basename(active_path) if active_path else None,
                "exists": active_exists,
                "loaded": manager.is_loaded(),
                "state": state,
                "config": cfg.to_display(),
            },
            "available": available,
            "hint": 'Switch with POST /reload {"model_path": "/models/<file>.gguf"}',
            "boot_info": BOOT_INFO,
        }
    )


# -------------------------------
# Model management (load/reload)
# -------------------------------
@app.post("/load")
def load_model_file() -> Any:
    """
    Download a GGUF file from the Hugging Face Hub into MODELS_DIR.
    Body: {"repo_id": str, "filename": str}
    Does not reload the model automatically; call /reload afterwards.
    """
    if not HAS_HF:
        return (
            jsonify(
                {"error": {"message": "huggingface_hub is not installed on server."}}
            ),
            500,
        )

    body = request.get_json(force=True)
    repo_id = (body.get("repo_id") or "").strip()
    filename = (body.get("filename") or "").strip()
    if not repo_id or not filename:
        return jsonify({"error": {"message": "repo_id and filename are required"}}), 400

    try:
        # Honors HF_TOKEN if it is set in the environment.
        local_cached = hf_hub_download(repo_id=repo_id, filename=filename)
        target_path = os.path.join(MODELS_DIR, os.path.basename(filename))
        if os.path.abspath(local_cached) != os.path.abspath(target_path):
            # Chunked copy from the HF cache into MODELS_DIR
            with open(local_cached, "rb") as src, open(target_path, "wb") as dst:
                while True:
                    chunk = src.read(1024 * 1024)
                    if not chunk:
                        break
                    dst.write(chunk)
        return jsonify(
            {
                "status": "downloaded",
                "repo_id": repo_id,
                "filename": filename,
                "path": target_path,
                "hint": 'POST /reload {"model_path": "%s"}' % target_path,
            }
        )
    except Exception as e:
        return jsonify({"error": {"message": str(e)}}), 500

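# Illustrative client call (not part of the original file): fetching a GGUF through
# /load above. The repo id is a placeholder; assumes `requests` is installed and the
# server runs on the default HOST/PORT.
#
#   import requests
#   r = requests.post(
#       "http://localhost:8000/load",
#       json={"repo_id": "<hf-user>/<gguf-repo>",
#             "filename": "gemma-3-270m-it-Q4_K_M.gguf"},
#       timeout=600,
#   )
#   print(r.json()["path"])   # e.g. /models/gemma-3-270m-it-Q4_K_M.gguf
#   print(r.json()["hint"])   # suggests the matching /reload call
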
@app.post("/reload")
def reload_model() -> Any:
    """
    Reload the model with a new MODEL_PATH and/or new hyperparameters.
    Body: {"model_path": str, "n_ctx": int, "n_threads": int, "n_batch": int, "n_gpu_layers": int, "verbose": bool}
    """
    payload = request.get_json(silent=True) or {}
    model_path = payload.get("model_path", manager.config.model_path)

    cfg = ModelConfig(
        model_path=model_path,
        n_ctx=int(payload.get("n_ctx", manager.config.n_ctx)),
        n_threads=int(payload.get("n_threads", manager.config.n_threads)),
        n_batch=int(payload.get("n_batch", manager.config.n_batch)),
        n_gpu_layers=int(payload.get("n_gpu_layers", manager.config.n_gpu_layers)),
        verbose=bool(payload.get("verbose", manager.config.verbose)),
    )
    try:
        info = manager.reload(cfg)
        return jsonify(info)
    except Exception as e:
        return jsonify({"status": "error", "error": str(e)}), 400

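# Illustrative client call (not part of the original file): switching the active
# GGUF and overriding one hyperparameter via /reload above; the path is a
# placeholder and `requests` is assumed.
#
#   import requests
#   r = requests.post(
#       "http://localhost:8000/reload",
#       json={"model_path": "/models/your.gguf", "n_ctx": 2048},
#       timeout=600,
#   )
#   print(r.json())   # {"status": "reloaded", "config": {...}} on success
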
# -------------------------------
# Chat Completions (OpenAI-like) + SSE
# -------------------------------
def sse_format(data: Dict[str, Any]) -> str:
    return "data: " + json.dumps(data, ensure_ascii=False) + "\n\n"


@app.post("/v1/chat/completions")
def chat_completions() -> Any:
    """
    OpenAI-like endpoint (non-streaming, or SSE when {"stream": true}).
    Minimal body:
    {
      "messages": [{"role":"system","content":"..."}, {"role":"user","content":"..."}],
      "stream": true|false,
      "temperature": 0.7,
      "top_p": 0.95,
      "top_k": 64,
      "min_p": 0.001,
      "max_tokens": 256,
      "stop": ["..."],
      "response_format": {...}
    }
    """
    llm = manager.get_llm()
    body = request.get_json(force=True)

    messages: List[Dict[str, str]] = body.get("messages", [])
    if not messages:
        return jsonify({"error": {"message": "messages[] is required"}}), 400

    stream = bool(body.get("stream", False))

    params: Dict[str, Any] = {}
    for key in ("temperature", "top_p", "top_k", "min_p", "max_tokens", "stop"):
        if key in body:
            params[key] = body[key]

    if "response_format" in body:
        params["response_format"] = body["response_format"]

    if not stream:
        t0 = time.time()
        resp = llm.create_chat_completion(messages=messages, **params)
        dt = time.time() - t0
        out = {
            "id": resp.get("id", f"chatcmpl-{int(time.time()*1000)}"),
            "object": "chat.completion",
            "created": int(time.time()),
            "model": os.path.basename(manager.config.model_path or "local.gguf"),
            "choices": resp.get("choices", []),
            "usage": resp.get("usage", {}),
            "latency_ms": int(dt * 1000),
        }
        return jsonify(out)

    # SSE streaming
    def generate() -> Generator[str, None, None]:
        t0 = time.time()
        try:
            for chunk in llm.create_chat_completion(
                messages=messages, stream=True, **params
            ):
                # Chunks mirror the OpenAI format
                out = {
                    "id": chunk.get("id", f"chatcmpl-{int(time.time()*1000)}"),
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": os.path.basename(manager.config.model_path or "local.gguf"),
                    "choices": chunk.get("choices", []),
                }
                yield sse_format(out)
        except Exception as e:
            yield sse_format({"error": {"message": str(e)}})
        finally:
            yield sse_format({"done": True})
            yield "data: [DONE]\n\n"

    return Response(stream_with_context(generate()), mimetype="text/event-stream")

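# Illustrative client (not part of the original file): one non-streaming call and
# one SSE call against /v1/chat/completions above, assuming `requests` is installed
# and the server runs on the default HOST/PORT.
#
#   import json, requests
#   base = "http://localhost:8000"
#   payload = {"messages": [{"role": "user", "content": "Say hi"}], "max_tokens": 64}
#
#   # Non-streaming: a single chat.completion object.
#   resp = requests.post(f"{base}/v1/chat/completions", json=payload, timeout=300)
#   print(resp.json()["choices"][0]["message"]["content"])
#
#   # Streaming: read "data: ..." lines until the "data: [DONE]" sentinel.
#   with requests.post(f"{base}/v1/chat/completions",
#                      json={**payload, "stream": True}, stream=True, timeout=300) as r:
#       for line in r.iter_lines(decode_unicode=True):
#           if not line or not line.startswith("data: "):
#               continue
#           data = line[len("data: "):]
#           if data == "[DONE]":
#               break
#           chunk = json.loads(data)
#           delta = (chunk.get("choices") or [{}])[0].get("delta", {})
#           print(delta.get("content", ""), end="", flush=True)
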
# -------------------------------
# JSON extraction (non-streaming)
# -------------------------------
@app.post("/v1/extract")
def extract() -> Any:
    """
    JSON extraction with an optional schema.
    Body:
    - text: str (required)
    - schema: dict (optional) -> passed through response_format when supported
    - system_prompt: str (optional)
    - temperature/top_p/top_k/min_p/max_tokens/stop: optional
    """
    llm = manager.get_llm()
    body = request.get_json(force=True)

    text: str = body.get("text", "").strip()
    if not text:
        return jsonify({"error": {"message": "'text' is required"}}), 400

    schema = body.get("schema")
    system_prompt = body.get(
        "system_prompt",
        "You are a helpful assistant that outputs ONLY valid minified JSON. Do not include prose.",
    )

    params: Dict[str, Any] = {}
    for key in ("temperature", "top_p", "top_k", "min_p", "max_tokens", "stop"):
        if key in body:
            params[key] = body[key]

    if isinstance(schema, dict):
        params["response_format"] = {"type": "json_object", "schema": schema}

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    try:
        t0 = time.time()
        resp = llm.create_chat_completion(messages=messages, **params)
        dt = time.time() - t0
        content = ((resp.get("choices") or [{}])[0].get("message") or {}).get(
            "content", ""
        )
        try:
            parsed = json.loads(content)
        except Exception as pe:
            return (
                jsonify(
                    {
                        "error": {
                            "message": "Model did not produce valid JSON.",
                            "model_output": content,
                            "parse_error": str(pe),
                        }
                    }
                ),
                422,
            )

        # Lightweight check of the expected keys
        if isinstance(schema, dict) and "properties" in schema:
            missing = [
                k
                for k in schema["properties"].keys()
                if not (isinstance(parsed, dict) and k in parsed)
            ]
            if missing:
                return (
                    jsonify(
                        {
                            "error": {
                                "message": "JSON is valid but missing required keys (best-effort check).",
                                "missing": missing,
                                "model_output": parsed,
                            }
                        }
                    ),
                    206,
                )

        return jsonify(
            {
                "object": "extract.result",
                "created": int(time.time()),
                "model": os.path.basename(manager.config.model_path or "local.gguf"),
                "latency_ms": int(dt * 1000),
                "data": parsed,
            }
        )
    except Exception as e:
        return jsonify({"error": {"message": str(e)}}), 500

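# Illustrative client call (not part of the original file): /v1/extract above with a
# small JSON Schema. The schema fields are made up for the example; `requests` is
# assumed, and status 206/422 signal partial or invalid JSON as handled above.
#
#   import requests
#   schema = {
#       "type": "object",
#       "properties": {
#           "name": {"type": "string"},
#           "city": {"type": "string"},
#       },
#   }
#   r = requests.post(
#       "http://localhost:8000/v1/extract",
#       json={"text": "Alice lives in Paris.", "schema": schema, "max_tokens": 128},
#       timeout=300,
#   )
#   print(r.status_code, r.json())
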
# -------------------------------
# Entrypoint
# -------------------------------
if __name__ == "__main__":
    host = getenv_str("HOST", "0.0.0.0") or "0.0.0.0"
    port = getenv_int("PORT", 8000)
    debug = os.getenv("FLASK_DEBUG", "0") == "1"
    print("Starting Flask on", host, port, "| MODELS_DIR=", MODELS_DIR)
    app.run(host=host, port=port, debug=debug)
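# Illustrative .env (not part of the original file): every key below is read by the
# code above; the values are placeholders to adapt to your deployment.
#
#   MODELS_DIR=/models
#   MODEL_PATH=/models/gemma-3-270m-it-Q4_K_M.gguf
#   N_CTX=4096
#   N_THREADS=4
#   N_BATCH=256
#   N_GPU_LAYERS=0
#   LLM_VERBOSE=0
#   AUTO_REPO_ID=<hf-user>/<gguf-repo>
#   AUTO_FILENAME=gemma-3-270m-it-Q4_K_M.gguf
#   AUTO_IF_MISSING=1
#   AUTO_PRELOAD=0
#   HOST=0.0.0.0
#   PORT=8000
#   FLASK_DEBUG=0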