Geraldine committed
Commit 228ceb2 · verified · 1 Parent(s): 7182373

Create app.py

Files changed (1)
  1. app.py +591 -0
app.py ADDED
@@ -0,0 +1,591 @@
# app.py
from __future__ import annotations

import json
import os
import threading
import time
from dataclasses import dataclass, asdict
from typing import Any, Dict, Generator, List, Optional

from flask import Flask, Response, jsonify, request, stream_with_context
from flask_cors import CORS
from dotenv import load_dotenv

load_dotenv()  # load .env if present


# -------------------------------
# External dependencies
# -------------------------------
try:
    from llama_cpp import Llama
except Exception as e:  # pragma: no cover
    raise RuntimeError(
        "llama-cpp-python is required. Install with `pip install --upgrade llama-cpp-python`."
    ) from e

try:
    from huggingface_hub import hf_hub_download
    HAS_HF = True
except Exception:
    HAS_HF = False


# -------------------------------
# ENV helpers
# -------------------------------
def getenv_str(name: str, default: Optional[str] = None) -> Optional[str]:
    v = os.getenv(name, default)
    return v if (v is None or v.strip() != "") else default


def getenv_int(name: str, default: int) -> int:
    try:
        return int(os.getenv(name, default))
    except Exception:
        return default


def getenv_bool(name: str, default: bool = False) -> bool:
    v = os.getenv(name)
    if v is None:
        return default
    return v.strip().lower() in ("1", "true", "yes", "y", "on")


# -------------------------------
# Model & configuration
# -------------------------------
@dataclass
class ModelConfig:
    # Local path to a GGUF file (under MODELS_DIR)
    model_path: Optional[str] = getenv_str("MODEL_PATH")

    # Inference parameters
    n_ctx: int = getenv_int("N_CTX", 4096)
    n_threads: int = getenv_int("N_THREADS", os.cpu_count() or 4)
    n_batch: int = getenv_int("N_BATCH", 256)
    n_gpu_layers: int = getenv_int("N_GPU_LAYERS", 0)  # CPU by default
    verbose: bool = os.getenv("LLM_VERBOSE", "0") == "1"

    def to_display(self) -> Dict[str, Any]:
        d = asdict(self)
        if d.get("model_path"):
            d["model_path"] = os.path.basename(str(d["model_path"]))
        return d


class ModelManager:
    """Thread-safe model management (lazy load + reload)."""

    def __init__(self, config: Optional[ModelConfig] = None):
        self._lock = threading.RLock()
        self._llm: Optional[Llama] = None
        self._config = config or ModelConfig()

    @property
    def config(self) -> ModelConfig:
        return self._config

    def is_loaded(self) -> bool:
        with self._lock:
            return self._llm is not None

    def set_config(self, new_cfg: ModelConfig) -> None:
        """Update the config without loading anything into memory."""
        with self._lock:
            self._config = new_cfg
            # leave _llm untouched

    def get_llm(self) -> Llama:
        with self._lock:
            if self._llm is not None:
                return self._llm
            self._llm = self._load_model(self._config)
            return self._llm

    def _load_model(self, cfg: ModelConfig) -> Llama:
        if not cfg.model_path or not os.path.exists(cfg.model_path):
            raise RuntimeError(
                f"MODEL_PATH does not exist: {cfg.model_path!r}. Use /load to fetch a model or set MODEL_PATH."
            )
        t0 = time.time()
        llm = Llama(
            model_path=cfg.model_path,
            n_ctx=cfg.n_ctx,
            n_threads=cfg.n_threads,
            n_batch=cfg.n_batch,
            n_gpu_layers=cfg.n_gpu_layers,
            verbose=cfg.verbose,
        )
        dt = time.time() - t0
        print(f"[LLM] Loaded model in {dt:.2f}s | cfg={cfg.to_display()}")
        return llm

    def reload(self, new_cfg: Optional[ModelConfig] = None) -> Dict[str, Any]:
        with self._lock:
            self._llm = None
            if new_cfg is not None:
                self._config = new_cfg
            _ = self.get_llm()  # force a load to validate the config
            return {"status": "reloaded", "config": self._config.to_display()}
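
# Illustrative sketch (documentation only, not executed anywhere): how the
# manager could be driven programmatically, assuming a GGUF exists at the
# hypothetical paths below.
#
#   mgr = ModelManager(ModelConfig(model_path="/models/your-model.gguf"))
#   llm = mgr.get_llm()                                        # lazy-loads on first use
#   mgr.reload(ModelConfig(model_path="/models/other.gguf"))   # drop + load another model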


# -------------------------------
# Flask app
# -------------------------------
app = Flask(__name__)
CORS(app)

MODELS_DIR = getenv_str("MODELS_DIR", "/models") or "/models"
os.makedirs(MODELS_DIR, exist_ok=True)

# Auto-load options applied at startup
AUTO_REPO_ID = getenv_str("AUTO_REPO_ID")
AUTO_FILENAME = getenv_str("AUTO_FILENAME")  # e.g. "gemma-3-270m-it-Q4_K_M.gguf"
AUTO_IF_MISSING = getenv_bool("AUTO_IF_MISSING", True)  # only download when the file is missing
AUTO_PRELOAD = getenv_bool("AUTO_PRELOAD", False)  # load into memory at boot

manager = ModelManager()
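
# Illustrative .env sketch (all values below are placeholders, not settings
# shipped with this app):
#
#   MODELS_DIR=/models
#   AUTO_REPO_ID=your-org/your-model-GGUF
#   AUTO_FILENAME=gemma-3-270m-it-Q4_K_M.gguf
#   AUTO_IF_MISSING=1
#   AUTO_PRELOAD=0
#   N_CTX=4096
#   N_THREADS=8
#
# With AUTO_IF_MISSING=1 the GGUF is only downloaded when it is absent from
# MODELS_DIR; AUTO_PRELOAD=1 additionally loads it into memory at startup.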


def _bootstrap_autoload() -> Dict[str, Any]:
    """
    If MODEL_PATH does not exist and AUTO_REPO_ID/AUTO_FILENAME are provided,
    try to download the file into MODELS_DIR and point MODEL_PATH at it.
    If AUTO_PRELOAD=True, also load the model into memory (via reload()).
    """
    info: Dict[str, Any] = {
        "models_dir": MODELS_DIR,
        "auto_repo_id": AUTO_REPO_ID,
        "auto_filename": AUTO_FILENAME,
        "auto_if_missing": AUTO_IF_MISSING,
        "auto_preload": AUTO_PRELOAD,
        "performed": False,
        "path": None,
        "error": None,
    }

    cfg = manager.config
    target_path = cfg.model_path

    # If MODEL_PATH is not set but AUTO_FILENAME is, derive a target path.
    if (not target_path) and AUTO_FILENAME:
        target_path = os.path.join(MODELS_DIR, os.path.basename(AUTO_FILENAME))

    # If the file is already present, there is nothing to download.
    if target_path and os.path.exists(target_path):
        # Update the config (without preloading)
        manager.set_config(
            ModelConfig(
                model_path=target_path,
                n_ctx=cfg.n_ctx,
                n_threads=cfg.n_threads,
                n_batch=cfg.n_batch,
                n_gpu_layers=cfg.n_gpu_layers,
                verbose=cfg.verbose,
            )
        )
        if AUTO_PRELOAD:
            try:
                manager.reload()  # load into memory
            except Exception as e:
                info["error"] = f"preload_failed: {e}"
        info["performed"] = False
        info["path"] = target_path
        print(f"[BOOT] Model file already present at {target_path}")
        return info

    # The file does not exist: attempt an auto-load if possible.
    if AUTO_IF_MISSING and AUTO_REPO_ID and AUTO_FILENAME and HAS_HF:
        try:
            print(f"[BOOT] Auto-load from HF: {AUTO_REPO_ID}/{AUTO_FILENAME}")
            local_cached = hf_hub_download(repo_id=AUTO_REPO_ID, filename=AUTO_FILENAME)
            target_path = os.path.join(MODELS_DIR, os.path.basename(AUTO_FILENAME))
            if os.path.abspath(local_cached) != os.path.abspath(target_path):
                # Chunked copy
                with open(local_cached, "rb") as src, open(target_path, "wb") as dst:
                    while True:
                        chunk = src.read(1024 * 1024)
                        if not chunk:
                            break
                        dst.write(chunk)

            new_cfg = ModelConfig(
                model_path=target_path,
                n_ctx=cfg.n_ctx,
                n_threads=cfg.n_threads,
                n_batch=cfg.n_batch,
                n_gpu_layers=cfg.n_gpu_layers,
                verbose=cfg.verbose,
            )

            if AUTO_PRELOAD:
                manager.reload(new_cfg)  # load into memory right away
            else:
                manager.set_config(new_cfg)  # set the path; it will load on the first request

            info["performed"] = True
            info["path"] = target_path
            print(f"[BOOT] Auto-loaded model to {target_path} (preload={AUTO_PRELOAD})")
            return info
        except Exception as e:
            info["error"] = str(e)
            print(f"[BOOT][ERROR] Auto-load failed: {e}")
            return info

    # Auto-load not possible / not enabled
    return info


# Run the auto-load at startup (does not block HTTP availability)
BOOT_INFO = _bootstrap_autoload()


# -------------------------------
# Info endpoints
# -------------------------------
@app.get("/health")
def health() -> Any:
    """
    "Soft" health check:
    - If MODEL_PATH is unset or the file is missing -> status="unloaded" (HTTP 200, no 500).
    - If the file is present but the model is not loaded yet -> status="available".
    - If the model is in memory -> status="ready".
    """
    try:
        cfg = manager.config
        path = cfg.model_path
        if not path or not os.path.exists(path):
            return jsonify(
                {
                    "status": "unloaded",
                    "reason": "MODEL_PATH is not set or file not found",
                    "model": cfg.to_display(),
                    "exists": bool(path and os.path.exists(path)),
                    "boot_info": BOOT_INFO,
                    "hint": 'Use /load then /reload with {"model_path": "/models/your.gguf"} '
                    'or set AUTO_REPO_ID/AUTO_FILENAME in .env for auto-load.',
                }
            )
        if not manager.is_loaded():
            # File present, model not loaded yet
            return jsonify(
                {
                    "status": "available",
                    "model": cfg.to_display(),
                    "path": path,
                    "boot_info": BOOT_INFO,
                    "hint": 'Call /reload with {"model_path": "%s"}' % path,
                }
            )
        # Model already in memory
        return jsonify({"status": "ready", "model": cfg.to_display(), "path": path})
    except Exception as e:
        # Truly unexpected case
        return jsonify({"status": "error", "error": str(e)}), 500
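
# Illustrative client-side sketch (documentation only, never called by the
# server): read the /health status. The base URL is an assumption; adjust it
# to wherever this app is actually served.
def _example_check_health(base_url: str = "http://localhost:8000") -> str:
    """Return the current /health status: "unloaded", "available" or "ready"."""
    import urllib.request  # local import so flask's `request` is not shadowed

    with urllib.request.urlopen(f"{base_url}/health") as resp:
        payload = json.loads(resp.read().decode("utf-8"))
    return payload.get("status", "error")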


@app.get("/models")
def models() -> Any:
    """
    List the available GGUF files and the state of the active model.
    """
    cfg = manager.config

    available = []
    for f in os.listdir(MODELS_DIR):
        if f.lower().endswith(".gguf"):
            full = os.path.join(MODELS_DIR, f)
            try:
                size = os.path.getsize(full)
            except Exception:
                size = None
            available.append({"name": f, "path": full, "size_bytes": size})

    active_path = cfg.model_path
    active_exists = bool(active_path and os.path.exists(active_path))
    state = (
        "ready"
        if manager.is_loaded()
        else ("available" if active_exists else "unloaded")
    )

    return jsonify(
        {
            "object": "list",
            "active": {
                "path": active_path,
                "basename": os.path.basename(active_path) if active_path else None,
                "exists": active_exists,
                "loaded": manager.is_loaded(),
                "state": state,
                "config": cfg.to_display(),
            },
            "available": available,
            "hint": 'Switch with POST /reload {"model_path": "/models/<file>.gguf"}',
            "boot_info": BOOT_INFO,
        }
    )


# -------------------------------
# Model management (load/reload)
# -------------------------------
@app.post("/load")
def load_model_file() -> Any:
    """
    Download a GGUF from the Hugging Face Hub into MODELS_DIR.
    Body: {"repo_id": str, "filename": str}
    Does not reload the model automatically; call /reload afterwards.
    """
    if not HAS_HF:
        return (
            jsonify(
                {"error": {"message": "huggingface_hub is not installed on server."}}
            ),
            500,
        )

    body = request.get_json(force=True)
    repo_id = (body.get("repo_id") or "").strip()
    filename = (body.get("filename") or "").strip()
    if not repo_id or not filename:
        return jsonify({"error": {"message": "repo_id and filename are required"}}), 400

    try:
        # Honors an HF_TOKEN possibly set in the environment.
        local_cached = hf_hub_download(repo_id=repo_id, filename=filename)
        target_path = os.path.join(MODELS_DIR, os.path.basename(filename))
        if os.path.abspath(local_cached) != os.path.abspath(target_path):
            # Chunked copy from the HF cache into MODELS_DIR
            with open(local_cached, "rb") as src, open(target_path, "wb") as dst:
                while True:
                    chunk = src.read(1024 * 1024)
                    if not chunk:
                        break
                    dst.write(chunk)
        return jsonify(
            {
                "status": "downloaded",
                "repo_id": repo_id,
                "filename": filename,
                "path": target_path,
                "hint": 'POST /reload {"model_path": "%s"}' % target_path,
            }
        )
    except Exception as e:
        return jsonify({"error": {"message": str(e)}}), 500
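
# Illustrative client-side sketch (documentation only, never called by the
# server): ask the server to download a GGUF via /load. The base URL, repo id
# and filename below are placeholders.
def _example_download_model(base_url: str = "http://localhost:8000") -> Dict[str, Any]:
    import urllib.request  # local import so flask's `request` is not shadowed

    payload = {"repo_id": "your-org/your-model-GGUF", "filename": "your-model-Q4_K_M.gguf"}
    req = urllib.request.Request(
        f"{base_url}/load",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))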


@app.post("/reload")
def reload_model() -> Any:
    """
    Reload the model with a new MODEL_PATH and/or new hyperparameters.
    Body: {"model_path": str, "n_ctx": int, "n_threads": int, "n_batch": int, "n_gpu_layers": int, "verbose": bool}
    """
    payload = request.get_json(silent=True) or {}
    model_path = payload.get("model_path", manager.config.model_path)

    cfg = ModelConfig(
        model_path=model_path,
        n_ctx=int(payload.get("n_ctx", manager.config.n_ctx)),
        n_threads=int(payload.get("n_threads", manager.config.n_threads)),
        n_batch=int(payload.get("n_batch", manager.config.n_batch)),
        n_gpu_layers=int(payload.get("n_gpu_layers", manager.config.n_gpu_layers)),
        verbose=bool(payload.get("verbose", manager.config.verbose)),
    )
    try:
        info = manager.reload(cfg)
        return jsonify(info)
    except Exception as e:
        return jsonify({"status": "error", "error": str(e)}), 400
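
# Illustrative client-side sketch (documentation only, never called by the
# server): switch the served model through /reload. The base URL and model
# path are placeholders; downloaded files land under MODELS_DIR (see /load).
def _example_switch_model(base_url: str = "http://localhost:8000") -> Dict[str, Any]:
    import urllib.request  # local import so flask's `request` is not shadowed

    payload = {"model_path": "/models/your-model-Q4_K_M.gguf", "n_ctx": 4096, "n_gpu_layers": 0}
    req = urllib.request.Request(
        f"{base_url}/reload",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))  # {"status": "reloaded", ...} on success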


# -------------------------------
# Chat Completions (OpenAI-like) + SSE
# -------------------------------
def sse_format(data: Dict[str, Any]) -> str:
    return "data: " + json.dumps(data, ensure_ascii=False) + "\n\n"


@app.post("/v1/chat/completions")
def chat_completions() -> Any:
    """
    OpenAI-like endpoint (non-streaming, or SSE when {"stream": true}).
    Minimal body:
    {
      "messages": [{"role":"system","content":"..."}, {"role":"user","content":"..."}],
      "stream": true|false,
      "temperature": 0.7,
      "top_p": 0.95,
      "top_k": 64,
      "min_p": 0.001,
      "max_tokens": 256,
      "stop": ["..."],
      "response_format": {...}
    }
    """
    llm = manager.get_llm()
    body = request.get_json(force=True)

    messages: List[Dict[str, str]] = body.get("messages", [])
    if not messages:
        return jsonify({"error": {"message": "messages[] is required"}}), 400

    stream = bool(body.get("stream", False))

    params: Dict[str, Any] = {}
    for key in ("temperature", "top_p", "top_k", "min_p", "max_tokens", "stop"):
        if key in body:
            params[key] = body[key]

    if "response_format" in body:
        params["response_format"] = body["response_format"]

    if not stream:
        t0 = time.time()
        resp = llm.create_chat_completion(messages=messages, **params)
        dt = time.time() - t0
        out = {
            "id": resp.get("id", f"chatcmpl-{int(time.time()*1000)}"),
            "object": "chat.completion",
            "created": int(time.time()),
            "model": os.path.basename(manager.config.model_path or "local.gguf"),
            "choices": resp.get("choices", []),
            "usage": resp.get("usage", {}),
            "latency_ms": int(dt * 1000),
        }
        return jsonify(out)

    # SSE streaming
    def generate() -> Generator[str, None, None]:
        t0 = time.time()
        try:
            for chunk in llm.create_chat_completion(
                messages=messages, stream=True, **params
            ):
                # Chunks mirror the OpenAI format
                out = {
                    "id": chunk.get("id", f"chatcmpl-{int(time.time()*1000)}"),
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": os.path.basename(manager.config.model_path or "local.gguf"),
                    "choices": chunk.get("choices", []),
                }
                yield sse_format(out)
        except Exception as e:
            yield sse_format({"error": {"message": str(e)}})
        finally:
            yield sse_format({"done": True})
            yield "data: [DONE]\n\n"

    return Response(stream_with_context(generate()), mimetype="text/event-stream")
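
# Illustrative client-side sketches (documentation only, never called by the
# server): one non-streaming request and one SSE consumer for {"stream": true}.
# The base URL and prompts are placeholders.
def _example_chat(base_url: str = "http://localhost:8000") -> str:
    import urllib.request  # local import so flask's `request` is not shadowed

    payload = {
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "temperature": 0.7,
        "max_tokens": 64,
    }
    req = urllib.request.Request(
        f"{base_url}/v1/chat/completions",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        out = json.loads(resp.read().decode("utf-8"))
    return out["choices"][0]["message"]["content"]


def _example_chat_stream(base_url: str = "http://localhost:8000") -> str:
    """Same request with "stream": true, concatenating the SSE content deltas."""
    import urllib.request

    payload = {"messages": [{"role": "user", "content": "Count to three."}], "stream": True}
    req = urllib.request.Request(
        f"{base_url}/v1/chat/completions",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    pieces: List[str] = []
    with urllib.request.urlopen(req) as resp:
        for raw in resp:
            line = raw.decode("utf-8").strip()
            if not line.startswith("data: ") or line == "data: [DONE]":
                continue
            event = json.loads(line[len("data: "):])
            for choice in event.get("choices", []):
                delta = choice.get("delta") or {}
                if delta.get("content"):
                    pieces.append(delta["content"])
    return "".join(pieces)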


# -------------------------------
# JSON extraction (non-streaming)
# -------------------------------
@app.post("/v1/extract")
def extract() -> Any:
    """
    JSON extraction with an optional schema.
    Body:
      - text: str (required)
      - schema: dict (optional) -> forwarded via response_format when supported
      - system_prompt: str (optional)
      - temperature/top_p/top_k/min_p/max_tokens/stop: optional
    """
    llm = manager.get_llm()
    body = request.get_json(force=True)

    text: str = body.get("text", "").strip()
    if not text:
        return jsonify({"error": {"message": "'text' is required"}}), 400

    schema = body.get("schema")
    system_prompt = body.get(
        "system_prompt",
        "You are a helpful assistant that outputs ONLY valid minified JSON. Do not include prose.",
    )

    params: Dict[str, Any] = {}
    for key in ("temperature", "top_p", "top_k", "min_p", "max_tokens", "stop"):
        if key in body:
            params[key] = body[key]

    if isinstance(schema, dict):
        params["response_format"] = {"type": "json_object", "schema": schema}

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    try:
        t0 = time.time()
        resp = llm.create_chat_completion(messages=messages, **params)
        dt = time.time() - t0
        content = ((resp.get("choices") or [{}])[0].get("message") or {}).get(
            "content", ""
        )
        try:
            parsed = json.loads(content)
        except Exception as pe:
            return (
                jsonify(
                    {
                        "error": {
                            "message": "Model did not produce valid JSON.",
                            "model_output": content,
                            "parse_error": str(pe),
                        }
                    }
                ),
                422,
            )

        # Light best-effort check of the expected keys
        if isinstance(schema, dict) and "properties" in schema:
            missing = [
                k
                for k in schema["properties"].keys()
                if not (isinstance(parsed, dict) and k in parsed)
            ]
            if missing:
                return (
                    jsonify(
                        {
                            "error": {
                                "message": "JSON is valid but missing required keys (best-effort check).",
                                "missing": missing,
                                "model_output": parsed,
                            }
                        }
                    ),
                    206,
                )

        return jsonify(
            {
                "object": "extract.result",
                "created": int(time.time()),
                "model": os.path.basename(manager.config.model_path or "local.gguf"),
                "latency_ms": int(dt * 1000),
                "data": parsed,
            }
        )
    except Exception as e:
        return jsonify({"error": {"message": str(e)}}), 500
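
# Illustrative client-side sketch (documentation only, never called by the
# server): structured extraction with a best-effort schema. The base URL, text
# and schema below are placeholders.
def _example_extract(base_url: str = "http://localhost:8000") -> Dict[str, Any]:
    import urllib.request  # local import so flask's `request` is not shadowed

    payload = {
        "text": "Invoice 1234 issued to ACME Corp on 2024-05-01 for 199.99 EUR.",
        "schema": {
            "type": "object",
            "properties": {
                "invoice_number": {"type": "string"},
                "customer": {"type": "string"},
                "date": {"type": "string"},
                "amount": {"type": "number"},
                "currency": {"type": "string"},
            },
        },
        "max_tokens": 256,
    }
    req = urllib.request.Request(
        f"{base_url}/v1/extract",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))  # {"object": "extract.result", "data": {...}}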


# -------------------------------
# Entrypoint
# -------------------------------
if __name__ == "__main__":
    host = getenv_str("HOST", "0.0.0.0") or "0.0.0.0"
    port = getenv_int("PORT", 8000)
    debug = os.getenv("FLASK_DEBUG", "0") == "1"
    print("Starting Flask on", host, port, "| MODELS_DIR=", MODELS_DIR)
    app.run(host=host, port=port, debug=debug)
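
# Illustrative quickstart (assumptions: the server runs locally on port 8000
# and a GGUF is available; the _example_* helpers above are documentation-only):
#
#   1. python app.py
#   2. _example_check_health()    -> "unloaded" / "available" / "ready"
#   3. _example_download_model()  -> fetches a GGUF into MODELS_DIR via /load
#   4. _example_switch_model()    -> activates it via /reload
#   5. _example_chat()            -> first completion via /v1/chat/completions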