Serg4451D committed · verified
Commit ebe16a8 · 1 Parent(s): 6a212af

Update app.py

Files changed (1):
  1. app.py +189 -85
app.py CHANGED
@@ -1,13 +1,15 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Minimalist visual chat in the style of messengers:
-- A compact input row at the bottom with a small image-attach button.
-- Automatic image caption (<MORE_DETAILED_CAPTION>) via NVIDIA Florence-2 (NIM API).
-- Streaming LLM reply via NVIDIA Integrate (OpenAI-compatible API).
-- No WebGPU/wasm, no bulky panels.
-
-Requires NV_API_KEY in the HF Space Secrets.
 """

 import os
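
The `llm` client used later in this diff is created outside the hunks shown. A minimal sketch of how an OpenAI-compatible client is typically pointed at NVIDIA Integrate (the endpoint URL and env-var name follow the docstring, but this exact setup is an assumption, not the file's actual code):

    import os
    from openai import OpenAI

    # Assumed client setup; the real initialization in app.py is outside the diff context.
    llm = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",  # NVIDIA Integrate, OpenAI-compatible
        api_key=os.environ["NV_API_KEY"],                # set in the HF Space Secrets
    )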
@@ -38,7 +40,6 @@ def _guess_mime(path: str) -> str:
     return mimetypes.guess_type(path)[0] or "image/jpeg"

 def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
-    # 1) authorize the upload
     auth = requests.post(
         NVCF_ASSETS_URL,
         headers={
@@ -52,7 +53,6 @@ def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
     auth.raise_for_status()
     up_url = auth.json()["uploadUrl"]
     asset_id = str(auth.json()["assetId"])
-    # 2) upload the binary
     with open(image_path, "rb") as f:
         put = requests.put(
             up_url,
@@ -66,43 +66,93 @@ def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
     put.raise_for_status()
     return asset_id

-def _vlm_content_more_detailed_caption(asset_id: str) -> str:
-    # Format: "<TASK_PROMPT><img>"
-    return f'<MORE_DETAILED_CAPTION><img src="data:image/jpeg;asset_id,{asset_id}" />'

-def _parse_vlm_response(resp: requests.Response) -> str:
     """
-    Returns the extracted text (caption/ocr/description) when available.
-    Florence-2 may return JSON or a ZIP with files.
     """
     ct = (resp.headers.get("content-type") or "").lower()
     data = resp.content

-    def extract_text_from_json(obj: Any) -> Optional[str]:
-        keys = ["more_detailed_caption", "detailed_caption", "caption", "text", "ocr", "description"]
-        def walk(o):
-            res = []
-            if isinstance(o, dict):
-                for k in keys:
-                    if k in o and isinstance(o[k], str) and o[k].strip():
-                        res.append(o[k].strip())
-                for v in o.values():
-                    res.extend(walk(v))
-            elif isinstance(o, list):
-                for it in o:
-                    res.extend(walk(it))
-            elif isinstance(o, str):
-                if o.strip():
-                    res.append(o.strip())
-            return res
-        arr = walk(obj)
-        return arr[0] if arr else None
-
-    # JSON
     if "application/json" in ct and not data.startswith(b"PK"):
         try:
             obj = resp.json()
-            return extract_text_from_json(obj) or json.dumps(obj, ensure_ascii=False)
         except Exception:
             pass

@@ -110,40 +160,64 @@ def _parse_vlm_response(resp: requests.Response) -> str:
     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
         try:
             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
-                primary = None
-                for name in z.namelist():
-                    with z.open(name) as f:
-                        raw = f.read()
-                    if name.lower().endswith(".json"):
                         try:
-                            obj = json.loads(raw.decode("utf-8", errors="ignore"))
-                            primary = extract_text_from_json(obj) or primary
                         except Exception:
-                            pass
-                    elif name.lower().endswith(".txt") and primary is None:
-                        txt = raw.decode("utf-8", errors="ignore").strip()
-                        if txt:
-                            primary = txt
-                return primary or "[No text result]"
         except Exception:
             pass

-    # Fallback: plain text
     try:
-        return data.decode("utf-8", errors="ignore")
     except Exception:
-        return "[Failed to parse the Florence-2 response]"
-
-def get_more_detailed_caption(image_path: str) -> Tuple[str, str]:
-    """
-    Returns (caption, asset_id) for the given image.
-    """
-    asset_id = nvcf_upload_asset(image_path)
-    content = _vlm_content_more_detailed_caption(asset_id)
     payload = {"messages": [{"role": "user", "content": content}]}
     headers = {
         "Authorization": f"Bearer {NV_API_KEY}",
-        "Accept": "application/json, application/zip, */*",
         "Content-Type": "application/json",
         "NVCF-INPUT-ASSET-REFERENCES": asset_id,
         "NVCF-FUNCTION-ASSET-IDS": asset_id,
@@ -151,8 +225,32 @@ def get_more_detailed_caption(image_path: str) -> Tuple[str, str]:
     resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
     if not resp.ok:
         raise RuntimeError(f"VLM HTTP {resp.status_code}: {resp.text}")
-    caption = _parse_vlm_response(resp)
-    return caption, asset_id

 # --------------------- LLM streaming utils ---------------------
 def _extract_text_from_stream_chunk(chunk: Any) -> str:
@@ -187,7 +285,6 @@ def respond(
 ):
     """
     message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
-    Returns a generator that streams the LLM reply.
     """
     text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
     files = (message or {}).get("files", []) if isinstance(message, dict) else []
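
For reference, a payload of the shape this function unpacks (values are illustrative, not from the app):

    message = {
        "text": "What breed is this dog?",
        "files": [{"path": "/tmp/gradio/dog.jpg", "mime_type": "image/jpeg"}],  # gradio file dict
    }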
@@ -195,7 +292,6 @@ def respond(
     def first_image_path(files) -> Optional[str]:
         for f in files:
             if isinstance(f, dict) and f.get("path"):
-                # gradio dict
                 mt = f.get("mime_type") or _guess_mime(f["path"])
                 if mt.startswith("image/"):
                     return f["path"]
@@ -206,7 +302,7 @@ def respond(

     img_path = first_image_path(files)

-    # Build the visible user message (clean and concise)
     parts = []
     if text and text.strip():
         parts.append(text.strip())
@@ -218,20 +314,30 @@ def respond(
     chat_history.append([user_visible, ""])
     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id

-    # Image caption (if a new one was attached)
     caption = last_caption or ""
     asset_id = last_asset_id or ""
     try:
         if img_path:
-            caption, asset_id = get_more_detailed_caption(img_path)
     except Exception as e:
-        caption = f"[Auto-caption error: {e}]"

-    # System prompt
     if caption:
         system_prompt = (
-            "You are a helpful multimodal assistant.\n"
-            "Use the provided 'More Detailed Caption' as authoritative visual context.\n"
             "If something is not visible or uncertain, say so.\n\n"
             "Image Caption START >>>\n"
             f"{caption}\n"
@@ -239,10 +345,14 @@ def respond(
         )
     else:
         system_prompt = (
-            "You are a helpful assistant. The user might have sent a text-only message. "
-            "If they refer to an image but no caption is available, ask them to attach an image."
         )

     # LLM streaming
     assistant_accum = ""
     try:
250
  model="openai/gpt-oss-120b",
251
  messages=[
252
  {"role": "system", "content": system_prompt},
253
- {"role": "user", "content": text or "Describe the attached image."}
254
  ],
255
  temperature=0.7,
256
  top_p=1.0,
@@ -265,14 +375,14 @@ def respond(
265
  chat_history[-1][1] = assistant_accum
266
  yield {"text": "", "files": []}, chat_history, caption, asset_id
267
 
268
- except Exception as e:
269
- # Фоллбэк без стрима
270
  try:
271
  resp = llm.chat.completions.create(
272
  model="openai/gpt-oss-120b",
273
  messages=[
274
  {"role": "system", "content": system_prompt},
275
- {"role": "user", "content": text or "Describe the attached image."}
276
  ],
277
  temperature=0.7,
278
  top_p=1.0,
@@ -314,7 +424,6 @@ messenger_css = """
 #send { min-width: 44px; max-width: 44px; height: 44px; border-radius: 999px; }
 #msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
 .gr-chatbot { border-radius: 0 !important; }
-.gr-chatbot .wrap.svelte-1cl0v3x { padding: 12px !important; } /* soft padding (selector may vary by version) */
 """

 theme = gr.themes.Soft(
@@ -334,13 +443,8 @@ with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
     asset_state = gr.State(value="")

     with gr.Group(elem_id="chat-wrap"):
-        chatbot = gr.Chatbot(
-            label="",
-            height=560,
-            elem_id="chat"
-        )

-        # Compact bottom input row with a small attachments button inside
         with gr.Row(elem_id="bottom-bar"):
             msg = gr.MultimodalTextbox(
                 show_label=False,
 
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+Elegant messenger-style chat:
+- Image-attach button right in the input row.
+- Florence-2 (NIM API) generates the caption (<MORE_DETAILED_CAPTION>) server-side.
+- Robust parser: pulls text out of ZIP/JSON, synthesizes a summary from detections,
+  and falls back through <DETAILED_CAPTION> → <CAPTION> → <OCR>.
+- LLM streaming via NVIDIA Integrate (OpenAI-compatible API).
+- No WebGPU.
+
+Requires: NV_API_KEY in the HF Space Secrets.
 """

 import os
 
     return mimetypes.guess_type(path)[0] or "image/jpeg"

 def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
     auth = requests.post(
         NVCF_ASSETS_URL,
         headers={

     auth.raise_for_status()
     up_url = auth.json()["uploadUrl"]
     asset_id = str(auth.json()["assetId"])

     with open(image_path, "rb") as f:
         put = requests.put(
             up_url,

     put.raise_for_status()
     return asset_id

+def _vlm_content(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> str:
+    # Format: "<TASK_PROMPT><text_prompt (when needed)><img>"
+    parts = [task_token]
+    if text_prompt and text_prompt.strip():
+        parts.append(text_prompt.strip())
+    parts.append(f'<img src="data:image/jpeg;asset_id,{asset_id}" />')
+    return "".join(parts)
+
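
For illustration, with a made-up asset ID the helper yields a prompt string like:

    _vlm_content("<MORE_DETAILED_CAPTION>", "123e4567-e89b-12d3-a456-426614174000")
    # -> '<MORE_DETAILED_CAPTION><img src="data:image/jpeg;asset_id,123e4567-e89b-12d3-a456-426614174000" />'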
+PRIORITY_TEXT_KEYS = [
+    "more_detailed_caption", "detailed_caption", "caption",
+    "generated_text", "text", "ocr", "description",
+    "output_text", "result_text",
+]
+LABEL_KEYS = ["label", "name", "category", "class", "text"]
+
+def _deep_text_candidates(obj: Any) -> List[str]:
+    out = []
+    def walk(o):
+        if isinstance(o, dict):
+            # Priority keys first
+            for k in PRIORITY_TEXT_KEYS:
+                if k in o and isinstance(o[k], str) and o[k].strip():
+                    out.append(o[k].strip())
+            # Then any string fields
+            for v in o.values():
+                walk(v)
+        elif isinstance(o, list):
+            for it in o:
+                walk(it)
+        elif isinstance(o, str):
+            if o.strip():
+                out.append(o.strip())
+    walk(obj)
+    return out
+
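
A small worked example: priority keys surface first, and only index 0 is consumed downstream (the generic descent can re-append the same string after the priority pass, which is harmless here):

    obj = {"result": {"more_detailed_caption": "A dog on a beach."}}
    _deep_text_candidates(obj)[0]  # -> 'A dog on a beach.'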
+def _synthesize_from_detections(obj: Any) -> Optional[str]:
+    """
+    If detections/objects came back, build a short summary like:
+    'Detected: person×2, dog×1'
+    """
+    labels = []
+    def walk(o):
+        if isinstance(o, dict):
+            # Detection lists under known keys
+            for key in ["detections", "predictions", "objects", "results"]:
+                if key in o and isinstance(o[key], list):
+                    for it in o[key]:
+                        if isinstance(it, dict):
+                            label = None
+                            for lk in LABEL_KEYS:
+                                if lk in it and isinstance(it[lk], str):
+                                    label = it[lk]
+                                    break
+                            if label:
+                                labels.append(label)
+            for v in o.values():
+                walk(v)
+        elif isinstance(o, list):
+            for it in o:
+                walk(it)
+    walk(obj)
+    if not labels:
+        return None
+    # Tally the labels
+    from collections import Counter
+    c = Counter(labels)
+    parts = [f"{k}×{v}" for k, v in c.most_common()]
+    return "Detected: " + ", ".join(parts)

+def _parse_vlm_response_to_text(resp: requests.Response) -> Tuple[str, List[str]]:
     """
+    Returns (best_text, zip_listing).
+    If nothing can be extracted, best_text = "" (important for the fallbacks).
     """
+    listing = []
     ct = (resp.headers.get("content-type") or "").lower()
     data = resp.content

+    # Inline JSON
     if "application/json" in ct and not data.startswith(b"PK"):
         try:
             obj = resp.json()
+            cands = _deep_text_candidates(obj)
+            if cands:
+                return cands[0], listing
+            synth = _synthesize_from_detections(obj)
+            return (synth or ""), listing
         except Exception:
             pass

     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
         try:
             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
+                listing = z.namelist()
+                text_cands = []
+                synth_cand = None
+                # Try JSON entries first
+                for name in listing:
+                    if not name.lower().endswith(".json"):
+                        continue
+                    try:
+                        with z.open(name) as f:
+                            obj = json.loads(f.read().decode("utf-8", errors="ignore"))
+                        text_cands += _deep_text_candidates(obj)
+                        synth = _synthesize_from_detections(obj)
+                        synth_cand = synth_cand or synth
+                    except Exception:
+                        continue
+                if text_cands:
+                    return text_cands[0], listing
+                # Then TXT entries
+                for name in listing:
+                    if name.lower().endswith(".txt"):
                         try:
+                            with z.open(name) as f:
+                                txt = f.read().decode("utf-8", errors="ignore").strip()
+                            if txt:
+                                return txt, listing
                         except Exception:
+                            continue
+                # If nothing else worked, try the synthesized detections summary
+                if synth_cand:
+                    return synth_cand, listing
         except Exception:
             pass

+    # Fallback: try to decode as plain text
     try:
+        txt = data.decode("utf-8", errors="ignore").strip()
+        return (txt if txt else ""), listing
     except Exception:
+        return "", listing
+
+def _is_good_caption(text: str) -> bool:
+    if not text:
+        return False
+    t = text.strip()
+    if not t or len(t) < 3:
+        return False
+    # Filter out our old placeholder strings
+    bad_markers = [
+        "Получено", "изображений-результатов", "[Result empty]", "[Результат пуст]"
+    ]
+    return not any(m.lower() in t.lower() for m in bad_markers)
+
+def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str, List[str]]:
+    content = _vlm_content(task_token, asset_id, text_prompt)
     payload = {"messages": [{"role": "user", "content": content}]}
     headers = {
         "Authorization": f"Bearer {NV_API_KEY}",
+        "Accept": "application/zip, application/json, */*",
         "Content-Type": "application/json",
         "NVCF-INPUT-ASSET-REFERENCES": asset_id,
         "NVCF-FUNCTION-ASSET-IDS": asset_id,

     resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
     if not resp.ok:
         raise RuntimeError(f"VLM HTTP {resp.status_code}: {resp.text}")
+    text, listing = _parse_vlm_response_to_text(resp)
+    return text, listing
+
+def get_robust_caption(image_path: str) -> Tuple[str, str, List[str]]:
+    """
+    Try to obtain a meaningful caption.
+    Returns (caption, asset_id, zip_listing).
+    """
+    asset_id = nvcf_upload_asset(image_path)
+    attempts = [
+        ("<MORE_DETAILED_CAPTION>", None),
+        ("<DETAILED_CAPTION>", None),
+        ("<CAPTION>", None),
+        ("<OCR>", None),
+    ]
+    last_listing: List[str] = []
+    for task, txt in attempts:
+        try:
+            caption, listing = _call_florence(task, asset_id, txt)
+            last_listing = listing or last_listing
+            if _is_good_caption(caption):
+                return caption, asset_id, listing
+        except Exception:
+            continue
+    # If nothing at all worked, return an empty string (important for the chat)
+    return "", asset_id, last_listing

 # --------------------- LLM streaming utils ---------------------
 def _extract_text_from_stream_chunk(chunk: Any) -> str:
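
The body of `_extract_text_from_stream_chunk` falls outside the diff context; a typical implementation for OpenAI-style stream chunks might look like this (an assumption, not the file's actual code):

    def _extract_text_from_stream_chunk(chunk) -> str:
        # OpenAI-style chunks carry incremental text in choices[0].delta.content.
        try:
            return chunk.choices[0].delta.content or ""
        except (AttributeError, IndexError):
            return ""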
 ):
     """
     message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
     """
     text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
     files = (message or {}).get("files", []) if isinstance(message, dict) else []

     def first_image_path(files) -> Optional[str]:
         for f in files:
             if isinstance(f, dict) and f.get("path"):
                 mt = f.get("mime_type") or _guess_mime(f["path"])
                 if mt.startswith("image/"):
                     return f["path"]

     img_path = first_image_path(files)

+    # The user's visible message (concise)
     parts = []
     if text and text.strip():
         parts.append(text.strip())

     chat_history.append([user_visible, ""])
     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id

+    # Image caption
     caption = last_caption or ""
     asset_id = last_asset_id or ""
     try:
         if img_path:
+            # Show the user that a caption is being generated
+            chat_history[-1][1] = "🔎 Generating an image caption…"
+            yield {"text": "", "files": []}, chat_history, caption, asset_id
+
+            caption, asset_id, _ = get_robust_caption(img_path)
+            if not _is_good_caption(caption):
+                caption = ""  # don't feed the LLM a junk caption
     except Exception as e:
+        caption = ""
+        # Briefly surface the under-the-hood error
+        chat_history[-1][1] = f"⚠️ Failed to get a caption: {e}"
+        yield {"text": "", "files": []}, chat_history, caption, asset_id

+    # System prompt (no chain-of-thought)
     if caption:
         system_prompt = (
+            "You are a helpful multimodal assistant. "
+            "Use the provided 'More Detailed Caption' as visual context. "
+            "Do not reveal your chain-of-thought. "
             "If something is not visible or uncertain, say so.\n\n"
             "Image Caption START >>>\n"
             f"{caption}\n"

         )
     else:
         system_prompt = (
+            "You are a helpful assistant. "
+            "If the user refers to an image but no caption is available, ask them to reattach the image. "
+            "Do not reveal your chain-of-thought."
         )

+    # Text for the model (if nothing was typed but an image is attached)
+    user_text_for_llm = text or ("Describe the attached image." if caption else "Hi")
+
     # LLM streaming
     assistant_accum = ""
     try:

             model="openai/gpt-oss-120b",
             messages=[
                 {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_text_for_llm}
             ],
             temperature=0.7,
             top_p=1.0,

             chat_history[-1][1] = assistant_accum
             yield {"text": "", "files": []}, chat_history, caption, asset_id

+    except Exception:
+        # Fallback without streaming
         try:
             resp = llm.chat.completions.create(
                 model="openai/gpt-oss-120b",
                 messages=[
                     {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_text_for_llm}
                 ],
                 temperature=0.7,
                 top_p=1.0,

 #send { min-width: 44px; max-width: 44px; height: 44px; border-radius: 999px; }
 #msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
 .gr-chatbot { border-radius: 0 !important; }
 """

 theme = gr.themes.Soft(
 
     asset_state = gr.State(value="")

     with gr.Group(elem_id="chat-wrap"):
+        chatbot = gr.Chatbot(label="", height=560, elem_id="chat")

         with gr.Row(elem_id="bottom-bar"):
             msg = gr.MultimodalTextbox(
                 show_label=False,