Serg4451D committed · verified
Commit ebe16a8 · 1 Parent(s): 6a212af

Update app.py

Files changed (1):
  1. app.py +189 -85
app.py CHANGED
@@ -1,13 +1,15 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Minimalist visual chat in the style of messengers:
-- A compact input row at the bottom with a small image-attach button.
-- Automatic image caption (<MORE_DETAILED_CAPTION>) via NVIDIA Florence-2 (NIM API).
-- Streaming LLM reply via NVIDIA Integrate (OpenAI-compatible API).
-- No WebGPU/wasm, no bulky panels.
-
-Requires NV_API_KEY in the HF Space Secrets.
 """

 import os
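
The `llm` client used later in this diff is created outside the hunks shown. A minimal sketch of how an OpenAI-compatible client is typically pointed at NVIDIA Integrate (the endpoint URL and env-var name follow the docstring, but this exact setup is an assumption, not the file's actual code):

    import os
    from openai import OpenAI

    # Assumed client setup; the real initialization in app.py is outside the diff context.
    llm = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",  # NVIDIA Integrate, OpenAI-compatible
        api_key=os.environ["NV_API_KEY"],                # set in the HF Space Secrets
    )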
@@ -38,7 +40,6 @@ def _guess_mime(path: str) -> str:
     return mimetypes.guess_type(path)[0] or "image/jpeg"

 def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
-    # 1) authorize the upload
     auth = requests.post(
         NVCF_ASSETS_URL,
         headers={
@@ -52,7 +53,6 @@ def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
     auth.raise_for_status()
     up_url = auth.json()["uploadUrl"]
     asset_id = str(auth.json()["assetId"])
-    # 2) upload the binary
     with open(image_path, "rb") as f:
         put = requests.put(
             up_url,
@@ -66,43 +66,93 @@ def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
     put.raise_for_status()
     return asset_id

-def _vlm_content_more_detailed_caption(asset_id: str) -> str:
-    # Format: "<TASK_PROMPT><img>"
-    return f'<MORE_DETAILED_CAPTION><img src="data:image/jpeg;asset_id,{asset_id}" />'

-def _parse_vlm_response(resp: requests.Response) -> str:
     """
-    Returns the extracted text (caption/ocr/description) when available.
-    Florence-2 may return JSON or a ZIP with files.
     """
     ct = (resp.headers.get("content-type") or "").lower()
     data = resp.content

-    def extract_text_from_json(obj: Any) -> Optional[str]:
-        keys = ["more_detailed_caption", "detailed_caption", "caption", "text", "ocr", "description"]
-        def walk(o):
-            res = []
-            if isinstance(o, dict):
-                for k in keys:
-                    if k in o and isinstance(o[k], str) and o[k].strip():
-                        res.append(o[k].strip())
-                for v in o.values():
-                    res.extend(walk(v))
-            elif isinstance(o, list):
-                for it in o:
-                    res.extend(walk(it))
-            elif isinstance(o, str):
-                if o.strip():
-                    res.append(o.strip())
-            return res
-        arr = walk(obj)
-        return arr[0] if arr else None
-
-    # JSON
     if "application/json" in ct and not data.startswith(b"PK"):
         try:
             obj = resp.json()
-            return extract_text_from_json(obj) or json.dumps(obj, ensure_ascii=False)
         except Exception:
             pass

@@ -110,40 +160,64 @@ def _parse_vlm_response(resp: requests.Response) -> str:
     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
         try:
             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
-                primary = None
-                for name in z.namelist():
-                    with z.open(name) as f:
-                        raw = f.read()
-                    if name.lower().endswith(".json"):
                         try:
-                            obj = json.loads(raw.decode("utf-8", errors="ignore"))
-                            primary = extract_text_from_json(obj) or primary
                         except Exception:
-                            pass
-                    elif name.lower().endswith(".txt") and primary is None:
-                        txt = raw.decode("utf-8", errors="ignore").strip()
-                        if txt:
-                            primary = txt
-                return primary or "[No text result]"
         except Exception:
             pass

-    # Fallback: plain text
     try:
-        return data.decode("utf-8", errors="ignore")
     except Exception:
-        return "[Failed to parse the Florence-2 response]"
-
-def get_more_detailed_caption(image_path: str) -> Tuple[str, str]:
-    """
-    Returns (caption, asset_id) for the given image.
-    """
-    asset_id = nvcf_upload_asset(image_path)
-    content = _vlm_content_more_detailed_caption(asset_id)
     payload = {"messages": [{"role": "user", "content": content}]}
     headers = {
         "Authorization": f"Bearer {NV_API_KEY}",
-        "Accept": "application/json, application/zip, */*",
         "Content-Type": "application/json",
         "NVCF-INPUT-ASSET-REFERENCES": asset_id,
         "NVCF-FUNCTION-ASSET-IDS": asset_id,
@@ -151,8 +225,32 @@ def get_more_detailed_caption(image_path: str) -> Tuple[str, str]:
     resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
     if not resp.ok:
         raise RuntimeError(f"VLM HTTP {resp.status_code}: {resp.text}")
-    caption = _parse_vlm_response(resp)
-    return caption, asset_id

 # --------------------- LLM streaming utils ---------------------
 def _extract_text_from_stream_chunk(chunk: Any) -> str:
@@ -187,7 +285,6 @@ def respond(
 ):
     """
     message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
-    Returns a generator that streams the LLM reply.
     """
     text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
     files = (message or {}).get("files", []) if isinstance(message, dict) else []
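
For reference, a payload of the shape this function unpacks (values are illustrative, not from the app):

    message = {
        "text": "What breed is this dog?",
        "files": [{"path": "/tmp/gradio/dog.jpg", "mime_type": "image/jpeg"}],  # gradio file dict
    }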
@@ -195,7 +292,6 @@ def respond(
     def first_image_path(files) -> Optional[str]:
         for f in files:
             if isinstance(f, dict) and f.get("path"):
-                # gradio dict
                 mt = f.get("mime_type") or _guess_mime(f["path"])
                 if mt.startswith("image/"):
                     return f["path"]
@@ -206,7 +302,7 @@ def respond(

     img_path = first_image_path(files)

-    # Build the visible user message (clean and concise)
     parts = []
     if text and text.strip():
         parts.append(text.strip())
@@ -218,20 +314,30 @@ def respond(
     chat_history.append([user_visible, ""])
     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id

-    # Image caption (if a new one was attached)
     caption = last_caption or ""
     asset_id = last_asset_id or ""
     try:
         if img_path:
-            caption, asset_id = get_more_detailed_caption(img_path)
     except Exception as e:
-        caption = f"[Auto-caption error: {e}]"

-    # System prompt
     if caption:
         system_prompt = (
-            "You are a helpful multimodal assistant.\n"
-            "Use the provided 'More Detailed Caption' as authoritative visual context.\n"
             "If something is not visible or uncertain, say so.\n\n"
             "Image Caption START >>>\n"
             f"{caption}\n"
@@ -239,10 +345,14 @@ def respond(
         )
     else:
         system_prompt = (
-            "You are a helpful assistant. The user might have sent a text-only message. "
-            "If they refer to an image but no caption is available, ask them to attach an image."
         )

     # LLM streaming
     assistant_accum = ""
     try:
250
  model="openai/gpt-oss-120b",
251
  messages=[
252
  {"role": "system", "content": system_prompt},
253
- {"role": "user", "content": text or "Describe the attached image."}
254
  ],
255
  temperature=0.7,
256
  top_p=1.0,
@@ -265,14 +375,14 @@ def respond(
265
  chat_history[-1][1] = assistant_accum
266
  yield {"text": "", "files": []}, chat_history, caption, asset_id
267
 
268
- except Exception as e:
269
- # Фоллбэк без стрима
270
  try:
271
  resp = llm.chat.completions.create(
272
  model="openai/gpt-oss-120b",
273
  messages=[
274
  {"role": "system", "content": system_prompt},
275
- {"role": "user", "content": text or "Describe the attached image."}
276
  ],
277
  temperature=0.7,
278
  top_p=1.0,
@@ -314,7 +424,6 @@ messenger_css = """
 #send { min-width: 44px; max-width: 44px; height: 44px; border-radius: 999px; }
 #msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
 .gr-chatbot { border-radius: 0 !important; }
-.gr-chatbot .wrap.svelte-1cl0v3x { padding: 12px !important; } /* soft padding (selector may vary by version) */
 """

 theme = gr.themes.Soft(
@@ -334,13 +443,8 @@ with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
     asset_state = gr.State(value="")

     with gr.Group(elem_id="chat-wrap"):
-        chatbot = gr.Chatbot(
-            label="",
-            height=560,
-            elem_id="chat"
-        )

-        # Compact bottom input row with a small attachments button inside
         with gr.Row(elem_id="bottom-bar"):
             msg = gr.MultimodalTextbox(
                 show_label=False,
 
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+Elegant messenger-style chat:
+- Image-attach button right in the input row.
+- Florence-2 (NIM API) generates the caption (<MORE_DETAILED_CAPTION>) server-side.
+- Robust parser: pulls text out of ZIP/JSON, synthesizes a summary from detections,
+  and falls back through <DETAILED_CAPTION> → <CAPTION> → <OCR>.
+- LLM streaming via NVIDIA Integrate (OpenAI-compatible API).
+- No WebGPU.
+
+Requires: NV_API_KEY in the HF Space Secrets.
 """

 import os
 
     return mimetypes.guess_type(path)[0] or "image/jpeg"

 def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
     auth = requests.post(
         NVCF_ASSETS_URL,
         headers={

     auth.raise_for_status()
     up_url = auth.json()["uploadUrl"]
     asset_id = str(auth.json()["assetId"])

     with open(image_path, "rb") as f:
         put = requests.put(
             up_url,

     put.raise_for_status()
     return asset_id

+def _vlm_content(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> str:
+    # Format: "<TASK_PROMPT><text_prompt (when needed)><img>"
+    parts = [task_token]
+    if text_prompt and text_prompt.strip():
+        parts.append(text_prompt.strip())
+    parts.append(f'<img src="data:image/jpeg;asset_id,{asset_id}" />')
+    return "".join(parts)
+
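
For illustration, with a made-up asset ID the helper yields a prompt string like:

    _vlm_content("<MORE_DETAILED_CAPTION>", "123e4567-e89b-12d3-a456-426614174000")
    # -> '<MORE_DETAILED_CAPTION><img src="data:image/jpeg;asset_id,123e4567-e89b-12d3-a456-426614174000" />'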
+PRIORITY_TEXT_KEYS = [
+    "more_detailed_caption", "detailed_caption", "caption",
+    "generated_text", "text", "ocr", "description",
+    "output_text", "result_text",
+]
+LABEL_KEYS = ["label", "name", "category", "class", "text"]
+
+def _deep_text_candidates(obj: Any) -> List[str]:
+    out = []
+    def walk(o):
+        if isinstance(o, dict):
+            # Priority keys first
+            for k in PRIORITY_TEXT_KEYS:
+                if k in o and isinstance(o[k], str) and o[k].strip():
+                    out.append(o[k].strip())
+            # Then any string fields
+            for v in o.values():
+                walk(v)
+        elif isinstance(o, list):
+            for it in o:
+                walk(it)
+        elif isinstance(o, str):
+            if o.strip():
+                out.append(o.strip())
+    walk(obj)
+    return out
+
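
A small worked example: priority keys surface first, and only index 0 is consumed downstream (the generic descent can re-append the same string after the priority pass, which is harmless here):

    obj = {"result": {"more_detailed_caption": "A dog on a beach."}}
    _deep_text_candidates(obj)[0]  # -> 'A dog on a beach.'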
+def _synthesize_from_detections(obj: Any) -> Optional[str]:
+    """
+    If detections/objects came back, build a short summary like:
+    'Detected: person×2, dog×1'
+    """
+    labels = []
+    def walk(o):
+        if isinstance(o, dict):
+            # Detection lists under known keys
+            for key in ["detections", "predictions", "objects", "results"]:
+                if key in o and isinstance(o[key], list):
+                    for it in o[key]:
+                        if isinstance(it, dict):
+                            label = None
+                            for lk in LABEL_KEYS:
+                                if lk in it and isinstance(it[lk], str):
+                                    label = it[lk]
+                                    break
+                            if label:
+                                labels.append(label)
+            for v in o.values():
+                walk(v)
+        elif isinstance(o, list):
+            for it in o:
+                walk(it)
+    walk(obj)
+    if not labels:
+        return None
+    # Tally the labels
+    from collections import Counter
+    c = Counter(labels)
+    parts = [f"{k}×{v}" for k, v in c.most_common()]
+    return "Detected: " + ", ".join(parts)

+def _parse_vlm_response_to_text(resp: requests.Response) -> Tuple[str, List[str]]:
     """
+    Returns (best_text, zip_listing).
+    If nothing can be extracted, best_text = "" (important for the fallbacks).
     """
+    listing = []
     ct = (resp.headers.get("content-type") or "").lower()
     data = resp.content

+    # Inline JSON
     if "application/json" in ct and not data.startswith(b"PK"):
         try:
             obj = resp.json()
+            cands = _deep_text_candidates(obj)
+            if cands:
+                return cands[0], listing
+            synth = _synthesize_from_detections(obj)
+            return (synth or ""), listing
         except Exception:
             pass

     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
         try:
             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
+                listing = z.namelist()
+                text_cands = []
+                synth_cand = None
+                # Try JSON entries first
+                for name in listing:
+                    if not name.lower().endswith(".json"):
+                        continue
+                    try:
+                        with z.open(name) as f:
+                            obj = json.loads(f.read().decode("utf-8", errors="ignore"))
+                        text_cands += _deep_text_candidates(obj)
+                        synth = _synthesize_from_detections(obj)
+                        synth_cand = synth_cand or synth
+                    except Exception:
+                        continue
+                if text_cands:
+                    return text_cands[0], listing
+                # Then TXT entries
+                for name in listing:
+                    if name.lower().endswith(".txt"):
                         try:
+                            with z.open(name) as f:
+                                txt = f.read().decode("utf-8", errors="ignore").strip()
+                            if txt:
+                                return txt, listing
                         except Exception:
+                            continue
+                # If nothing else worked, try the synthesized detections summary
+                if synth_cand:
+                    return synth_cand, listing
         except Exception:
             pass

+    # Fallback: try to decode as plain text
     try:
+        txt = data.decode("utf-8", errors="ignore").strip()
+        return (txt if txt else ""), listing
     except Exception:
+        return "", listing
+
+def _is_good_caption(text: str) -> bool:
+    if not text:
+        return False
+    t = text.strip()
+    if not t or len(t) < 3:
+        return False
+    # Filter out our old placeholder strings
+    bad_markers = [
+        "Получено", "изображений-результатов", "[Result empty]", "[Результат пуст]"
+    ]
+    return not any(m.lower() in t.lower() for m in bad_markers)
+
+def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str, List[str]]:
+    content = _vlm_content(task_token, asset_id, text_prompt)
     payload = {"messages": [{"role": "user", "content": content}]}
     headers = {
         "Authorization": f"Bearer {NV_API_KEY}",
+        "Accept": "application/zip, application/json, */*",
         "Content-Type": "application/json",
         "NVCF-INPUT-ASSET-REFERENCES": asset_id,
         "NVCF-FUNCTION-ASSET-IDS": asset_id,

     resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
     if not resp.ok:
         raise RuntimeError(f"VLM HTTP {resp.status_code}: {resp.text}")
+    text, listing = _parse_vlm_response_to_text(resp)
+    return text, listing
+
+def get_robust_caption(image_path: str) -> Tuple[str, str, List[str]]:
+    """
+    Try to obtain a meaningful caption.
+    Returns (caption, asset_id, zip_listing).
+    """
+    asset_id = nvcf_upload_asset(image_path)
+    attempts = [
+        ("<MORE_DETAILED_CAPTION>", None),
+        ("<DETAILED_CAPTION>", None),
+        ("<CAPTION>", None),
+        ("<OCR>", None),
+    ]
+    last_listing: List[str] = []
+    for task, txt in attempts:
+        try:
+            caption, listing = _call_florence(task, asset_id, txt)
+            last_listing = listing or last_listing
+            if _is_good_caption(caption):
+                return caption, asset_id, listing
+        except Exception:
+            continue
+    # If nothing at all worked, return an empty string (important for the chat)
+    return "", asset_id, last_listing

 # --------------------- LLM streaming utils ---------------------
 def _extract_text_from_stream_chunk(chunk: Any) -> str:
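
The body of `_extract_text_from_stream_chunk` falls outside the diff context; a typical implementation for OpenAI-style stream chunks might look like this (an assumption, not the file's actual code):

    def _extract_text_from_stream_chunk(chunk) -> str:
        # OpenAI-style chunks carry incremental text in choices[0].delta.content.
        try:
            return chunk.choices[0].delta.content or ""
        except (AttributeError, IndexError):
            return ""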
 ):
     """
     message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
     """
     text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
     files = (message or {}).get("files", []) if isinstance(message, dict) else []

     def first_image_path(files) -> Optional[str]:
         for f in files:
             if isinstance(f, dict) and f.get("path"):
                 mt = f.get("mime_type") or _guess_mime(f["path"])
                 if mt.startswith("image/"):
                     return f["path"]

     img_path = first_image_path(files)

+    # The user's visible message (concise)
     parts = []
     if text and text.strip():
         parts.append(text.strip())

     chat_history.append([user_visible, ""])
     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id

+    # Image caption
     caption = last_caption or ""
     asset_id = last_asset_id or ""
     try:
         if img_path:
+            # Show the user that a caption is being generated
+            chat_history[-1][1] = "🔎 Generating an image caption…"
+            yield {"text": "", "files": []}, chat_history, caption, asset_id
+
+            caption, asset_id, _ = get_robust_caption(img_path)
+            if not _is_good_caption(caption):
+                caption = ""  # don't feed the LLM a junk caption
     except Exception as e:
+        caption = ""
+        # Briefly surface the under-the-hood error
+        chat_history[-1][1] = f"⚠️ Failed to get a caption: {e}"
+        yield {"text": "", "files": []}, chat_history, caption, asset_id

+    # System prompt (no chain-of-thought)
     if caption:
         system_prompt = (
+            "You are a helpful multimodal assistant. "
+            "Use the provided 'More Detailed Caption' as visual context. "
+            "Do not reveal your chain-of-thought. "
             "If something is not visible or uncertain, say so.\n\n"
             "Image Caption START >>>\n"
             f"{caption}\n"

         )
     else:
         system_prompt = (
+            "You are a helpful assistant. "
+            "If the user refers to an image but no caption is available, ask them to reattach the image. "
+            "Do not reveal your chain-of-thought."
         )

+    # Text for the model (if nothing was typed but an image is attached)
+    user_text_for_llm = text or ("Describe the attached image." if caption else "Hi")
+
     # LLM streaming
     assistant_accum = ""
     try:

             model="openai/gpt-oss-120b",
             messages=[
                 {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_text_for_llm}
             ],
             temperature=0.7,
             top_p=1.0,

             chat_history[-1][1] = assistant_accum
             yield {"text": "", "files": []}, chat_history, caption, asset_id

+    except Exception:
+        # Fallback without streaming
         try:
             resp = llm.chat.completions.create(
                 model="openai/gpt-oss-120b",
                 messages=[
                     {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_text_for_llm}
                 ],
                 temperature=0.7,
                 top_p=1.0,

 #send { min-width: 44px; max-width: 44px; height: 44px; border-radius: 999px; }
 #msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
 .gr-chatbot { border-radius: 0 !important; }
 """

 theme = gr.themes.Soft(
 
     asset_state = gr.State(value="")

     with gr.Group(elem_id="chat-wrap"):
+        chatbot = gr.Chatbot(label="", height=560, elem_id="chat")

         with gr.Row(elem_id="bottom-bar"):
             msg = gr.MultimodalTextbox(
                 show_label=False,