Spaces:

yejunliang23
/

ShapLLM-Omni

Running on Zero

App Files Files Community

yejunliang23 committed on May 26

Commit

eb04357

unverified ·

1 Parent(s): 3c5e0ed

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -8

app.py CHANGED Viewed

@@ -2,7 +2,8 @@ import os
 import torch
 from threading import Thread
 import gradio as gr
-from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 # 3D mesh dependencies
 import trimesh
@@ -13,20 +14,63 @@ import tempfile
 # --------- Configuration & Model Loading ---------
 MODEL_DIR = "Qwen/Qwen2-VL-7B-Instruct"
 # Load processor, tokenizer, model for Qwen2.5-VL
-processor = AutoProcessor.from_pretrained(MODEL_DIR)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
     MODEL_DIR,
     torch_dtype=torch.float16,
     device_map="auto",
     trust_remote_code=True
 )
-# Terminator tokens
-terminators = [tokenizer.eos_token_id]
 # --------- Chat Inference Function ---------
-def chat_qwen_vl(message: str, history: list, temperature: float = 0.7, max_new_tokens: int = 1024):
     """
     Stream chat response from local Qwen2.5-VL model.
     """

 import torch
 from threading import Thread
 import gradio as gr
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
 # 3D mesh dependencies
 import trimesh
 # --------- Configuration & Model Loading ---------
 MODEL_DIR = "Qwen/Qwen2-VL-7B-Instruct"
 # Load the model and processor for Qwen2.5-VL (the tokenizer is reached via processor.tokenizer)
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_DIR,
     torch_dtype=torch.float16,
     device_map="auto",
     trust_remote_code=True
 )
+processor = AutoProcessor.from_pretrained(MODEL_DIR)
 # --------- Chat Inference Function ---------
+def chat_qwen_vl(messages, *, temperature=0.1, top_p=0.1, top_k=1024, max_new_tokens=1280):
+    """Stream a reply from the module-level Qwen-VL ``model`` for ``messages``.
+
+    Args:
+        messages: Chat messages in the structure accepted by both
+            ``processor.apply_chat_template`` and ``process_vision_info``.
+        temperature: Sampling temperature; ``0`` switches to greedy decoding.
+        top_p: Nucleus-sampling threshold (unused when ``temperature`` is 0).
+        top_k: Top-k sampling cutoff (unused when ``temperature`` is 0).
+        max_new_tokens: Upper bound on the number of generated tokens.
+
+    Yields:
+        The cumulative response text after each newly decoded chunk.
+    """
+    # Imported here because the module-level transformers import does not
+    # bring TextIteratorStreamer into scope.
+    from transformers import TextIteratorStreamer
+
+    # AutoProcessor carries its own tokenizer; reuse it for streaming and EOS.
+    tokenizer = processor.tokenizer
+
+    # --- Build the multimodal model inputs ---
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to(model.device)
+
+    # --- Streaming generation ---
+    # skip_prompt / skip_special_tokens keep the echoed prompt and control
+    # tokens out of the streamed text.
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=10.0,
+        skip_prompt=True,
+        skip_special_tokens=True,
+    )
+    gen_kwargs = dict(
+        **inputs,  # input_ids, attention_mask, pixel values, ...
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        # Stop on the tokenizer's EOS token (the former module-level
+        # `terminators` list no longer exists in this file).
+        eos_token_id=[tokenizer.eos_token_id],
+    )
+    if temperature == 0:
+        # Zero temperature means greedy decoding: disable sampling and omit
+        # the sampling-only parameters.
+        gen_kwargs["do_sample"] = False
+    else:
+        gen_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)
+
+    # Run generation in a background thread so this generator can consume
+    # the streamer as tokens arrive.
+    Thread(target=model.generate, kwargs=gen_kwargs).start()
+
+    # Emit the full text accumulated so far every time a new chunk arrives.
+    buffer = []
+    for chunk in streamer:
+        buffer.append(chunk)
+        yield "".join(buffer)
+def chat_qwen_vl_(message: str, history: list, temperature: float = 0.7, max_new_tokens: int = 1024):
     """
     Stream chat response from local Qwen2.5-VL model.
     """