Spaces:

Pectics
/

Softie

Sleeping

Pectics committed on Jan 23

Commit

aa819ab

verified ·

1 Parent(s): 1325e72

Fix ext threads invoking @GPU

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
-from threading import Thread
-from spaces import GPU
 from gradio import ChatInterface, Textbox, Slider
 from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, TextIteratorStreamer, AutoProcessor, BatchFeature
 from qwen_vl_utils import process_vision_info
@@ -9,7 +9,7 @@ model_path = "Pectics/Softie-VL-7B-250123"
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     model_path,
-    torch_dtype="auto",
     attn_implementation="flash_attention_2",
     device_map="auto",
 )
@@ -18,9 +18,9 @@ max_pixels = 1280 * 28 * 28
 processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)
 @GPU
-def infer(inputs: BatchFeature, **kwargs) -> None:
     inputs = inputs.to("cuda")
-    model.generate(**kwargs)
 def respond(
     message,
@@ -51,7 +51,7 @@ def respond(
         temperature=temperature,
         top_p=top_p,
     )
-    Thread(target=infer, kwargs=kwargs).start()
     response = ""
     for token in streamer:
         response += token

 from gradio import ChatInterface, Textbox, Slider
+from spaces import GPU
+from threading import Thread
+from torch import bfloat16
 from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, TextIteratorStreamer, AutoProcessor, BatchFeature
 from qwen_vl_utils import process_vision_info
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     model_path,
+    torch_dtype=bfloat16,
     attn_implementation="flash_attention_2",
     device_map="auto",
 )
 processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)
 @GPU
+def infer(inputs: BatchFeature, **kwargs):
     inputs = inputs.to("cuda")
+    Thread(target=model.generate, kwargs=kwargs).start()
 def respond(
     message,
         temperature=temperature,
         top_p=top_p,
     )
+    infer(**kwargs)
     response = ""
     for token in streamer:
         response += token