Vintern-3B-Demo

Running on Zero

qnguyen3 committed on Apr 8

Commit

6af2451

•

1 Parent(s): 3533245

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 from threading import Thread
 import re
 import time
@@ -9,6 +10,8 @@ import spaces
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 tokenizer = AutoTokenizer.from_pretrained(
     'qnguyen3/nanoLLaVA',
     trust_remote_code=True)
@@ -18,7 +21,7 @@ model = AutoModelForCausalLM.from_pretrained(
     torch_dtype=torch.float16,
     device_map='auto',
     trust_remote_code=True)
-model.to("cuda:0")
 @spaces.GPU
 def bot_streaming(message, history):
@@ -57,9 +60,10 @@ def bot_streaming(message, history):
         add_generation_prompt=True)
     text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
     input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
-    streamer = TextStreamer(tokenizer, **{"skip_special_tokens": True})
     image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
-    generation_kwargs = dict(inputs=input_ids, images=image_tensor, streamer=streamer, max_new_tokens=100)
     generated_text = ""
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()

 import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 import re
 import time
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+torch.set_default_device('cuda')
 tokenizer = AutoTokenizer.from_pretrained(
     'qnguyen3/nanoLLaVA',
     trust_remote_code=True)
     torch_dtype=torch.float16,
     device_map='auto',
     trust_remote_code=True)
 @spaces.GPU
 def bot_streaming(message, history):
         add_generation_prompt=True)
     text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
     input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens = True)
     image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
+    generation_kwargs = dict(input_ids=input_ids, images=image_tensor, streamer=streamer, max_new_tokens=100)
     generated_text = ""
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()