Update app.py
app.py CHANGED
@@ -20,7 +20,7 @@ processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
 model = Gemma3ForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager"
 )
-model.eval()  #
+model.eval()  # Set model to evaluation mode.
 
 MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
 
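Note: model.eval() only toggles module training flags such as dropout; it does not disable gradient tracking, which is why generate_thread below still wraps generation in torch.no_grad(). A minimal self-contained sketch of what the flag changes (toy layer, not from app.py):

import torch

layer = torch.nn.Dropout(p=0.5)  # stand-in for a dropout module inside the model
x = torch.ones(4)

layer.train()    # training mode: dropout is active
print(layer(x))  # roughly half the entries zeroed, survivors scaled to 2.0

layer.eval()     # evaluation mode: dropout becomes a no-op
print(layer(x))  # tensor([1., 1., 1., 1.])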
@@ -172,7 +172,7 @@ def process_history(history: list[dict]) -> list[dict]:
 
 
 def generate_thread(generate_kwargs):
-    #
+    # Clear cache and run generation under no_grad.
    torch.cuda.empty_cache()
    with torch.no_grad():
        model.generate(**generate_kwargs)
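Note: the new comment is accurate on both counts. torch.cuda.empty_cache() releases cached allocator blocks before the next request, and torch.no_grad() stops autograd from recording the forward pass, so no activation graph is kept alive during generation. A small self-contained sketch of the no_grad effect:

import torch

w = torch.randn(8, 8, requires_grad=True)

y = w @ w
print(y.requires_grad)  # True: autograd tracked the matmul

with torch.no_grad():
    y = w @ w
print(y.requires_grad)  # False: no graph was built, so no extra memory is held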
@@ -190,13 +190,15 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tok
     messages.extend(process_history(history))
     messages.append({"role": "user", "content": process_new_user_message(message)})
 
-    inputs = processor.apply_chat_template(
+    # Apply the chat template, then move tensors to the model device, casting only floating-point tensors to bfloat16.
+    raw_inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
         return_tensors="pt",
-    ).to(device=model.device, dtype=torch.bfloat16)
+    )
+    inputs = {k: v.to(model.device, dtype=torch.bfloat16) if v.is_floating_point() else v.to(model.device) for k, v in raw_inputs.items()}
 
     streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
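Note: the is_floating_point() guard matters here. The replaced one-liner relied on BatchFeature.to(), which applies the dtype only to floating-point tensors; a naive comprehension that casts every value would turn input_ids into bfloat16 and break the embedding lookup in generate(). A self-contained sketch (keys and shapes are illustrative, not copied from app.py):

import torch

raw_inputs = {
    "input_ids": torch.tensor([[1, 2, 3]]),       # int64: token ids must stay integral
    "attention_mask": torch.tensor([[1, 1, 1]]),  # int64: the mask stays integral too
    "pixel_values": torch.randn(1, 3, 4, 4),      # float32: safe to cast down
}
inputs = {
    k: v.to(dtype=torch.bfloat16) if v.is_floating_point() else v
    for k, v in raw_inputs.items()
}
print({k: v.dtype for k, v in inputs.items()})
# {'input_ids': torch.int64, 'attention_mask': torch.int64, 'pixel_values': torch.bfloat16}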
@@ -204,7 +206,7 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tok
         streamer=streamer,
         max_new_tokens=max_new_tokens,
     )
-    # Launch generation in a separate thread
+    # Launch generation in a separate thread.
     t = Thread(target=generate_thread, kwargs={"generate_kwargs": generate_kwargs})
     t.start()
 
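Note: running model.generate() on a worker thread is what lets run() iterate over the streamer on the calling thread and yield partial text to Gradio as it arrives. TextIteratorStreamer is backed by a queue, so the pattern is an ordinary producer/consumer; a self-contained analogue (the toy producer stands in for model.generate(), names are illustrative):

from queue import Queue
from threading import Thread

def produce(q: Queue) -> None:
    # stands in for model.generate() pushing decoded chunks into the streamer
    for chunk in ["Hel", "lo, ", "wor", "ld!"]:
        q.put(chunk)
    q.put(None)  # sentinel: generation finished

q: Queue = Queue()
Thread(target=produce, args=(q,)).start()

output = ""
while (chunk := q.get(timeout=30.0)) is not None:
    output += chunk
    print(output)  # run() would `yield output` here to stream into the UI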
@@ -364,4 +366,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
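Note: share=True asks Gradio to tunnel the app through a temporary public *.gradio.live URL. A Space is already served publicly, so the flag mainly matters when running app.py on your own machine. A minimal sketch (toy interface, not the app's ChatInterface):

import gradio as gr

demo = gr.Interface(fn=lambda s: s[::-1], inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch(share=True)  # prints a public share link alongside the local URL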