ColdSlim committed · verified
Commit c818037 · 1 Parent(s): 79837da

Update app.py

Files changed (1)
  1. app.py +45 -35
app.py CHANGED
@@ -1,32 +1,31 @@
  """
- PetBull‑7B‑VL demo – ZeroGPU‑ready
+ PetBull‑7B‑VL demo – ZeroGPU‑ready (Qwen2.5‑VL API)
  """
  import os
- import torch
  import spaces
+ import torch
  import gradio as gr
  from PIL import Image
- from transformers import AutoProcessor, AutoModelForCausalLM
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
  from peft import PeftModel
+ from qwen_vl_utils import process_vision_info  # pip install qwen-vl-utils
  import transformers, accelerate, numpy as np

  print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)
-
- # 0) Safer streaming for model shards
  os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

- # 1) Config
+ # ---- Config ----
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
- ADAPTER_REPO = "ColdSlim/PetBull-7B"
+ ADAPTER_REPO = "ColdSlim/PetBull-7B"  # your LoRA
  ADAPTER_REV = "master"
  OFFLOAD_DIR = "offload"
  DTYPE = torch.float16

- # 2) Processor
+ # ---- Processor (no GPU) ----
  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

- # 3) Load base model ON CPU (no AutoConfig; rely on remote code)
- base = AutoModelForCausalLM.from_pretrained(
+ # ---- Base model ON CPU (do NOT touch CUDA here) ----
+ base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
      BASE_MODEL,
      torch_dtype=DTYPE,
      low_cpu_mem_usage=True,
@@ -35,7 +34,7 @@ base = AutoModelForCausalLM.from_pretrained(
      trust_remote_code=True,
  )

- # 4) Attach LoRA ON CPU
+ # ---- Attach LoRA ON CPU ----
  model = PeftModel.from_pretrained(
      base,
      ADAPTER_REPO,
@@ -43,44 +42,59 @@ model = PeftModel.from_pretrained(
      device_map={"": "cpu"},
  ).eval()

- _model_on_gpu = False  # track once-per-session transfer
+ _model_on_gpu = False  # once-per-session move

- # 5) Inference (request GPU only for this function)
+ # ---- Inference on GPU (ZeroGPU pattern) ----
  @spaces.GPU(duration=120)
- def generate_answer(
-     image,
-     question: str,
-     temperature: float = 0.7,
-     top_p: float = 0.95,
-     max_tokens: int = 256,
- ) -> str:
+ def generate_answer(image, question, temperature=0.7, top_p=0.95, max_tokens=256):
+     """
+     Uses Qwen2.5-VL chat template + qwen_vl_utils to prepare image+text, then generate.
+     """
      global _model_on_gpu
-
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

-     # Move model to GPU once (inside GPU-decorated function)
      if not _model_on_gpu:
          model.to("cuda")
          _model_on_gpu = True

-     # Prepare inputs on GPU
-     inputs = processor(text=[question], images=[image], return_tensors="pt")
-     inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}
+     # Build chat messages in Qwen format
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image},
+             {"type": "text", "text": question or "Describe this image."},
+         ],
+     }]
+
+     # Processor helpers
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     image_inputs, video_inputs = process_vision_info(messages)
+
+     # Pack tensors on GPU
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = {k: (v.to("cuda") if hasattr(v, "to") else v) for k, v in inputs.items()}

      with torch.no_grad():
-         output_ids = model.generate(
+         out = model.generate(
              **inputs,
              max_new_tokens=max_tokens,
              temperature=temperature,
              top_p=top_p,
          )

-     outputs = output_ids.to("cpu")
-     return processor.batch_decode(outputs, skip_special_tokens=True)[0]
+     # Trim prompt tokens before decode (Qwen style)
+     trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out)]
+     return processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

- # 6) UI
- with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
+ # ---- UI ----
+ with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU, Qwen2.5‑VL)") as demo:
      gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
      with gr.Row():
          with gr.Column():
@@ -93,10 +107,6 @@ with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
          with gr.Column():
              answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

-     ask.click(
-         generate_answer,
-         inputs=[img_in, txt_in, temp, topp, max_tok],
-         outputs=answer,
-     )
+     ask.click(generate_answer, inputs=[img_in, txt_in, temp, topp, max_tok], outputs=answer)

  demo.queue().launch()
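The input-packing path this commit switches to (chat template → process_vision_info → processor) can be smoke-tested on CPU without loading the 7B weights. Below is a minimal sketch under that assumption; it only reuses calls that appear in the new app.py, and the blank 224×224 image mirrors the fallback generate_answer already uses.

# CPU-only check of the Qwen2.5-VL input packing used above (no model, no GPU).
from PIL import Image
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info  # pip install qwen-vl-utils

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": Image.new("RGB", (224, 224), color="white")},
        {"type": "text", "text": "Describe this image."},
    ],
}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                   padding=True, return_tensors="pt")

# input_ids and pixel_values should both be present if the packing worked.
print({k: tuple(v.shape) for k, v in inputs.items() if hasattr(v, "shape")})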
 
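Once the Space is redeployed, the one-line ask.click wiring exposes generate_answer over the Gradio API, so the handler can also be exercised remotely with gradio_client. A minimal sketch follows; the Space id, the local image filename, and the api_name are assumptions (this commit does not pin them), so check view_api() for the real endpoint and argument order.

# Remote smoke test of the deployed Space (assumed Space id and endpoint name).
from gradio_client import Client, handle_file  # pip install gradio_client

client = Client("ColdSlim/PetBull-7B-VL")   # hypothetical Space id
client.view_api()                           # prints the real endpoint names and signatures

result = client.predict(
    handle_file("pet_photo.jpg"),   # img_in (hypothetical local file)
    "What could cause this rash?",  # txt_in
    0.7,                            # temp
    0.95,                           # topp
    256,                            # max_tok
    api_name="/generate_answer",    # assumed; use the name view_api() reports
)
print(result)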