ColdSlim committed
Commit c8b3c1b · verified · 1 Parent(s): f257dd7

Update app.py

Files changed (1)
  1. app.py +50 -61
app.py CHANGED
@@ -1,101 +1,90 @@
  """
- PetBull-7B-VL demo – CPU-only, 16 GB-friendly
- --------------------------------------------
-
- • Base model : Qwen/Qwen2.5-VL-7B-Instruct
- • LoRA adapter: ColdSlim/PetBull-7B (master branch)
-
- This script:
- ✓ loads in bfloat16 (saves ~25 % RAM vs FP16)
- ✓ streams weights to avoid peak memory spikes
- ✓ off-loads large tensors to disk when RAM is tight
  """
-
- import os, torch, gradio as gr
  from PIL import Image
  from transformers import AutoProcessor, AutoModelForVision2Seq
  from peft import PeftModel

- # ---------------------------------------------------------------------
- # 0 Env tweaks for Hugging Face Accelerate
- # ---------------------------------------------------------------------
- os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"   # safer streaming

- # ---------------------------------------------------------------------
- # 1 Config
- # ---------------------------------------------------------------------
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
  ADAPTER_REPO = "ColdSlim/PetBull-7B"
- ADAPTER_REV = "master"        # your model repo branch
- OFFLOAD_DIR = "offload"       # folder on disk for big tensors
-
- device = "cpu"                # force CPU
- dtype = torch.bfloat16        # lighter than FP16 on modern CPUs

- # ---------------------------------------------------------------------
- # 2 Load processor (tiny)
- # ---------------------------------------------------------------------
  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

- # ---------------------------------------------------------------------
- # 3 Load base model with memory-savvy flags
- # ---------------------------------------------------------------------
  base = AutoModelForVision2Seq.from_pretrained(
      BASE_MODEL,
      torch_dtype=dtype,
-     low_cpu_mem_usage=True,       # stream shards
-     device_map={"": "cpu"},       # everything on CPU
-     offload_folder=OFFLOAD_DIR,   # mmap big tensors to disk
-     trust_remote_code=True
  )

- # ---------------------------------------------------------------------
- # 4 Attach LoRA
- # ---------------------------------------------------------------------
  model = PeftModel.from_pretrained(
      base,
      ADAPTER_REPO,
      revision=ADAPTER_REV,
-     device_map={"": "cpu"}
  ).eval()

- # ---------------------------------------------------------------------
- # 5 Inference helper
- # ---------------------------------------------------------------------
- def generate_answer(
-     image,
-     question: str,
-     temperature: float = 0.7,
-     top_p: float = 0.95,
-     max_tokens: int = 256,        # keep small for RAM headroom
- ) -> str:
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

-     inputs = processor(text=[question], images=[image], return_tensors="pt")
      with torch.no_grad():
-         output_ids = model.generate(
-             **inputs, max_new_tokens=max_tokens,
-             temperature=temperature, top_p=top_p
-         )
-     return processor.batch_decode(output_ids, skip_special_tokens=True)[0]

- # ---------------------------------------------------------------------
- # 6 Gradio UI
- # ---------------------------------------------------------------------
- with gr.Blocks(title="PetBull-7B-VL (CPU)") as demo:
      gr.Markdown(
-         "## 🐾 PetBull-7B-VL – Ask a Vet\n"
          "Upload a photo and/or type a question."
      )
-
      with gr.Row():
          with gr.Column():
              img_in = gr.Image(type="pil", label="Pet photo (optional)")
              txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
              ask = gr.Button("Ask PetBull")
              temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
-             topp = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
              max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
          with gr.Column():
              answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
 
  """
+ PetBull-7B-VL demo – ZeroGPU-ready
  """
+ import os
+ import torch
+ import spaces          # <-- NEW: import spaces for ZeroGPU
+ import gradio as gr
  from PIL import Image
  from transformers import AutoProcessor, AutoModelForVision2Seq
  from peft import PeftModel

+ # 0. Environment tweaks for Accelerate (unchanged)
+ os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

+ # 1. Config
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
  ADAPTER_REPO = "ColdSlim/PetBull-7B"
+ ADAPTER_REV = "master"
+ OFFLOAD_DIR = "offload"

+ dtype = torch.float16  # <-- use float16 for GPU
+ # 2. Load processor
  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

+ # 3. Load base model on CPU; stream shards to save RAM
  base = AutoModelForVision2Seq.from_pretrained(
      BASE_MODEL,
      torch_dtype=dtype,
+     low_cpu_mem_usage=True,
+     device_map={"": "cpu"},
+     offload_folder=OFFLOAD_DIR,
+     trust_remote_code=True,
  )

+ # 4. Attach LoRA adapter on CPU
  model = PeftModel.from_pretrained(
      base,
      ADAPTER_REPO,
      revision=ADAPTER_REV,
+     device_map={"": "cpu"},
  ).eval()

+ # Keep track of whether the model has been moved to GPU
+ _model_on_gpu = False
+
+ # 5. Inference helper – run on GPU when called
+ @spaces.GPU   # <-- NEW: request GPU for this function
+ def generate_answer(image, question: str,
+                     temperature: float = 0.7,
+                     top_p: float = 0.95,
+                     max_tokens: int = 256):
+     global _model_on_gpu
+     # provide a placeholder image if none was uploaded
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

+     # move model to GPU once
+     if not _model_on_gpu:
+         model.to("cuda")
+         _model_on_gpu = True
+
+     # prepare inputs on GPU
+     inputs = processor(text=[question], images=[image],
+                        return_tensors="pt").to("cuda")
+
      with torch.no_grad():
+         output_ids = model.generate(**inputs,
+                                     max_new_tokens=max_tokens,
+                                     temperature=temperature,
+                                     top_p=top_p)
+     # decode on CPU
+     outputs = output_ids.to("cpu")
+     return processor.batch_decode(outputs, skip_special_tokens=True)[0]

+ # 6. Gradio UI (unchanged except for title)
+ with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
      gr.Markdown(
+         "## PetBull-7B-VL – Ask a Vet\n"
          "Upload a photo and/or type a question."
      )
      with gr.Row():
          with gr.Column():
              img_in = gr.Image(type="pil", label="Pet photo (optional)")
              txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
              ask = gr.Button("Ask PetBull")
              temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
+             topp = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
              max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
          with gr.Column():
              answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
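
For context on the pattern this commit adopts: on a ZeroGPU Space, CUDA is only available inside a function decorated with `@spaces.GPU`, which is why the model is loaded on CPU at import time and moved to the GPU inside the decorated helper. Below is a minimal, self-contained sketch of that pattern; the toy model and component names are illustrative assumptions, not code from this repo.

import spaces
import torch
import gradio as gr

# Toy stand-in for a real checkpoint: loaded on CPU at import time,
# because on ZeroGPU a CUDA device only exists inside @spaces.GPU calls.
model = torch.nn.Linear(4, 1)

@spaces.GPU  # request a GPU for this call; spaces.GPU(duration=120) extends the window
def predict(x: float) -> float:
    model.to("cuda")                                    # move weights onto the allocated GPU
    inp = torch.full((1, 4), float(x), device="cuda")   # build the input directly on the GPU
    with torch.no_grad():
        out = model(inp)
    return out.item()                                   # scalar result, back on CPU

demo = gr.Interface(fn=predict, inputs=gr.Number(), outputs=gr.Number())
demo.launch()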