Spaces:

ColdSlim
/

DermalCare

Sleeping

App Files Files Community

ColdSlim committed 14 days ago

Commit

79837da

verified ·

1 Parent(s): e0809e5

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -40

app.py CHANGED Viewed

@@ -3,41 +3,39 @@
 """
 import os
 import torch
-import spaces                # <-- NEW: import spaces for ZeroGPU
 import gradio as gr
 from PIL import Image
-from transformers import AutoProcessor, AutoModelForVision2Seq, AutoConfig, AutoModelForCausalLM
 from peft import PeftModel
-import transformers, accelerate, torch, numpy as np
 print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)
-# 0. Environment tweaks for Accelerate (unchanged)
 os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"
-# 1. Config
 BASE_MODEL   = "Qwen/Qwen2.5-VL-7B-Instruct"
 ADAPTER_REPO = "ColdSlim/PetBull-7B"
 ADAPTER_REV  = "master"
 OFFLOAD_DIR  = "offload"
-dtype = torch.float16        # <-- use float16 for GPU
-# 2. Load processor
 processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
-cfg = AutoConfig.from_pretrained(BASE_MODEL, trust_remote_code=True)
-# 3. Load base model on CPU; stream shards to save RAM
 base = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
-    config=cfg,
-    torch_dtype=dtype,
     low_cpu_mem_usage=True,
     device_map={"": "cpu"},
     offload_folder=OFFLOAD_DIR,
     trust_remote_code=True,
 )
-# 4. Attach LoRA adapter on CPU
 model = PeftModel.from_pretrained(
     base,
     ADAPTER_REPO,
@@ -45,44 +43,45 @@ model = PeftModel.from_pretrained(
     device_map={"": "cpu"},
 ).eval()
-# Keep track of whether the model has been moved to GPU
-_model_on_gpu = False
-# 5. Inference helper – run on GPU when called
-@spaces.GPU                   # <-- NEW: request GPU for this function
-def generate_answer(image, question: str,
-                    temperature: float = 0.7,
-                    top_p: float = 0.95,
-                    max_tokens: int = 256):
     global _model_on_gpu
-    # provide a placeholder image if none was uploaded
     if image is None:
         image = Image.new("RGB", (224, 224), color="white")
-    # move model to GPU once
     if not _model_on_gpu:
         model.to("cuda")
         _model_on_gpu = True
-    # prepare inputs on GPU
-    inputs = processor(text=[question], images=[image],
-                       return_tensors="pt").to("cuda")
     with torch.no_grad():
-        output_ids = model.generate(**inputs,
-                                    max_new_tokens=max_tokens,
-                                    temperature=temperature,
-                                    top_p=top_p)
-    # decode on CPU
     outputs = output_ids.to("cpu")
     return processor.batch_decode(outputs, skip_special_tokens=True)[0]
-# 6. Gradio UI (unchanged except for title)
 with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
-    gr.Markdown(
-        "## PetBull‑7B‑VL – Ask a Vet\n"
-        "Upload a photo and/or type a question."
-    )
     with gr.Row():
         with gr.Column():
             img_in  = gr.Image(type="pil", label="Pet photo (optional)")
@@ -92,10 +91,12 @@ with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
             topp    = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
             max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
         with gr.Column():
-            answer  = gr.Textbox(lines=12, label="Assistant", interactive=False)
-    ask.click(generate_answer,
-              inputs=[img_in, txt_in, temp, topp, max_tok],
-              outputs=answer)
-demo.queue().launch()

 """
 import os
 import torch
+import spaces
 import gradio as gr
 from PIL import Image
+from transformers import AutoProcessor, AutoModelForCausalLM
 from peft import PeftModel
+import transformers, accelerate, numpy as np
 print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)
+# 0) Safer streaming for model shards
 os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"
+# 1) Config
 BASE_MODEL   = "Qwen/Qwen2.5-VL-7B-Instruct"
 ADAPTER_REPO = "ColdSlim/PetBull-7B"
 ADAPTER_REV  = "master"
 OFFLOAD_DIR  = "offload"
+DTYPE        = torch.float16
+# 2) Processor
 processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
+# 3) Load base model ON CPU (no AutoConfig; rely on remote code)
 base = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
+    torch_dtype=DTYPE,
     low_cpu_mem_usage=True,
     device_map={"": "cpu"},
     offload_folder=OFFLOAD_DIR,
     trust_remote_code=True,
 )
+# 4) Attach LoRA ON CPU
 model = PeftModel.from_pretrained(
     base,
     ADAPTER_REPO,
     device_map={"": "cpu"},
 ).eval()
+_model_on_gpu = False  # track once-per-session transfer
+# 5) Inference (request GPU only for this function)
+@spaces.GPU(duration=120)
+def generate_answer(
+    image,
+    question: str,
+    temperature: float = 0.7,
+    top_p: float = 0.95,
+    max_tokens: int = 256,
+) -> str:
     global _model_on_gpu
     if image is None:
         image = Image.new("RGB", (224, 224), color="white")
+    # Move model to GPU once (inside GPU-decorated function)
     if not _model_on_gpu:
         model.to("cuda")
         _model_on_gpu = True
+    # Prepare inputs on GPU
+    inputs = processor(text=[question], images=[image], return_tensors="pt")
+    inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}
     with torch.no_grad():
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+        )
     outputs = output_ids.to("cpu")
     return processor.batch_decode(outputs, skip_special_tokens=True)[0]
+# 6) UI
 with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
+    gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
     with gr.Row():
         with gr.Column():
             img_in  = gr.Image(type="pil", label="Pet photo (optional)")
             topp    = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
             max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
         with gr.Column():
+            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
+    ask.click(
+        generate_answer,
+        inputs=[img_in, txt_in, temp, topp, max_tok],
+        outputs=answer,
+    )
+demo.queue().launch()