ColdSlim committed
Commit c8b3c1b · verified · 1 Parent(s): f257dd7

Update app.py

Files changed (1)
  1. app.py +50 -61
app.py CHANGED
@@ -1,101 +1,90 @@
  """
- PetBull-7B-VL demo – CPU-only, 16 GB-friendly
- --------------------------------------------
-
- • Base model : Qwen/Qwen2.5-VL-7B-Instruct
- • LoRA adapter: ColdSlim/PetBull-7B (master branch)
-
- This script:
- ✓ loads in bfloat16 (saves ~25 % RAM vs FP16)
- ✓ streams weights to avoid peak memory spikes
- ✓ off-loads large tensors to disk when RAM is tight
  """
-
- import os, torch, gradio as gr
  from PIL import Image
  from transformers import AutoProcessor, AutoModelForVision2Seq
  from peft import PeftModel

- # ---------------------------------------------------------------------
- # 0 Env tweaks for Hugging Face Accelerate
- # ---------------------------------------------------------------------
- os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"   # safer streaming

- # ---------------------------------------------------------------------
- # 1 Config
- # ---------------------------------------------------------------------
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
  ADAPTER_REPO = "ColdSlim/PetBull-7B"
- ADAPTER_REV = "master"        # your model repo branch
- OFFLOAD_DIR = "offload"       # folder on disk for big tensors
-
- device = "cpu"                # force CPU
- dtype = torch.bfloat16        # lighter than FP16 on modern CPUs

- # ---------------------------------------------------------------------
- # 2 Load processor (tiny)
- # ---------------------------------------------------------------------
  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

- # ---------------------------------------------------------------------
- # 3 Load base model with memory-savvy flags
- # ---------------------------------------------------------------------
  base = AutoModelForVision2Seq.from_pretrained(
      BASE_MODEL,
      torch_dtype=dtype,
-     low_cpu_mem_usage=True,       # stream shards
-     device_map={"": "cpu"},       # everything on CPU
-     offload_folder=OFFLOAD_DIR,   # mmap big tensors to disk
-     trust_remote_code=True
  )

- # ---------------------------------------------------------------------
- # 4 Attach LoRA
- # ---------------------------------------------------------------------
  model = PeftModel.from_pretrained(
      base,
      ADAPTER_REPO,
      revision=ADAPTER_REV,
-     device_map={"": "cpu"}
  ).eval()

- # ---------------------------------------------------------------------
- # 5 Inference helper
- # ---------------------------------------------------------------------
- def generate_answer(
-     image,
-     question: str,
-     temperature: float = 0.7,
-     top_p: float = 0.95,
-     max_tokens: int = 256,        # keep small for RAM headroom
- ) -> str:
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

-     inputs = processor(text=[question], images=[image], return_tensors="pt")
      with torch.no_grad():
-         output_ids = model.generate(
-             **inputs, max_new_tokens=max_tokens,
-             temperature=temperature, top_p=top_p
-         )
-     return processor.batch_decode(output_ids, skip_special_tokens=True)[0]

- # ---------------------------------------------------------------------
- # 6 Gradio UI
- # ---------------------------------------------------------------------
- with gr.Blocks(title="PetBull-7B-VL (CPU)") as demo:
      gr.Markdown(
-         "## 🐾 PetBull-7B-VL – Ask a Vet\n"
          "Upload a photo and/or type a question."
      )
-
      with gr.Row():
          with gr.Column():
              img_in = gr.Image(type="pil", label="Pet photo (optional)")
              txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
              ask = gr.Button("Ask PetBull")
              temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
-             topp = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
              max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
          with gr.Column():
              answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
 
  """
+ PetBull-7B-VL demo – ZeroGPU-ready
  """
+ import os
+ import torch
+ import spaces          # <-- NEW: import spaces for ZeroGPU
+ import gradio as gr
  from PIL import Image
  from transformers import AutoProcessor, AutoModelForVision2Seq
  from peft import PeftModel

+ # 0. Environment tweaks for Accelerate (unchanged)
+ os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

+ # 1. Config
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
  ADAPTER_REPO = "ColdSlim/PetBull-7B"
+ ADAPTER_REV = "master"
+ OFFLOAD_DIR = "offload"

+ dtype = torch.float16  # <-- use float16 for GPU
+ # 2. Load processor
  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

+ # 3. Load base model on CPU; stream shards to save RAM
  base = AutoModelForVision2Seq.from_pretrained(
      BASE_MODEL,
      torch_dtype=dtype,
+     low_cpu_mem_usage=True,
+     device_map={"": "cpu"},
+     offload_folder=OFFLOAD_DIR,
+     trust_remote_code=True,
  )

+ # 4. Attach LoRA adapter on CPU
  model = PeftModel.from_pretrained(
      base,
      ADAPTER_REPO,
      revision=ADAPTER_REV,
+     device_map={"": "cpu"},
  ).eval()

+ # Keep track of whether the model has been moved to GPU
+ _model_on_gpu = False
+
+ # 5. Inference helper – run on GPU when called
+ @spaces.GPU   # <-- NEW: request GPU for this function
+ def generate_answer(image, question: str,
+                     temperature: float = 0.7,
+                     top_p: float = 0.95,
+                     max_tokens: int = 256):
+     global _model_on_gpu
+     # provide a placeholder image if none was uploaded
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

+     # move model to GPU once
+     if not _model_on_gpu:
+         model.to("cuda")
+         _model_on_gpu = True
+
+     # prepare inputs on GPU
+     inputs = processor(text=[question], images=[image],
+                        return_tensors="pt").to("cuda")
+
      with torch.no_grad():
+         output_ids = model.generate(**inputs,
+                                     max_new_tokens=max_tokens,
+                                     temperature=temperature,
+                                     top_p=top_p)
+     # decode on CPU
+     outputs = output_ids.to("cpu")
+     return processor.batch_decode(outputs, skip_special_tokens=True)[0]

+ # 6. Gradio UI (unchanged except for title)
+ with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
      gr.Markdown(
+         "## PetBull-7B-VL – Ask a Vet\n"
          "Upload a photo and/or type a question."
      )
      with gr.Row():
          with gr.Column():
              img_in = gr.Image(type="pil", label="Pet photo (optional)")
              txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
              ask = gr.Button("Ask PetBull")
              temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
+             topp = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
              max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
          with gr.Column():
              answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
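
For context on the pattern this commit adopts: on a ZeroGPU Space, CUDA is only available inside a function decorated with `@spaces.GPU`, which is why the model is loaded on CPU at import time and moved to the GPU inside the decorated helper. Below is a minimal, self-contained sketch of that pattern; the toy model and component names are illustrative assumptions, not code from this repo.

import spaces
import torch
import gradio as gr

# Toy stand-in for a real checkpoint: loaded on CPU at import time,
# because on ZeroGPU a CUDA device only exists inside @spaces.GPU calls.
model = torch.nn.Linear(4, 1)

@spaces.GPU  # request a GPU for this call; spaces.GPU(duration=120) extends the window
def predict(x: float) -> float:
    model.to("cuda")                                    # move weights onto the allocated GPU
    inp = torch.full((1, 4), float(x), device="cuda")   # build the input directly on the GPU
    with torch.no_grad():
        out = model(inp)
    return out.item()                                   # scalar result, back on CPU

demo = gr.Interface(fn=predict, inputs=gr.Number(), outputs=gr.Number())
demo.launch()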