ColdSlim committed · verified
Commit c818037 · 1 Parent(s): 79837da

Update app.py

Files changed (1)
  1. app.py +45 -35
app.py CHANGED
@@ -1,32 +1,31 @@
  """
- PetBull‑7B‑VL demo – ZeroGPU‑ready
+ PetBull‑7B‑VL demo – ZeroGPU‑ready (Qwen2.5‑VL API)
  """
  import os
- import torch
  import spaces
+ import torch
  import gradio as gr
  from PIL import Image
- from transformers import AutoProcessor, AutoModelForCausalLM
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
  from peft import PeftModel
+ from qwen_vl_utils import process_vision_info  # pip install qwen-vl-utils
  import transformers, accelerate, numpy as np

  print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)
-
- # 0) Safer streaming for model shards
  os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

- # 1) Config
+ # ---- Config ----
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
- ADAPTER_REPO = "ColdSlim/PetBull-7B"
+ ADAPTER_REPO = "ColdSlim/PetBull-7B"  # your LoRA
  ADAPTER_REV = "master"
  OFFLOAD_DIR = "offload"
  DTYPE = torch.float16

- # 2) Processor
+ # ---- Processor (no GPU) ----
  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

- # 3) Load base model ON CPU (no AutoConfig; rely on remote code)
- base = AutoModelForCausalLM.from_pretrained(
+ # ---- Base model ON CPU (do NOT touch CUDA here) ----
+ base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
      BASE_MODEL,
      torch_dtype=DTYPE,
      low_cpu_mem_usage=True,
@@ -35,7 +34,7 @@ base = AutoModelForCausalLM.from_pretrained(
      trust_remote_code=True,
  )

- # 4) Attach LoRA ON CPU
+ # ---- Attach LoRA ON CPU ----
  model = PeftModel.from_pretrained(
      base,
      ADAPTER_REPO,
@@ -43,44 +42,59 @@ model = PeftModel.from_pretrained(
      device_map={"": "cpu"},
  ).eval()

- _model_on_gpu = False  # track once-per-session transfer
+ _model_on_gpu = False  # once-per-session move

- # 5) Inference (request GPU only for this function)
+ # ---- Inference on GPU (ZeroGPU pattern) ----
  @spaces.GPU(duration=120)
- def generate_answer(
-     image,
-     question: str,
-     temperature: float = 0.7,
-     top_p: float = 0.95,
-     max_tokens: int = 256,
- ) -> str:
+ def generate_answer(image, question, temperature=0.7, top_p=0.95, max_tokens=256):
+     """
+     Uses Qwen2.5-VL chat template + qwen_vl_utils to prepare image+text, then generate.
+     """
      global _model_on_gpu
-
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

-     # Move model to GPU once (inside GPU-decorated function)
      if not _model_on_gpu:
          model.to("cuda")
          _model_on_gpu = True

-     # Prepare inputs on GPU
-     inputs = processor(text=[question], images=[image], return_tensors="pt")
-     inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}
+     # Build chat messages in Qwen format
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image},
+             {"type": "text", "text": question or "Describe this image."},
+         ],
+     }]
+
+     # Processor helpers
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     image_inputs, video_inputs = process_vision_info(messages)
+
+     # Pack tensors on GPU
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = {k: (v.to("cuda") if hasattr(v, "to") else v) for k, v in inputs.items()}

      with torch.no_grad():
-         output_ids = model.generate(
+         out = model.generate(
              **inputs,
              max_new_tokens=max_tokens,
              temperature=temperature,
              top_p=top_p,
          )

-     outputs = output_ids.to("cpu")
-     return processor.batch_decode(outputs, skip_special_tokens=True)[0]
+     # Trim prompt tokens before decode (Qwen style)
+     trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out)]
+     return processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

- # 6) UI
- with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
+ # ---- UI ----
+ with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU, Qwen2.5‑VL)") as demo:
      gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
      with gr.Row():
          with gr.Column():
@@ -93,10 +107,6 @@ with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
          with gr.Column():
              answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

-     ask.click(
-         generate_answer,
-         inputs=[img_in, txt_in, temp, topp, max_tok],
-         outputs=answer,
-     )
+     ask.click(generate_answer, inputs=[img_in, txt_in, temp, topp, max_tok], outputs=answer)

  demo.queue().launch()
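The input-packing path this commit switches to (chat template → process_vision_info → processor) can be smoke-tested on CPU without loading the 7B weights. Below is a minimal sketch under that assumption; it only reuses calls that appear in the new app.py, and the blank 224×224 image mirrors the fallback generate_answer already uses.

# CPU-only check of the Qwen2.5-VL input packing used above (no model, no GPU).
from PIL import Image
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info  # pip install qwen-vl-utils

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": Image.new("RGB", (224, 224), color="white")},
        {"type": "text", "text": "Describe this image."},
    ],
}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                   padding=True, return_tensors="pt")

# input_ids and pixel_values should both be present if the packing worked.
print({k: tuple(v.shape) for k, v in inputs.items() if hasattr(v, "shape")})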
 
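Once the Space is redeployed, the one-line ask.click wiring exposes generate_answer over the Gradio API, so the handler can also be exercised remotely with gradio_client. A minimal sketch follows; the Space id, the local image filename, and the api_name are assumptions (this commit does not pin them), so check view_api() for the real endpoint and argument order.

# Remote smoke test of the deployed Space (assumed Space id and endpoint name).
from gradio_client import Client, handle_file  # pip install gradio_client

client = Client("ColdSlim/PetBull-7B-VL")   # hypothetical Space id
client.view_api()                           # prints the real endpoint names and signatures

result = client.predict(
    handle_file("pet_photo.jpg"),   # img_in (hypothetical local file)
    "What could cause this rash?",  # txt_in
    0.7,                            # temp
    0.95,                           # topp
    256,                            # max_tok
    api_name="/generate_answer",    # assumed; use the name view_api() reports
)
print(result)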