ColdSlim committed
Commit 79837da · verified · 1 Parent(s): e0809e5

Update app.py

Files changed (1)
  1. app.py +41 -40
app.py CHANGED
@@ -3,41 +3,39 @@
  """
  import os
  import torch
- import spaces  # <-- NEW: import spaces for ZeroGPU
+ import spaces
  import gradio as gr
  from PIL import Image
- from transformers import AutoProcessor, AutoModelForVision2Seq, AutoConfig, AutoModelForCausalLM
+ from transformers import AutoProcessor, AutoModelForCausalLM
  from peft import PeftModel
- import transformers, accelerate, torch, numpy as np
+ import transformers, accelerate, numpy as np

  print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)

- # 0. Environment tweaks for Accelerate (unchanged)
+ # 0) Safer streaming for model shards
  os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

- # 1. Config
+ # 1) Config
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
  ADAPTER_REPO = "ColdSlim/PetBull-7B"
  ADAPTER_REV = "master"
  OFFLOAD_DIR = "offload"
+ DTYPE = torch.float16

- dtype = torch.float16  # <-- use float16 for GPU
- # 2. Load processor
+ # 2) Processor
  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

- cfg = AutoConfig.from_pretrained(BASE_MODEL, trust_remote_code=True)
- # 3. Load base model on CPU; stream shards to save RAM
+ # 3) Load base model ON CPU (no AutoConfig; rely on remote code)
  base = AutoModelForCausalLM.from_pretrained(
      BASE_MODEL,
-     config=cfg,
-     torch_dtype=dtype,
+     torch_dtype=DTYPE,
      low_cpu_mem_usage=True,
      device_map={"": "cpu"},
      offload_folder=OFFLOAD_DIR,
      trust_remote_code=True,
  )

- # 4. Attach LoRA adapter on CPU
+ # 4) Attach LoRA ON CPU
  model = PeftModel.from_pretrained(
      base,
      ADAPTER_REPO,
@@ -45,44 +43,45 @@ model = PeftModel.from_pretrained(
      device_map={"": "cpu"},
  ).eval()

- # Keep track of whether the model has been moved to GPU
- _model_on_gpu = False
+ _model_on_gpu = False  # track once-per-session transfer

- # 5. Inference helper run on GPU when called
- @spaces.GPU  # <-- NEW: request GPU for this function:contentReference[oaicite:3]{index=3}
- def generate_answer(image, question: str,
-                     temperature: float = 0.7,
-                     top_p: float = 0.95,
-                     max_tokens: int = 256):
+ # 5) Inference (request GPU only for this function)
+ @spaces.GPU(duration=120)
+ def generate_answer(
+     image,
+     question: str,
+     temperature: float = 0.7,
+     top_p: float = 0.95,
+     max_tokens: int = 256,
+ ) -> str:
      global _model_on_gpu
-     # provide a placeholder image if none was uploaded
+
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

-     # move model to GPU once
+     # Move model to GPU once (inside GPU-decorated function)
      if not _model_on_gpu:
          model.to("cuda")
          _model_on_gpu = True

-     # prepare inputs on GPU
-     inputs = processor(text=[question], images=[image],
-                        return_tensors="pt").to("cuda")
+     # Prepare inputs on GPU
+     inputs = processor(text=[question], images=[image], return_tensors="pt")
+     inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}

      with torch.no_grad():
-         output_ids = model.generate(**inputs,
-                                     max_new_tokens=max_tokens,
-                                     temperature=temperature,
-                                     top_p=top_p)
-     # decode on CPU
+         output_ids = model.generate(
+             **inputs,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+         )
+
      outputs = output_ids.to("cpu")
      return processor.batch_decode(outputs, skip_special_tokens=True)[0]

- # 6. Gradio UI (unchanged except for title)
+ # 6) UI
  with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
-     gr.Markdown(
-         "## PetBull‑7B‑VL – Ask a Vet\n"
-         "Upload a photo and/or type a question."
-     )
+     gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
      with gr.Row():
          with gr.Column():
              img_in = gr.Image(type="pil", label="Pet photo (optional)")
@@ -92,10 +91,12 @@ with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
              topp = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
              max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
          with gr.Column():
-             answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
+             answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

- ask.click(generate_answer,
-           inputs=[img_in, txt_in, temp, topp, max_tok],
-           outputs=answer)
+     ask.click(
+         generate_answer,
+         inputs=[img_in, txt_in, temp, topp, max_tok],
+         outputs=answer,
+     )

- demo.queue().launch()
+ demo.queue().launch()
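
For context on the pattern this commit adopts: on a ZeroGPU Space the `spaces` package grants a GPU only while a function decorated with `@spaces.GPU` is running, so the model is loaded on CPU at import time and moved to CUDA once, inside the decorated call. Below is a minimal standalone sketch of that flow, not the app's real code; the tiny linear layer and the 60-second duration are illustrative placeholders.

# Hedged ZeroGPU sketch (illustrative names; not part of app.py)
import spaces                      # ZeroGPU helper available on Hugging Face Spaces
import torch
import gradio as gr

# Weights stay on CPU at import time; ZeroGPU attaches a GPU only while a
# decorated function executes.
layer = torch.nn.Linear(16, 1)

@spaces.GPU(duration=60)           # illustrative cap; app.py requests duration=120
def score(x: float) -> float:
    layer.to("cuda")                                    # move weights once the GPU exists
    inp = torch.full((1, 16), float(x), device="cuda")  # build inputs directly on the GPU
    with torch.no_grad():
        return layer(inp).item()

demo = gr.Interface(fn=score, inputs="number", outputs="number")
demo.queue().launch()

The `duration` argument caps how long the GPU is held per call; the updated app.py requests 120 seconds, which leaves headroom for the one-time model transfer plus generation.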
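
The new step 5 passes the raw question string straight to the processor. For reference, the Qwen2.5-VL model card first routes the text through the processor's chat template; a hedged sketch of that flow follows (the message structure and argument names track the public model card for recent transformers releases and may differ between versions; the file path and question are placeholders).

# Hedged sketch: chat-template input prep for Qwen2.5-VL (not part of this commit)
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)

image = Image.open("pet.jpg")                      # placeholder image path
question = "What could cause this skin irritation?"

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},                     # placeholder token; pixels are passed to the processor below
            {"type": "text", "text": question},
        ],
    }
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt")
# From here the tensors can be moved to "cuda" and fed to model.generate(...),
# as generate_answer() does in app.py.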