salma-remyx committed on
Commit a9f74e5 · 1 Parent(s): fbb4e74

initial commit

Files changed (4)
  1. README.md +3 -3
  2. app.py +225 -0
  3. examples/warehouse_rgb.jpg +0 -0
  4. requirements.txt +9 -0
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: SpaceQwen2.5 VL 3B Instruct
- emoji: 🔥
- colorFrom: yellow
- colorTo: gray
+ emoji: 👑
+ colorFrom: indigo
+ colorTo: red
  sdk: gradio
  sdk_version: 5.15.0
  app_file: app.py
app.py ADDED
@@ -0,0 +1,225 @@
+ import spaces
+ import torch
+ import time
+ import gradio as gr
+ from PIL import Image
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+ from typing import List
+
+ MODEL_ID = "remyxai/SpaceQwen2.5-VL-3B-Instruct"
+
+ @spaces.GPU
+ def load_model():
+     print("Loading model and processor...")
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         MODEL_ID,
+         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+     ).to(device)
+     processor = AutoProcessor.from_pretrained(MODEL_ID)
+     return model, processor
+
+ model, processor = load_model()
+
+ def process_image(image_path_or_obj):
+     """Loads, resizes, and preprocesses an image path or Pillow Image."""
+     if isinstance(image_path_or_obj, str):
+         # Path on disk or from history
+         image = Image.open(image_path_or_obj).convert("RGB")
+     elif isinstance(image_path_or_obj, Image.Image):
+         image = image_path_or_obj.convert("RGB")
+     else:
+         raise ValueError("process_image expects a file path (str) or PIL.Image")
+
+     max_width = 512
+     if image.width > max_width:
+         aspect_ratio = image.height / image.width
+         new_height = int(max_width * aspect_ratio)
+         image = image.resize((max_width, new_height), Image.Resampling.LANCZOS)
+         print(f"Resized image to: {max_width}x{new_height}")
+     return image
+
+ def get_latest_image(history):
+     """
+     Look from the end to find the last user-uploaded image (stored as (file_path,) ).
+     Return None if not found.
+     """
+     for user_msg, _assistant_msg in reversed(history):
+         if isinstance(user_msg, tuple) and len(user_msg) > 0:
+             return user_msg[0]
+     return None
+
+ def only_assistant_text(full_text: str) -> str:
+     """
+     Helper to strip out any lines containing 'system', 'user', etc.,
+     and return only the final assistant answer.
+     Adjust this parsing if your model's output format differs.
+     """
+     # Example output might look like:
+     # system
+     # ...
+     # user
+     # ...
+     # assistant
+     # The final answer
+     #
+     # We'll just split on 'assistant' and return everything after it.
+     if "assistant" in full_text:
+         parts = full_text.split("assistant", 1)
+         result = parts[-1].strip()
+         # Remove any leading punctuation (like a colon)
+         result = result.lstrip(":").strip()
+         return result
+     return full_text.strip()
+
+ def run_inference(image, prompt):
+     """Runs Qwen2.5-VL inference on a single image and text prompt."""
+     system_msg = (
+         "You are a Vision Language Model specialized in interpreting visual data from images. "
+         "Your task is to analyze the provided image and respond to queries with concise answers."
+     )
+     conversation = [
+         {
+             "role": "system",
+             "content": [{"type": "text", "text": system_msg}],
+         },
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": prompt},
+             ],
+         },
+     ]
+     text_input = processor.apply_chat_template(
+         conversation, tokenize=False, add_generation_prompt=True
+     )
+
+     inputs = processor(text=[text_input], images=[image], return_tensors="pt").to(model.device)
+     generated_ids = model.generate(**inputs, max_new_tokens=1024)
+     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+     # Parse out only the final assistant text
+     return only_assistant_text(output_text)
+
+ def add_message(history, user_input):
+     """
+     Step 1 (triggered by user's 'Submit' or 'Send'):
+       - Save new text or images into `history`.
+       - The Chatbot display uses pairs: [user_text_or_image, assistant_reply].
+     """
+     if not isinstance(history, list):
+         history = []
+
+     files = user_input.get("files", [])
+     text = user_input.get("text", "")
+
+     # Store images
+     for f in files:
+         # Each image is stored as `[(file_path,), None]`
+         history.append([(f,), None])
+
+     # Store text
+     if text:
+         history.append([text, None])
+
+     return history, gr.MultimodalTextbox(value=None)
+
+ def inference_interface(history):
+     """
+     Step 2: Use the most recent text + the most recent image to run Qwen2.5-VL.
+     Instead of adding another entry, we fill the assistant's answer into
+     the last user text entry.
+     """
+     if not history:
+         return history, gr.MultimodalTextbox(value=None)
+
+     # 1) Get the user's most recent text
+     user_text = ""
+     # We'll search from the end for the first str we find
+     for idx in range(len(history) - 1, -1, -1):
+         user_msg, assistant_msg = history[idx]
+         if isinstance(user_msg, str):
+             user_text = user_msg
+             # We'll also keep track of this index so we can fill in the assistant reply
+             user_idx = idx
+             break
+     else:
+         # No user text found
+         print("No user text found in history. Skipping inference.")
+         return history, gr.MultimodalTextbox(value=None)
+
+     # 2) Get the latest image from the entire conversation
+     latest_image = get_latest_image(history)
+     if not latest_image:
+         # No image found => can't run the model
+         print("No image found in history. Skipping inference.")
+         return history, gr.MultimodalTextbox(value=None)
+
+     # 3) Process the image
+     pil_image = process_image(latest_image)
+
+     # 4) Run inference
+     assistant_reply = run_inference(pil_image, user_text)
+
+     # 5) Fill that assistant reply back into the last user text entry
+     history[user_idx][1] = assistant_reply
+     return history, gr.MultimodalTextbox(value=None)
+
+ def build_demo():
+     with gr.Blocks() as demo:
+         gr.Markdown("# SpaceQwen2.5-VL Image Prompt Chatbot")
+
+         chatbot = gr.Chatbot([], line_breaks=True)
+         chat_input = gr.MultimodalTextbox(
+             interactive=True,
+             file_types=["image"],
+             placeholder="Enter text or upload an image (or both).",
+             show_label=True
+         )
+
+         # When the user presses Enter in the MultimodalTextbox:
+         submit_event = chat_input.submit(
+             fn=add_message,  # Step 1: store user data
+             inputs=[chatbot, chat_input],
+             outputs=[chatbot, chat_input]
+         )
+         # After storing, run inference
+         submit_event.then(
+             fn=inference_interface,  # Step 2: run Qwen2.5-VL
+             inputs=[chatbot],
+             outputs=[chatbot, chat_input]
+         )
+
+         # Same logic for a "Send" button
+         with gr.Row():
+             send_button = gr.Button("Send")
+             clear_button = gr.ClearButton([chatbot, chat_input])
+
+         send_click = send_button.click(
+             fn=add_message,
+             inputs=[chatbot, chat_input],
+             outputs=[chatbot, chat_input]
+         )
+         send_click.then(
+             fn=inference_interface,
+             inputs=[chatbot],
+             outputs=[chatbot, chat_input]
+         )
+
+         # Example
+         gr.Examples(
+             examples=[
+                 {
+                     "text": "Give me the height of the man in the red hat in feet.",
+                     "files": ["./examples/warehouse_rgb.jpg"]
+                 }
+             ],
+             inputs=[chat_input],
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = build_demo()
+     demo.launch(share=True)
+
examples/warehouse_rgb.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ torch
+ transformers>=4.41.0
+ Pillow
+ gradio==5.15.0
+ spaces
+ multiprocess
+ requests
+ accelerate>=0.26.0
+ git+https://github.com/huggingface/transformers.git
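
For local testing with the dependencies above, here is a minimal sketch of the same inference path that app.py's load_model() and run_inference() use. It is not part of the commit: the image path and prompt simply reuse the bundled example, and the generation settings are illustrative.

import torch
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

# Same checkpoint as app.py
model_id = "remyxai/SpaceQwen2.5-VL-3B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Reuse the bundled example image and prompt
image = Image.open("examples/warehouse_rgb.jpg").convert("RGB")
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Give me the height of the man in the red hat in feet."},
        ],
    },
]

# Build the chat prompt, run generation, and print the decoded output
text_input = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text_input], images=[image], return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=256)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])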