qnguyen3 committed (verified)
Commit: 50bd3d6
Parent: b6c6d0c

Update app.py

Files changed (1)
  1. app.py +87 -34
app.py CHANGED
@@ -68,14 +68,8 @@ def bot_streaming(message, history):
 
     # Get image path
     image = None
-    if message["files"]:
+    if "files" in message and message["files"]:
         image = message["files"][-1]["path"]
-    else:
-        for i, hist in enumerate(history):
-            if type(hist[0])==tuple:
-                image = hist[0][0]
-                image_turn = i
-                break
 
     # Check if image is available
     if image is None:
@@ -83,22 +77,16 @@ def bot_streaming(message, history):
 
     # Prepare conversation messages
     messages = []
-    if len(history) > 0 and image is not None:
-        messages.append({"role": "user", "content": f'<image>\n{history[1][0]}'})
-        messages.append({"role": "assistant", "content": history[1][1] })
-        for human, assistant in history[2:]:
-            messages.append({"role": "user", "content": human })
-            messages.append({"role": "assistant", "content": assistant })
-        messages.append({"role": "user", "content": message['text']})
-    elif len(history) > 0 and image is None:
+    if len(history) > 0:
         for human, assistant in history:
-            messages.append({"role": "user", "content": human })
-            messages.append({"role": "assistant", "content": assistant })
-        messages.append({"role": "user", "content": message['text']})
-    elif len(history) == 0 and image is not None:
+            # Skip None responses (which can happen during streaming)
+            if assistant is not None:
+                messages.append({"role": "user", "content": human})
+                messages.append({"role": "assistant", "content": assistant})
+        # Add the current message
+        messages.append({"role": "user", "content": f"<image>\n{message['text']}" if len(messages) == 0 else message['text']})
+    else:
         messages.append({"role": "user", "content": f"<image>\n{message['text']}"})
-    elif len(history) == 0 and image is None:
-        messages.append({"role": "user", "content": message['text'] })
 
     # Process image
     image = Image.open(image).convert("RGB")
@@ -108,8 +96,24 @@ def bot_streaming(message, history):
         messages,
         tokenize=False,
         add_generation_prompt=True)
-    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
-    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+
+    # Handle image embedding in text
+    if '<image>' in text:
+        text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+    else:
+        # If no <image> tag was added (possible in some chat templates), add it manually
+        input_ids = tokenizer(text).input_ids
+        # Find the position to insert the image token
+        # For simplicity, insert after the user message start
+        user_start_pos = 0
+        for i, token in enumerate(input_ids):
+            if tokenizer.decode([token]) == '<|im_start|>user':
+                user_start_pos = i + 2  # +2 to get past the tag
+                break
+        # Insert image token
+        input_ids = input_ids[:user_start_pos] + [-200] + input_ids[user_start_pos:]
+        input_ids = torch.tensor([input_ids], dtype=torch.long)
 
     # Prepare stopping criteria
     stop_str = '<|im_end|>'
@@ -140,16 +144,65 @@ def bot_streaming(message, history):
         yield generated_text_without_prompt
 
 
-demo = gr.ChatInterface(
-    fn=bot_streaming,
-    title="🚀nanoLLaVA-1.5",
-    examples=[
-        {"text": "Who is this guy?", "files":["./demo_1.jpg"]},
-        {"text": "What does the text say?", "files":["./demo_2.jpeg"]}
-    ],
-    description="Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
-    stop_btn="Stop Generation",
-    multimodal=True
-)
+# Create a gradio Blocks interface instead of ChatInterface
+# This avoids the schema validation issues
+with gr.Blocks(title="🚀nanoLLaVA-1.5") as demo:
+    gr.Markdown("## 🚀nanoLLaVA-1.5")
+    gr.Markdown("Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.")
+
+    chatbot = gr.Chatbot(height=500)
+    with gr.Row():
+        with gr.Column(scale=0.8):
+            msg = gr.Textbox(
+                show_label=False,
+                placeholder="Enter text and upload an image",
+                container=False
+            )
+        with gr.Column(scale=0.2):
+            btn = gr.Button("Submit")
+            stop_btn = gr.Button("Stop Generation")
+
+    upload_btn = gr.UploadButton("Upload Image", file_types=["image"])
+    current_img = gr.State(None)
+
+    # Example images
+    examples = gr.Examples(
+        examples=[
+            ["Who is this guy?", "./demo_1.jpg"],
+            ["What does the text say?", "./demo_2.jpeg"]
+        ],
+        inputs=[msg, upload_btn]
+    )
+
+    def upload_image(image):
+        return image
+
+    def add_text(history, text, image):
+        if image is None and (not history or type(history[0][0]) != tuple):
+            return history + [[text, "Please upload an image first."]]
+        return history + [[text, None]]
+
+    def bot_response(history, image):
+        message = {"text": history[-1][0], "files": [{"path": image}] if image else []}
+        history_format = history[:-1]  # All except the last message
+
+        response = ""
+        for chunk in bot_streaming(message, history_format):
+            response = chunk
+            history[-1][1] = response
+            yield history
+
+    upload_btn.upload(upload_image, upload_btn, current_img)
+
+    msg.submit(add_text, [chatbot, msg, current_img], chatbot).then(
+        bot_response, [chatbot, current_img], chatbot
+    )
+
+    btn.click(add_text, [chatbot, msg, current_img], chatbot).then(
+        bot_response, [chatbot, current_img], chatbot
+    )
+
+    stop_btn.click(None, None, None, cancels=[bot_response])
 
+# Launch the app with queuing
 demo.queue().launch()
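
Note on the image-token handling in the third hunk: the rendered prompt is split on the literal <image> marker and a sentinel id of -200 is spliced between the two tokenized halves; at generation time the model replaces that slot with the projected SigLIP image features (standard LLaVA-style wiring). The sketch below is a minimal, self-contained illustration of that splicing step only. The toy tokenizer and the sample prompt are stand-ins invented for illustration; the app itself tokenizes the chat-template output with the nanoLLaVA tokenizer via tokenizer(chunk).input_ids.

import torch

IMAGE_TOKEN_INDEX = -200  # sentinel id the app uses for the image slot

def toy_tokenize(chunk):
    # Stand-in tokenizer for illustration only; the real app calls
    # tokenizer(chunk).input_ids on the Qwen-based nanoLLaVA tokenizer.
    return [abs(hash(tok)) % 1000 for tok in chunk.split()]

def build_input_ids(prompt):
    # Mirrors the committed logic: split the rendered prompt on '<image>',
    # tokenize each half, and splice the sentinel id where the image
    # features will later be inserted by the model.
    chunks = [toy_tokenize(c) for c in prompt.split('<image>')]
    ids = chunks[0] + [IMAGE_TOKEN_INDEX] + chunks[1]
    return torch.tensor(ids, dtype=torch.long).unsqueeze(0)  # add batch dim

prompt = "<|im_start|>user\n<image>\nWhat does the text say?<|im_end|>\n<|im_start|>assistant\n"
input_ids = build_input_ids(prompt)
print(input_ids.shape)                              # torch.Size([1, N])
print((input_ids == IMAGE_TOKEN_INDEX).nonzero())   # position of the image slot

The else-branch added in the commit covers the case where the chat template drops the <image> tag, re-inserting the sentinel just after the user turn starts; the sketch above only covers the common path where the tag survives.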