Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -126,6 +126,15 @@ def load_image(image_file, input_size=448, max_num=12):
     pixel_values = torch.stack(pixel_values)
     return pixel_values
 
+def extract_conclusion(text):
+    match = re.search(r"<CONCLUSION>(.*?)</CONCLUSION>", text, re.DOTALL)
+    return match.group(1).strip() if match else ""
+
+def extract_think(text):
+    text = re.sub(r"<.*?>", "", text.split("<CONCLUSION>")[0])  # remove all <...> tags
+    conclusion_part = extract_conclusion(text)
+    return text.replace(conclusion_part, "").strip()
+
 model = AutoModel.from_pretrained(
     "5CD-AI/Vintern-3B-R-beta",
     torch_dtype=torch.bfloat16,
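For reference, a minimal standalone sketch of how the two helpers added above split a tagged response; the sample string and the expected output in the comments are made up for illustration and are not taken from the app:

import re

def extract_conclusion(text):
    match = re.search(r"<CONCLUSION>(.*?)</CONCLUSION>", text, re.DOTALL)
    return match.group(1).strip() if match else ""

def extract_think(text):
    # keep only the part before <CONCLUSION>, then strip the remaining <...> tags
    text = re.sub(r"<.*?>", "", text.split("<CONCLUSION>")[0])
    conclusion_part = extract_conclusion(text)
    return text.replace(conclusion_part, "").strip()

sample = "<SUMMARY>Read the bill total.</SUMMARY>\n<REASONING>The total line shows 120000 VND.</REASONING>\n<CONCLUSION>120000 VND</CONCLUSION>"
print(extract_think(sample))        # -> "Read the bill total.\nThe total line shows 120000 VND."
print(extract_conclusion(sample))   # -> "120000 VND"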
@@ -136,35 +145,60 @@ model = AutoModel.from_pretrained(
 tokenizer = AutoTokenizer.from_pretrained("5CD-AI/Vintern-3B-R-beta", trust_remote_code=True, use_fast=False)
 
 global_think_mode =False
+think_prompt = """Bạn là người rất cẩn thận và đa nghi, vui lòng trả lời câu hỏi dưới đây bằng tiếng Việt. Khi suy luận bạn thường liệt kê ra các bằng chứng để chỉ ra các đáp án khả thi, suy luận và giải thích tại sao lại lựa chọn và loại bỏ trước khi đưa ra câu trả lời cuối cùng.
+
+Câu hỏi:
+{question_input}
+
+Hãy trả lời rất dài theo định dạng sau:
+<SUMMARY>...</SUMMARY>
+
+<CAPTION>...</CAPTION>
+
+<FIND_CANDIDATES_REASONING>...</FIND_CANDIDATES_REASONING>
+
+<TOP3_CANDIDATES>...</TOP3_CANDIDATES>
+
+<REASONING_PLAN>...</REASONING_PLAN>
+
+<REASONING>...</REASONING>
+
+<COUNTER_ARGUMENTS>...</COUNTER_ARGUMENTS>
+
+<VALIDATION_REASONING>...</VALIDATION_REASONING>
+
+<CONCLUSION>...</CONCLUSION>
+"""
+
 
 @spaces.GPU
 def chat(message, history):
     global global_think_mode
-
-
-
+
+    print("history",history)
+    print("message",message)
+
+    if len(history) != 0 and len(message["files"]) != 0:
+        return """Chúng tôi hiện chỉ hổ trợ 1 ảnh ở đầu ngữ cảnh! Vui lòng tạo mới cuộc trò chuyện.
+We currently only support one image at the start of the context! Please start a new conversation."""
 
-
-
-
-
-    if len(history) == 0 and len(message["files"]) != 0:
-        if "path" in message["files"][0]:
-            test_image = message["files"][0]["path"]
-        else:
-            test_image = message["files"][0]
-        pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
-    elif len(history) == 0 and len(message["files"]) == 0:
-        pixel_values = None
-    elif history[0][0][0] is not None and os.path.isfile(history[0][0][0]):
-        test_image = history[0][0][0]
-        pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
-    else:
-
+    if len(history) == 0 and len(message["files"]) != 0:
+        if "path" in message["files"][0]:
+            test_image = message["files"][0]["path"]
+        else:
+            test_image = message["files"][0]
+        pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
+    elif len(history) == 0 and len(message["files"]) == 0:
+        pixel_values = None
+    elif history[0][0][0] is not None and os.path.isfile(history[0][0][0]):
+        test_image = history[0][0][0]
+        pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
+    else:
+        pixel_values = None
 
-
-    generation_config = dict(max_new_tokens= 700, do_sample=False, num_beams = 3, repetition_penalty=2.
-
+    if not global_think_mode:
+        generation_config = dict(max_new_tokens= 700, do_sample=False, num_beams = 3, repetition_penalty=2.0)
+
         if len(history) == 0:
             if pixel_values is not None:
                 question = '<image>\n'+message["text"]
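Roughly translated, the Vietnamese think_prompt tells the model to answer carefully and skeptically in Vietnamese, listing evidence and candidate answers and explaining why each is kept or rejected before giving the final answer in the tagged sections above. The template is filled with str.format and, when an image is in context, prefixed with the <image> placeholder; below is a self-contained sketch of that assembly with a shortened template and a made-up message dict, not the Space's exact code:

# shortened stand-in for the full Vietnamese template added in the diff
think_prompt = """Bạn là người rất cẩn thận và đa nghi, vui lòng trả lời câu hỏi dưới đây bằng tiếng Việt.

Câu hỏi:
{question_input}

Hãy trả lời rất dài theo định dạng sau:
<SUMMARY>...</SUMMARY>
<CONCLUSION>...</CONCLUSION>
"""

message = {"text": "What kind of invoice is shown in the image?", "files": ["example.png"]}  # made-up Gradio-style message

question = think_prompt.format(question_input=message["text"])
if len(message["files"]) != 0:
    question = '<image>\n' + question   # the image placeholder goes in front of the prompt
print(question)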
@@ -199,15 +233,40 @@ def chat(message, history):
             time.sleep(0.02)
             yield generated_text_without_prompt
     else:
+        generation_config = dict(max_new_tokens= 2000, do_sample=False, num_beams = 3, repetition_penalty=2.0)
+
+        if len(history) == 0:
+            if pixel_values is not None:
+                question = '<image>\n'+ think_prompt.format(question_input=message["text"])
+            else:
+                question = think_prompt.format(question_input=message["text"])
+            response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
+        else:
+            conv_history = []
+            if history[0][0][0] is not None and os.path.isfile(history[0][0][0]):
+                start_index = 1
+            else:
+                start_index = 0
+
+            for i, chat_pair in enumerate(history[start_index:]):
+                if i == 0 and start_index == 1:
+                    conv_history.append(tuple(['<image>\n'+chat_pair[0],chat_pair[1]]))
+                else:
+                    conv_history.append(tuple(chat_pair))
+
+
+            print("conv_history",conv_history)
+            question = message["text"]
+            response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=conv_history, return_history=True)
+
+        print(f'User: {question}\nAssistant: {response}')
+        think_part = extract_think(response)
+        conclusion_part = extract_conclusion(response)
+
         buffer = ""
-        thinking = """
-I am processing your request carefully. First, I need to understand the question clearly.
-Then, I retrieve relevant information and analyze different possibilities.
-Finally, I generate a structured response that best fits your input.
-\nThis process ensures that I provide the most accurate and meaningful answer possible.
-"""
+        thinking = think_part
 
-        accumulated_text = "💡 **Thinking process
+        accumulated_text = "💡 **Thinking process:**\n\n"
         accumulated_text += "<pre><code>\n"
 
         temp_text = ""
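In the multi-turn think-mode branch above, the Gradio history is converted into the (user, assistant) tuples passed to model.chat, with the <image> placeholder re-attached to the first text turn when the conversation started with an upload. A standalone sketch of that loop with a made-up history, where start_index is hard-coded instead of using the os.path.isfile check:

# made-up Gradio-style history: the first entry holds the uploaded image,
# the later entries are [user, assistant] text pairs
history = [
    [("example.png",), None],
    ["What kind of invoice is this?", "It is an electricity bill."],
    ["What is the total amount?", "About 1,200,000 VND."],
]

start_index = 1   # in the app: 1 when history[0][0][0] is an existing image file, else 0

conv_history = []
for i, chat_pair in enumerate(history[start_index:]):
    if i == 0 and start_index == 1:
        # the first text turn carries the <image> placeholder so the model keeps seeing the image
        conv_history.append(tuple(['<image>\n' + chat_pair[0], chat_pair[1]]))
    else:
        conv_history.append(tuple(chat_pair))

print(conv_history)
# [('<image>\nWhat kind of invoice is this?', 'It is an electricity bill.'),
#  ('What is the total amount?', 'About 1,200,000 VND.')]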
@@ -218,6 +277,17 @@ Finally, I generate a structured response that best fits your input.
 
         accumulated_text += temp_text + "\n</code></pre>\n\n---\n"
 
+        # Yield the conclusion part
+        accumulated_text += "🎯 **Conclusion:**\n\n"
+
+        temp_text = ""
+        for char in conclusion_part:
+            temp_text += char
+            yield accumulated_text + temp_text + "\n\n---\n"
+            time.sleep(0.02)
+
+        accumulated_text += temp_text + "\n\n---\n"
+
 
 CSS ="""
 #component-10 {
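The handler streams both parts character by character, re-yielding the whole accumulated markdown string each time so the chat bubble keeps re-rendering. A minimal sketch of that yield pattern with made-up text and no model or UI; the loop over the thinking text is assumed to mirror the conclusion loop shown in the diff:

import time

def stream_reply(think_part, conclusion_part):
    accumulated_text = "💡 **Thinking process:**\n\n<pre><code>\n"

    temp_text = ""
    for char in think_part:                     # assumed to match the elided thinking loop
        temp_text += char
        yield accumulated_text + temp_text + "\n</code></pre>\n\n---\n"
        time.sleep(0.02)
    accumulated_text += temp_text + "\n</code></pre>\n\n---\n"

    accumulated_text += "🎯 **Conclusion:**\n\n"
    temp_text = ""
    for char in conclusion_part:
        temp_text += char
        yield accumulated_text + temp_text + "\n\n---\n"
        time.sleep(0.02)

last = ""
for chunk in stream_reply("Candidates: 3, 4. Pick 4.", "The answer is 4."):
    last = chunk            # each chunk is the full message rendered so far
print(last)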
@@ -318,7 +388,7 @@ def toggle_think_mode(current_state):
     global global_think_mode
     new_state = not current_state
     global_think_mode = not global_think_mode
-    button_label = "💡
+    button_label = "🧠Think💡" if new_state else "🧠Think"
     return new_state, button_label
 
 demo = gr.Blocks(css=CSS,js=js, theme='NoCrypt/miku')
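The diff only changes the label string returned by toggle_think_mode; the Blocks wiring itself is outside this hunk, so the snippet below is only a hedged sketch of how such a toggle is commonly wired in Gradio. The component names, initial label, and layout are assumptions, not the Space's actual code:

import gradio as gr

global_think_mode = False

def toggle_think_mode(current_state):
    global global_think_mode
    new_state = not current_state
    global_think_mode = not global_think_mode
    button_label = "🧠Think💡" if new_state else "🧠Think"
    return new_state, button_label

with gr.Blocks() as demo:
    think_state = gr.State(False)        # assumed: per-session mirror of the global flag
    think_button = gr.Button("🧠Think")  # assumed initial label
    # returning a string to a Button output updates its label
    think_button.click(toggle_think_mode, inputs=think_state, outputs=[think_state, think_button])

demo.launch()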