# NOTE: the original scrape captured the Hugging Face Spaces page banner
# ("Spaces: Running") here; it is page chrome, not part of the program.
# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
import os
import re
import cv2
import json
import time
import numpy as np
import gradio as gr
from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN
from visualizer import draw_boxes_points_with_labels

# Backend inference client for the Seed1.5-VL model.
# MODEL_ID and API_KEY are read from the environment (os.getenv returns None
# when unset; SeedVLInfer is expected to handle/validate that).
infer = SeedVLInfer(model_id=os.getenv('MODEL_ID'), api_key=os.getenv('API_KEY'))
# UI strings for every localized widget, keyed first by widget id and then by
# interface language ("English" / "中文").  `update_lang` reads this table to
# relabel the whole interface when the language dropdown changes.
label_translations = {
    "gr_chatinterface_ofl": {
        "English": "Chatbot",
        "中文": "对话界面"
    },
    "gr_chatinterface_ol": {
        "English": "Chatbot",
        "中文": "对话界面"
    },
    "gr_tab_ol": {
        "English": "Online",
        "中文": "在线模式"
    },
    "gr_tab_ofl": {
        "English": "Offline",
        "中文": "离线模式"
    },
    # Thinking-mode labels come from the infer module's mode enums.
    "gr_thinking": {
        "English": ConversationModeI18N.D,
        "中文": ConversationModeCN.D,
    },
    "gr_temperature": {
        "English": "Temperature",
        "中文": "温度系数"
    },
    "gr_webcam_image": {
        "English": "🤳 Open Webcam",
        "中文": "🤳 打开摄像头"
    },
    "gr_webcam_images": {
        "English": "📹 Recorded Frames",
        "中文": "📹 录制的视频帧"
    },
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English":
        "Ask me anything. You can also drop in images and .mp4 videos.",
        "中文": "有什么想问的?支持上传图片和.mp4视频。"
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
        "中文": "有什么想问的?"
    },
    "gr_clear_button": {
        "English": "🧹 Clear History",
        "中文": "🧹 清除历史对话"
    }
}
def add_escape(text: str) -> str:
    """Escape angle brackets (as '\\<' / '\\>') so user text is not rendered as HTML.

    Uses explicit '\\\\<' escapes: the original "'\\<'" relied on Python keeping
    unknown escape sequences verbatim, which raises a SyntaxWarning on 3.12+.
    The produced string is identical.
    """
    return text.replace('<', '\\<').replace('>', '\\>')
def remove_escape(text: str) -> str:
    """Undo `add_escape`: restore '\\<' / '\\>' back to plain angle brackets.

    Uses explicit '\\\\<' escapes instead of the original "'\\<'" invalid escape
    sequence (SyntaxWarning on Python 3.12+); runtime behavior is unchanged.
    """
    return text.replace('\\<', '<').replace('\\>', '>')
def plot_boxes_points_detections(image_path, message):
    """Parse box/point annotations out of a model response and draw them.

    Three formats are tried in order (later ones only when earlier found nothing):
      1. JSON detections: [{"category": ..., "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, ...]
      2. Bare box tags:   <bbox>x1 y1 x2 y2</bbox>
      3. Point tags:      <point>x y</point>
    Coordinates are assumed normalized to [0, 1000] and are scaled to the
    image's pixel size — TODO confirm against the model's grounding spec.

    Args:
        image_path: path to the image the annotations refer to.
        message: model response text possibly containing annotations.

    Returns:
        The annotated image as produced by `draw_boxes_points_with_labels`,
        or None when there is nothing to draw or the image cannot be read.

    Raises:
        May raise (e.g. json.JSONDecodeError, KeyError) on malformed matches;
        the caller wraps this call in a try/except.
    """
    detection_pattern = r'\[\s*{.*?}\s*\]'
    detection_matches = re.finditer(detection_pattern, message, flags=re.DOTALL)
    bboxes, categories = [], []
    for match in detection_matches:
        detections = json.loads(match.group(0))
        for detection in detections:
            cat, bbox_str = detection['category'], detection['bbox']
            # The model sometimes emits a truncated closing tag ('</bbox').
            bbox_str = bbox_str.replace('<bbox>', '').replace('</bbox>', '').replace('</bbox', '')
            # split() with no argument tolerates repeated/mixed whitespace,
            # where split(' ') would produce '' fields and crash float().
            bbox = [float(v) for v in bbox_str.split()]
            bboxes.append(bbox)
            categories.append(cat)
    if not bboxes:
        box_pattern = r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>'
        bboxes = [
            [float(g) for g in match.groups()]
            for match in re.finditer(box_pattern, message)
        ]
    points = []
    if not bboxes:
        point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
        points = [
            [float(match.group(1)), float(match.group(2))]
            for match in re.finditer(point_pattern, message)
        ]
    if not bboxes and not points:
        return None
    # Normalize from the model's [0, 1000] coordinate space to [0, 1].
    bboxes = np.array(bboxes, dtype='float') / 1000
    points = np.array(points, dtype='float') / 1000
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None for missing/unreadable files; skip the
        # visualization instead of crashing on `.shape`.
        return None
    h, w, _ = image.shape
    if bboxes.size:
        bboxes[:, 0::2] *= w  # x1, x2
        bboxes[:, 1::2] *= h  # y1, y2
    if points.size:
        points[:, 0] *= w
        points[:, 1] *= h
    return draw_boxes_points_with_labels(image, bboxes, points, categories)
def general_chat(inputs: dict, gr_history: list, infer_history: list,
                 if_thinking: bool, temperature: float, online: bool = False):
    """Stream model responses for one chat turn.

    Args:
        inputs: dict with optional 'text' (user prompt, HTML-escaped by the UI)
            and optional 'files' (list of image / .mp4 paths).
        gr_history: gradio-visible chat history (required by ChatInterface,
            unused here).
        infer_history: backend conversation history, advanced by `infer`.
        if_thinking: when True, run in thinking mode and split the streamed
            text at '</think>' into a reasoning part and an answer part.
        temperature: sampling temperature forwarded to the backend.
        online: True when called from the webcam (online) tab.

    Yields:
        (list of chatbot message dicts, updated infer_history).
    """
    if 'text' in inputs:
        # The submit hook escaped '<'/'>'; restore the raw text for the model.
        inputs['text'] = remove_escape(inputs['text'])
    mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
    for response_text, infer_history, finished in infer(inputs=inputs,
                                                        history=infer_history,
                                                        mode=mode,
                                                        temperature=temperature,
                                                        online=online):
        if if_thinking:
            # Split only on the FIRST '</think>' so a tag echoed inside the
            # answer does not make the 2-way unpack raise ValueError.
            reasoning_text, response_text = response_text.split('</think>', 1)
            # removeprefix, not lstrip('<think>'): lstrip strips any of the
            # characters '<think>' and would corrupt reasoning text that
            # happens to start with one of them.
            reasoning_text = reasoning_text.removeprefix('<think>')
            response_message = [{
                "role": "assistant",
                "content": add_escape(reasoning_text),
                'metadata': {
                    'title': '🤔 Thinking'
                }
            }, {
                "role": "assistant",
                "content": add_escape(response_text)
            }]
        else:
            response_message = [{
                "role": "assistant",
                "content": add_escape(response_text)
            }]
        if finished and len(inputs.get('files', [])) == 1 and not inputs['files'][0].endswith('.mp4'):
            # Exactly one still image: best-effort visualization of any
            # boxes/points the model emitted in its final answer.
            image_path = inputs['files'][0]
            response_text = infer_history[-1]['content']
            try:
                if if_thinking:
                    reasoning_text, response_text = response_text.split('</think>', 1)
                output_image = plot_boxes_points_detections(image_path, response_text)
                if output_image is not None:
                    response_message.append({
                        "role": "assistant",
                        "content": gr.Image(output_image),
                    })
            except Exception as e:
                # Visualization must never break the chat stream.
                print(e)
        yield response_message, infer_history
def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
                       gr_counter: int, infer_history: list, if_thinking: bool,
                       temperature: float):
    """Stream one chat turn over the webcam frames recorded since the last turn.

    Yields (chatbot message, frame-counter update, updated inference history);
    after the first status yield the counter is left untouched via gr.skip().
    """
    # Only consume frames that arrived after the previous turn.
    fresh_frames = (gr_webcam_images or [])[gr_counter:]
    frame_paths = []
    for webp, _ in fresh_frames:
        frame_paths.append(webp)
    request = {'text': text, 'files': frame_paths}
    advanced_counter = gr_counter + len(fresh_frames)
    yield f'received {len(fresh_frames)} new frames, processing...', advanced_counter, infer_history
    # Delegate to the shared chat generator in online mode.
    for response_message, infer_history in general_chat(
            request, gr_history, infer_history, if_thinking, temperature, online=True):
        yield response_message, gr.skip(), infer_history
# ---------------------------------------------------------------------------
# Gradio UI: a header with project badges and a language selector, plus two
# tabs — "Offline" (upload images/videos and chat) and "Online" (chat over
# live webcam frames).  Each tab mirrors its visible thinking/temperature
# controls into hidden components that are wired into the chat function via
# additional_inputs.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    # ----- Header: title, GitHub/paper badges, language selector -----
    with gr.Row():
        with gr.Column():
            gr_title = gr.Markdown('# Seed1.5-VL')
            with gr.Row():
                # GitHub badge linking to the cookbook repository.
                gr.Markdown(
                    """
                    <div style="display:flex; flex-direction:column; gap:10px;">
                        <a
                          href="https://github.com/ByteDance-Seed/Seed1.5-VL"
                          target="_blank"
                          style="
                            display: inline-flex;
                            align-items: center;
                            gap: 8px;
                            white-space: nowrap;
                            text-decoration: none;
                          "
                        >
                          <img
                            src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/github/github-original.svg"
                            alt="GitHub"
                            width="24"
                          >
                          Seed1.5-VL Cookbook
                        </a>
                    </div>
                    """
                )
                # Paper badge linking to the Hugging Face paper page.
                gr.Markdown(
                    """
                    <div style="display:flex; flex-direction:column; gap:10px;">
                        <a
                          href="https://huggingface.co/papers/2505.07062"
                          target="_blank"
                          style="
                            display: inline-flex;
                            align-items: center;
                            gap: 8px;
                            white-space: nowrap;
                            text-decoration: none;
                          "
                        >
                          <img
                            src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
                            alt="Paper"
                            width="24"
                          >
                          Seed1.5-VL Paper
                        </a>
                    </div>
                    """,
                )
                # Empty markdown cells act as flexible spacers in the header row.
                gr.Markdown('')
                gr.Markdown('')
                gr.Markdown('')
                gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
                                               value="English",
                                               label="🌐 English Interface/中文界面",
                                               interactive=True,
                                               min_width=400,
                                               scale=0)
    with gr.Tabs():
        # ----- Offline tab: upload images/.mp4 videos and chat -----
        with gr.Tab("Offline") as gr_tab_ofl:
            # Backend conversation history, kept alongside the visible chat.
            gr_infer_history = gr.State([])
            # Hidden mirrors of the visible thinking/temperature controls;
            # these are what actually reach `general_chat` (additional_inputs).
            gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
            gr_temperature_hidden = gr.Slider(minimum=0.0,
                                              maximum=2.0,
                                              step=0.1,
                                              value=0.0,
                                              interactive=True,
                                              visible=False)
            gr_chatinterface_ofl = gr.ChatInterface(
                fn=general_chat,
                type="messages",
                multimodal=True,
                chatbot=gr.Chatbot(height=600),
                textbox=gr.MultimodalTextbox(
                    file_count="multiple",
                    file_types=["image", ".mp4"],
                    sources=["upload"],
                    stop_btn=True,
                    placeholder=label_translations[
                        'gr_chatinterface_ofl.textbox.placeholder']['English'],
                ),
                additional_inputs=[
                    gr_infer_history, gr_thinking_hidden, gr_temperature_hidden
                ],
                additional_outputs=[gr_infer_history],
            )

            def add_escape_fn(inputs: dict):
                # Escape '<'/'>' in the saved user input so it is displayed
                # literally instead of being rendered as HTML in the chatbot.
                if inputs and 'text' in inputs:
                    inputs['text'] = add_escape(inputs['text'])
                return inputs

            gr_chatinterface_ofl.textbox.submit(
                fn=add_escape_fn,
                inputs=[gr_chatinterface_ofl.saved_input],
                outputs=[gr_chatinterface_ofl.saved_input]
            )
            # Clearing the visible chat also resets the backend history.
            gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                  fn=lambda: [],
                  outputs=[gr_infer_history])
            with gr.Row():
                gr_thinking_ofl = gr.Checkbox(
                    value=True,
                    label=label_translations['gr_thinking']['English'],
                )
                # Keep the hidden mirror in sync with the visible checkbox.
                gr_thinking_ofl.change(lambda x: x,
                                       inputs=gr_thinking_ofl,
                                       outputs=gr_thinking_hidden)
                gr_temperature_ofl = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    value=0.0,
                    label=label_translations['gr_temperature']['English'],
                    interactive=True)
                # Keep the hidden mirror in sync with the visible slider.
                gr_temperature_ofl.change(lambda x: x,
                                          inputs=gr_temperature_ofl,
                                          outputs=gr_temperature_hidden)
            gr_clear_button_ofl = gr.Button(value=label_translations['gr_clear_button']['English'])

            def clear_history_fn():
                # Reset: conversation id, saved conversations, visible chat,
                # chatbot state, backend history.
                return None, [], [], [], []

            gr_clear_button_ofl.click(
                fn=clear_history_fn,
                outputs=[
                    gr_chatinterface_ofl.conversation_id,
                    gr_chatinterface_ofl.saved_conversations,
                    gr_chatinterface_ofl.chatbot,
                    gr_chatinterface_ofl.chatbot_state,
                    gr_infer_history
                ]
            )
            # English examples column (visibility toggled by the language selector).
            with gr.Column(visible=True) as gr_examples_en:
                gr.Examples(
                    label='7 Examples: text, image, video, multiple images/videos, visual puzzle, points grounding, open-vocabulary detection.',
                    examples=[
                        {
                            "text": "Who are you?",
                            "files": []
                        },
                        {
                            "text": "Introduce this.",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text":
                            """Find Curry's "Good Night" celebration time.""",
                            "files":
                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text":
                            "Share your feelings.",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "Look and answer.",
                            "files": ["examples/puzzle.jpg"]
                        },
                        {
                            "text": "Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
            # Chinese examples column (shown when the interface language is 中文).
            with gr.Column(visible=False) as gr_examples_cn:
                gr.Examples(
                    label='七个示例:文本,图像,视频,多个图像/视频,视觉解谜,坐标定位,开放式物体检测。',
                    examples=[
                        {
                            "text": "你是谁?",
                            "files": []
                        },
                        {
                            "text": "介绍一下。",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text":
                            "找到库里的“晚安”庆祝时间段。",
                            "files":
                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text":
                            "你有什么感想?",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "看图回答。",
                            "files": ["examples/puzzle.jpg"]
                        },
                        {
                            "text": "请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表,就像:[{"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
        # ----- Online tab: record webcam frames and chat about them -----
        with gr.Tab("Online") as gr_tab_ol:
            with gr.Row():
                with gr.Column(scale=1):
                    gr_infer_history_ol = gr.State([])
                    # NOTE: these names shadow the offline tab's hidden
                    # controls; the offline wiring already captured its own
                    # component objects, so each tab keeps its own mirrors.
                    gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
                    gr_temperature_hidden = gr.Slider(minimum=0.0,
                                                      maximum=2.0,
                                                      step=0.1,
                                                      value=1.0,
                                                      interactive=True,
                                                      visible=False)
                    with gr.Row():
                        with gr.Column(scale=1):
                            # Live webcam input; each streamed frame is cached
                            # into the gallery below.
                            gr_webcam_image = gr.Image(
                                label=label_translations['gr_webcam_image']
                                ['English'],
                                sources="webcam",
                                height=250,
                                type='filepath')
                            gr_webcam_images = gr.Gallery(
                                label=label_translations['gr_webcam_images']
                                ['English'],
                                show_label=True,
                                format='webp',
                                columns=1,
                                height=250,
                                preview=True,
                                interactive=False)
                            # Index of the first not-yet-consumed frame.
                            gr_counter = gr.Number(value=0, visible=False)
                        with gr.Column(scale=3):
                            gr_chatinterface_ol = gr.ChatInterface(
                                fn=online_record_chat,
                                type="messages",
                                multimodal=False,
                                chatbot=gr.Chatbot(height=600),
                                textbox=gr.
                                Textbox(placeholder=label_translations[
                                    'gr_chatinterface_ol.textbox.placeholder']
                                        ['English'],
                                        submit_btn=True,
                                        stop_btn=True),
                                additional_inputs=[
                                    gr_webcam_images, gr_counter,
                                    gr_infer_history_ol, gr_thinking_hidden,
                                    gr_temperature_hidden
                                ],
                                additional_outputs=[
                                    gr_counter, gr_infer_history_ol
                                ],
                            )

                            def cache_webcam(recorded_image: str,
                                             recorded_images: list):
                                # Append the newest streamed frame to the gallery.
                                if not recorded_images:
                                    recorded_images = []
                                return recorded_images + [recorded_image]

                            # Stream one frame per second into the gallery.
                            gr_webcam_image.stream(
                                fn=cache_webcam,
                                inputs=[gr_webcam_image, gr_webcam_images],
                                outputs=[gr_webcam_images],
                                stream_every=1,
                                concurrency_limit=30,
                            )
                            with gr.Row():
                                gr_thinking_ol = gr.Checkbox(
                                    value=True,
                                    label=label_translations['gr_thinking']
                                    ['English'],
                                )
                                # Sync the visible checkbox into the hidden mirror.
                                gr_thinking_ol.change(
                                    lambda x: x,
                                    inputs=gr_thinking_ol,
                                    outputs=gr_thinking_hidden)
                                gr_temperature_ol = gr.Slider(
                                    minimum=0.0,
                                    maximum=2.0,
                                    step=0.1,
                                    value=1.0,
                                    label=label_translations['gr_temperature']
                                    ['English'],
                                    interactive=True)
                                # Sync the visible slider into the hidden mirror.
                                gr_temperature_ol.change(
                                    lambda x: x,
                                    inputs=gr_temperature_ol,
                                    outputs=gr_temperature_hidden)
                            gr_clear_button_ol = gr.Button(value=label_translations['gr_clear_button']['English'])

                            def clear_history_fn():
                                # Same reset as the offline tab, for this tab's
                                # components.
                                return None, [], [], [], []

                            gr_clear_button_ol.click(
                                fn=clear_history_fn,
                                outputs=[
                                    gr_chatinterface_ol.conversation_id,
                                    gr_chatinterface_ol.saved_conversations,
                                    gr_chatinterface_ol.chatbot,
                                    gr_chatinterface_ol.chatbot_state,
                                    gr_infer_history_ol
                                ]
                            )

    def update_lang(lang: str):
        """Return gr.update(...) objects relabeling every localized widget for `lang`.

        The tuple order must match the outputs list of gr_lang_selector.change
        below (16 entries).
        """
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            # Toggle which examples column is visible.
            gr.update(visible=lang == 'English'),
            gr.update(visible=lang != 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
        )

    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_thinking_ofl,
                                gr_thinking_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_examples_cn,
                                gr_webcam_image,
                                gr_webcam_images,
                                gr_clear_button_ofl,
                                gr_clear_button_ol,
                            ])

# High concurrency limits so webcam streaming and chat generation can overlap.
demo.queue(default_concurrency_limit=100, max_size=100).launch(share=True,
                                                               max_threads=100,
                                                               ssr_mode=False)