# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
import os
import re
import cv2
import json
import time
import numpy as np
import gradio as gr
from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN
from visualizer import draw_boxes_points_with_labels
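# The inference client is configured via environment variables: MODEL_ID
# selects the served model and API_KEY authenticates requests.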
infer = SeedVLInfer(model_id=os.getenv('MODEL_ID'), api_key=os.getenv('API_KEY'))
label_translations = {
"gr_chatinterface_ofl": {
"English": "Chatbot",
"中文": "对话界面"
},
"gr_chatinterface_ol": {
"English": "Chatbot",
"中文": "对话界面"
},
"gr_tab_ol": {
"English": "Online",
"中文": "在线模式"
},
"gr_tab_ofl": {
"English": "Offline",
"中文": "离线模式"
},
"gr_thinking": {
"English": ConversationModeI18N.D,
"中文": ConversationModeCN.D,
},
"gr_temperature": {
"English": "Temperature",
"中文": "温度系数"
},
"gr_webcam_image": {
"English": "🤳 Open Webcam",
"中文": "🤳 打开摄像头"
},
"gr_webcam_images": {
"English": "📹 Recorded Frames",
"中文": "📹 录制的视频帧"
},
"gr_chatinterface_ofl.textbox.placeholder": {
"English":
"Ask me anything. You can also drop in images and .mp4 videos.",
"中文": "有什么想问的?支持上传图片和.mp4视频。"
},
"gr_chatinterface_ol.textbox.placeholder": {
"English": "Ask me anything...",
"中文": "有什么想问的?"
},
"gr_clear_button": {
"English": "🧹 Clear History",
"中文": "🧹 清除历史对话"
}
}
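# Gradio's Chatbot renders message content as Markdown/HTML, so unescaped
# angle brackets (e.g. model special tokens such as <bbox> or <point>) would
# be parsed as HTML tags and disappear from the display. add_escape protects
# text before it is shown; remove_escape restores it before it reaches the
# model.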
def add_escape(text: str):
    return text.replace('<', '\\<').replace('>', '\\>')
def remove_escape(text: str):
    return text.replace('\\<', '<').replace('\\>', '>')
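# Parse grounding results out of a chat response and draw them on the input
# image. Two formats are handled: a JSON list of detections, e.g.
#   [{"category": "cup", "bbox": "<bbox>100 200 300 400</bbox>"}]
# and bare <bbox>/<point> spans embedded in free text. Returns the annotated
# image, or None when the message contains no boxes or points.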
def plot_boxes_points_detections(image_path, message):
detection_pattern = r'\[\s*{.*?}\s*\]'
detection_matches = re.finditer(detection_pattern, message, flags=re.DOTALL)
bboxes, categories = [], []
for match in detection_matches:
matched_str = match.group(0)
detections = json.loads(matched_str)
for detection in detections:
cat, bbox_str = detection['category'], detection['bbox']
            # NOTE: the angle-bracket grounding tokens below are assumed
            # (<bbox>...</bbox> wrapping "x1 y1 x2 y2"); adjust if the model
            # emits different markers.
            bbox_str = bbox_str.replace('<bbox>', '').replace('</bbox>', '')
            bboxes.append([float(x) for x in bbox_str.split()])
            categories.append(cat)
    # Fall back to bare <bbox> spans when no JSON detection block is present.
    if not bboxes:
        box_pattern = r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>'
box_matches = re.finditer(box_pattern, message)
bboxes = [
[float(match.group(1)), float(match.group(2)),
float(match.group(3)), float(match.group(4))]
for match in box_matches
]
points = []
if not bboxes:
        point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
point_matches = re.finditer(point_pattern, message)
points = [
[float(match.group(1)), float(match.group(2))]
for match in point_matches
]
if not bboxes and not points:
return
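    # Grounding coordinates are normalized to [0, 1000]; divide by 1000 and
    # scale by the image width/height to recover pixel positions.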
bboxes = np.array(bboxes, dtype='float') / 1000
points = np.array(points, dtype='float') / 1000
image = cv2.imread(image_path)
h, w, c = image.shape
if bboxes.size:
bboxes[:, 0::2] *= w
bboxes[:, 1::2] *= h
if points.size:
points[:, 0] *= w
points[:, 1] *= h
output_image = draw_boxes_points_with_labels(image, bboxes, points, categories)
return output_image
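# Core chat handler shared by both tabs. Streams partial responses from the
# model, yielding (response_message, infer_history) pairs so the UI updates
# incrementally. In thinking mode the reasoning segment is rendered in a
# collapsible "🤔 Thinking" block; when a single image was uploaded, any
# grounding output in the final answer is plotted and appended as an image.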
def general_chat(inputs: dict, gr_history: list, infer_history: list,
if_thinking: bool, temperature: float, online: bool = False):
if 'text' in inputs:
inputs['text'] = remove_escape(inputs['text'])
mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
for response_text, infer_history, finished in infer(inputs=inputs,
history=infer_history,
mode=mode,
temperature=temperature,
online=online):
if if_thinking:
            # Split reasoning from the final answer; '<think>'/'</think>' are
            # assumed delimiter tokens for the thinking segment.
            reasoning_text, response_text = response_text.split('</think>')
            reasoning_text = reasoning_text.removeprefix('<think>')
response_message = [{
"role": "assistant",
"content": add_escape(reasoning_text),
'metadata': {
'title': '🤔 Thinking'
}
}, {
"role": "assistant",
"content": add_escape(response_text)
}]
else:
response_message = [{
"role": "assistant",
"content": add_escape(response_text)
}]
if finished and len(inputs.get('files', [])) == 1 and not inputs['files'][0].endswith('.mp4'):
image_path = inputs['files'][0]
response_text = infer_history[-1]['content']
try:
if if_thinking:
                    reasoning_text, response_text = response_text.split('</think>')
output_image = plot_boxes_points_detections(image_path, response_text)
if output_image is not None:
response_message.append({
"role": "assistant",
"content": gr.Image(output_image),
})
except Exception as e:
print(e)
yield response_message, infer_history
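# Chat handler for the online (webcam) tab: gr_counter records how many
# gallery frames have already been sent, so each turn forwards only the
# frames captured since the previous message.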
def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
gr_counter: int, infer_history: list, if_thinking: bool,
temperature: float):
if not gr_webcam_images:
gr_webcam_images = []
gr_webcam_images = gr_webcam_images[gr_counter:]
inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
yield f'received {len(gr_webcam_images)} new frames, processing...', gr_counter + len(
gr_webcam_images), infer_history
for response_message, infer_history in general_chat(
inputs, gr_history, infer_history, if_thinking, temperature, online=True):
yield response_message, gr.skip(), infer_history
with gr.Blocks() as demo:
with gr.Row():
with gr.Column():
gr_title = gr.Markdown('# Seed1.5-VL')
with gr.Row():
gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
value="English",
label="🌐 English Interface/中文界面",
interactive=True,
min_width=400,
scale=0)
with gr.Tabs():
with gr.Tab("Offline") as gr_tab_ofl:
gr_infer_history = gr.State([])
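            # Hidden mirrors of the thinking/temperature controls shown below
            # the chat: they must exist before the ChatInterface that consumes
            # them as additional_inputs, so the visible widgets (created
            # later) sync their values into these via .change().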
gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
gr_temperature_hidden = gr.Slider(minimum=0.0,
maximum=2.0,
step=0.1,
value=0.0,
interactive=True,
visible=False)
gr_chatinterface_ofl = gr.ChatInterface(
fn=general_chat,
type="messages",
multimodal=True,
chatbot=gr.Chatbot(height=600),
textbox=gr.MultimodalTextbox(
file_count="multiple",
file_types=["image", ".mp4"],
sources=["upload"],
stop_btn=True,
placeholder=label_translations[
'gr_chatinterface_ofl.textbox.placeholder']['English'],
),
additional_inputs=[
gr_infer_history, gr_thinking_hidden, gr_temperature_hidden
],
additional_outputs=[gr_infer_history],
)
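            # Escape the user's text in saved_input before the ChatInterface
            # echoes it into the chatbot, so literal angle brackets survive
            # Markdown rendering; general_chat un-escapes it again before the
            # model sees it.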
def add_escape_fn(inputs: dict):
if inputs and 'text' in inputs:
inputs['text'] = add_escape(inputs['text'])
return inputs
gr_chatinterface_ofl.textbox.submit(
fn=add_escape_fn,
inputs=[gr_chatinterface_ofl.saved_input],
outputs=[gr_chatinterface_ofl.saved_input]
)
gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
fn=lambda: [],
outputs=[gr_infer_history])
with gr.Row():
gr_thinking_ofl = gr.Checkbox(
value=True,
label=label_translations['gr_thinking']['English'],
)
gr_thinking_ofl.change(lambda x: x,
inputs=gr_thinking_ofl,
outputs=gr_thinking_hidden)
gr_temperature_ofl = gr.Slider(
minimum=0.0,
maximum=2.0,
step=0.1,
value=0.0,
label=label_translations['gr_temperature']['English'],
interactive=True)
gr_temperature_ofl.change(lambda x: x,
inputs=gr_temperature_ofl,
outputs=gr_temperature_hidden)
gr_clear_button_ofl = gr.Button(value=label_translations['gr_clear_button']['English'])
def clear_history_fn():
return None, [], [], [], []
gr_clear_button_ofl.click(
fn=clear_history_fn,
outputs=[
gr_chatinterface_ofl.conversation_id,
gr_chatinterface_ofl.saved_conversations,
gr_chatinterface_ofl.chatbot,
gr_chatinterface_ofl.chatbot_state,
gr_infer_history
]
)
with gr.Column(visible=True) as gr_examples_en:
gr.Examples(
label='7 Examples: text, image, video, multiple images/videos, visual puzzle, points grounding, open-vocabulary detection.',
examples=[
{
"text": "Who are you?",
"files": []
},
{
"text": "Introduce this.",
"files": ["examples/bancopy.jpg"]
},
{
"text":
"""Find Curry's "Good Night" celebration time.""",
"files":
["examples/I7pTpMjqNRM_1080p_small.mp4"]
},
{
"text":
"Share your feelings.",
"files": [
"examples/newyork.jpg",
"examples/beijing.jpg"
]
},
{
"text": "Look and answer.",
"files": ["examples/puzzle.jpg"]
},
{
"text": "Please point out all the hats on people's heads in the image, output concatenated point coordinates like x yx y",
"files": ["examples/000000001000.jpeg"]
},
{
"text": """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "x1 y1 x2 y2"}, {"category": category, "bbox": "x1 y1 x2 y2"}]""",
"files": ["examples/000000018380.jpeg"]
}
],
inputs=[gr_chatinterface_ofl.textbox],
)
with gr.Column(visible=False) as gr_examples_cn:
gr.Examples(
label='七个示例:文本,图像,视频,多个图像/视频,视觉解谜,坐标定位,开放式物体检测。',
examples=[
{
"text": "你是谁?",
"files": []
},
{
"text": "介绍一下。",
"files": ["examples/bancopy.jpg"]
},
{
"text":
"找到库里的“晚安”庆祝时间段。",
"files":
["examples/I7pTpMjqNRM_1080p_small.mp4"]
},
{
"text":
"你有什么感想?",
"files": [
"examples/newyork.jpg",
"examples/beijing.jpg"
]
},
{
"text": "看图回答。",
"files": ["examples/puzzle.jpg"]
},
{
"text": "请点出图像中所有戴在头上的帽子, 输出串联的点坐标x yx y",
"files": ["examples/000000001000.jpeg"]
},
{
"text": """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表,就像:[{"category": 类别, "bbox": "x1 y1 x2 y2"}, {"category": 类别, "bbox": "x1 y1 x2 y2"}]""",
"files": ["examples/000000018380.jpeg"]
}
],
inputs=[gr_chatinterface_ofl.textbox],
)
with gr.Tab("Online") as gr_tab_ol:
with gr.Row():
with gr.Column(scale=1):
gr_infer_history_ol = gr.State([])
gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
gr_temperature_hidden = gr.Slider(minimum=0.0,
maximum=2.0,
step=0.1,
value=1.0,
interactive=True,
visible=False)
with gr.Row():
with gr.Column(scale=1):
gr_webcam_image = gr.Image(
label=label_translations['gr_webcam_image']
['English'],
sources="webcam",
height=250,
type='filepath')
gr_webcam_images = gr.Gallery(
label=label_translations['gr_webcam_images']
['English'],
show_label=True,
format='webp',
columns=1,
height=250,
preview=True,
interactive=False)
gr_counter = gr.Number(value=0, visible=False)
with gr.Column(scale=3):
gr_chatinterface_ol = gr.ChatInterface(
fn=online_record_chat,
type="messages",
multimodal=False,
chatbot=gr.Chatbot(height=600),
                        textbox=gr.Textbox(
                            placeholder=label_translations[
                                'gr_chatinterface_ol.textbox.placeholder']['English'],
                            submit_btn=True,
                            stop_btn=True),
additional_inputs=[
gr_webcam_images, gr_counter,
gr_infer_history_ol, gr_thinking_hidden,
gr_temperature_hidden
],
additional_outputs=[
gr_counter, gr_infer_history_ol
],
)
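                    # Collect frames streamed from the webcam into the
                    # gallery; with stream_every=1 below, a frame arrives
                    # roughly once per second while the webcam is active.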
def cache_webcam(recorded_image: str,
recorded_images: list):
if not recorded_images:
recorded_images = []
return recorded_images + [recorded_image]
gr_webcam_image.stream(
fn=cache_webcam,
inputs=[gr_webcam_image, gr_webcam_images],
outputs=[gr_webcam_images],
stream_every=1,
concurrency_limit=30,
)
with gr.Row():
gr_thinking_ol = gr.Checkbox(
value=True,
label=label_translations['gr_thinking']
['English'],
)
gr_thinking_ol.change(
lambda x: x,
inputs=gr_thinking_ol,
outputs=gr_thinking_hidden)
gr_temperature_ol = gr.Slider(
minimum=0.0,
maximum=2.0,
step=0.1,
value=1.0,
label=label_translations['gr_temperature']
['English'],
interactive=True)
gr_temperature_ol.change(
lambda x: x,
inputs=gr_temperature_ol,
outputs=gr_temperature_hidden)
gr_clear_button_ol = gr.Button(value=label_translations['gr_clear_button']['English'])
def clear_history_fn():
return None, [], [], [], []
gr_clear_button_ol.click(
fn=clear_history_fn,
outputs=[
gr_chatinterface_ol.conversation_id,
gr_chatinterface_ol.saved_conversations,
gr_chatinterface_ol.chatbot,
gr_chatinterface_ol.chatbot_state,
gr_infer_history_ol
]
)
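    # Localize the UI: returns one gr.update per component, in exactly the
    # order of the outputs list wired to gr_lang_selector.change below.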
def update_lang(lang: str):
return (
gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
gr.update(placeholder=label_translations[
'gr_chatinterface_ofl.textbox.placeholder'][lang]),
gr.update(placeholder=label_translations[
'gr_chatinterface_ol.textbox.placeholder'][lang]),
gr.update(label=label_translations['gr_tab_ofl'][lang]),
gr.update(label=label_translations['gr_tab_ol'][lang]),
gr.update(label=label_translations['gr_thinking'][lang]),
gr.update(label=label_translations['gr_thinking'][lang]),
gr.update(label=label_translations['gr_temperature'][lang]),
gr.update(label=label_translations['gr_temperature'][lang]),
gr.update(visible=lang == 'English'),
gr.update(visible=lang != 'English'),
gr.update(label=label_translations['gr_webcam_image'][lang]),
gr.update(label=label_translations['gr_webcam_images'][lang]),
gr.update(value=label_translations['gr_clear_button'][lang]),
gr.update(value=label_translations['gr_clear_button'][lang]),
)
gr_lang_selector.change(fn=update_lang,
inputs=[gr_lang_selector],
outputs=[
gr_chatinterface_ofl.chatbot,
gr_chatinterface_ol.chatbot,
gr_chatinterface_ofl.textbox,
gr_chatinterface_ol.textbox,
gr_tab_ofl,
gr_tab_ol,
gr_thinking_ofl,
gr_thinking_ol,
gr_temperature_ofl,
gr_temperature_ol,
gr_examples_en,
gr_examples_cn,
gr_webcam_image,
gr_webcam_images,
gr_clear_button_ofl,
gr_clear_button_ol,
])
demo.queue(default_concurrency_limit=100, max_size=100).launch(share=True,
max_threads=100,
ssr_mode=False)