# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
"""Gradio demo for Seed1.5-VL: offline (file upload) and online (webcam) chat.

NOTE(review): this file had every ``<...>`` token stripped from its string
literals by an HTML sanitizer (e.g. ``split('')``, ``replace('', '')``, an
undefined ``box_pattern``).  The thinking/box/point markers below
(``<think>``, ``</think>``, ``<box>``, ``</box>``, ``<point>``, ``</point>``)
are reconstructions — confirm them against the model's actual output format.
"""
import os
import re
import json
import time  # kept: may be used by code outside this view

import cv2
import numpy as np
import gradio as gr

from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN
from visualizer import draw_boxes_points_with_labels

# Single shared inference client, configured via environment variables.
infer = SeedVLInfer(model_id=os.getenv('MODEL_ID'), api_key=os.getenv('API_KEY'))

# Markers delimiting the model's reasoning trace in thinking mode.
# NOTE(review): reconstructed — the original literals were stripped.
THINK_START = '<think>'
THINK_END = '</think>'

# UI strings keyed by component, then by interface language.
label_translations = {
    "gr_chatinterface_ofl": {
        "English": "Chatbot",
        "中文": "对话界面"
    },
    "gr_chatinterface_ol": {
        "English": "Chatbot",
        "中文": "对话界面"
    },
    "gr_tab_ol": {
        "English": "Online",
        "中文": "在线模式"
    },
    "gr_tab_ofl": {
        "English": "Offline",
        "中文": "离线模式"
    },
    "gr_thinking": {
        "English": ConversationModeI18N.D,
        "中文": ConversationModeCN.D,
    },
    "gr_temperature": {
        "English": "Temperature",
        "中文": "温度系数"
    },
    "gr_webcam_image": {
        "English": "🤳 Open Webcam",
        "中文": "🤳 打开摄像头"
    },
    "gr_webcam_images": {
        "English": "📹 Recorded Frames",
        "中文": "📹 录制的视频帧"
    },
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English": "Ask me anything. You can also drop in images and .mp4 videos.",
        "中文": "有什么想问的?支持上传图片和.mp4视频。"
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
        "中文": "有什么想问的?"
    },
    "gr_clear_button": {
        "English": "🧹 Clear History",
        "中文": "🧹 清除历史对话"
    }
}


def add_escape(text: str) -> str:
    """Escape angle brackets so user text is not rendered as HTML in the chat."""
    # '\\<' (backslash + '<'); the original '\<' emitted an invalid-escape warning.
    return text.replace('<', '\\<').replace('>', '\\>')


def remove_escape(text: str) -> str:
    """Inverse of :func:`add_escape`."""
    return text.replace('\\<', '<').replace('\\>', '>')


def plot_boxes_points_detections(image_path, message):
    """Parse boxes/points out of a model response and draw them on the image.

    Coordinates in ``message`` are normalized to a 0-1000 range; they are
    rescaled to the image size before drawing.

    Args:
        image_path: path of the image the answer refers to.
        message: final (non-thinking) model response text.

    Returns:
        Annotated image array, or ``None`` when no box/point was found.
    """
    # 1) JSON-style open-vocabulary detection output:
    #    [{"category": ..., "bbox": "x1 y1 x2 y2"}, ...]
    detection_pattern = r'\[\s*{.*?}\s*\]'
    detection_matches = re.finditer(detection_pattern, message, flags=re.DOTALL)
    bboxes, categories = [], []
    for match in detection_matches:
        detections = json.loads(match.group(0))
        for detection in detections:
            cat, bbox_str = detection['category'], detection['bbox']
            # NOTE(review): tag literals reconstructed (originals stripped).
            bbox_str = bbox_str.replace('<box>', '').replace('</box>', '')
            coords = [float(v) for v in bbox_str.split()]
            if len(coords) == 4:  # skip malformed entries instead of crashing
                bboxes.append(coords)
                categories.append(cat)
    # 2) Bare box grounding output: four whitespace-separated numbers.
    if not bboxes:
        box_pattern = (r'<box>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+'
                       r'(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</box>')
        bboxes = [[float(m.group(i)) for i in range(1, 5)]
                  for m in re.finditer(box_pattern, message)]
    # 3) Point grounding output: pairs of numbers.
    points = []
    if not bboxes:
        point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
        points = [[float(m.group(1)), float(m.group(2))]
                  for m in re.finditer(point_pattern, message)]
    if not bboxes and not points:
        return None
    bboxes = np.array(bboxes, dtype='float') / 1000
    points = np.array(points, dtype='float') / 1000
    image = cv2.imread(image_path)
    h, w, _ = image.shape
    if bboxes.size:
        bboxes[:, 0::2] *= w  # x1, x2
        bboxes[:, 1::2] *= h  # y1, y2
    if points.size:
        points[:, 0] *= w
        points[:, 1] *= h
    return draw_boxes_points_with_labels(image, bboxes, points, categories)


def general_chat(inputs: dict,
                 gr_history: list,
                 infer_history: list,
                 if_thinking: bool,
                 temperature: float,
                 online: bool = False):
    """Stream one chat turn through the model.

    Args:
        inputs: multimodal payload ``{'text': str, 'files': [paths]}``.
        gr_history: display-side history (required by ChatInterface; unused).
        infer_history: model-side conversation history (``gr.State``).
        if_thinking: request and render the model's reasoning trace.
        temperature: sampling temperature.
        online: True when called from the webcam tab.

    Yields:
        ``(response_message, infer_history)`` — a list of "messages"-format
        dicts plus the updated model history.
    """
    if 'text' in inputs:
        inputs['text'] = remove_escape(inputs['text'])
    mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
    for response_text, infer_history, finished in infer(inputs=inputs,
                                                        history=infer_history,
                                                        mode=mode,
                                                        temperature=temperature,
                                                        online=online):
        if if_thinking:
            # partition() tolerates streamed chunks in which the end-of-thinking
            # marker has not arrived yet; the original 2-tuple split() raised.
            reasoning_text, _, answer_text = response_text.partition(THINK_END)
            reasoning_text = reasoning_text.lstrip().removeprefix(THINK_START)
            response_message = [{
                "role": "assistant",
                "content": add_escape(reasoning_text),
                'metadata': {
                    'title': '🤔 Thinking'
                }
            }, {
                "role": "assistant",
                "content": add_escape(answer_text)
            }]
        else:
            response_message = [{
                "role": "assistant",
                "content": add_escape(response_text)
            }]
        if finished and len(inputs.get('files', [])) == 1 \
                and not inputs['files'][0].endswith('.mp4'):
            # Single still image: best-effort visualization of any grounding
            # output in the final answer. Never let this break the chat turn.
            image_path = inputs['files'][0]
            final_text = infer_history[-1]['content']
            try:
                if if_thinking:
                    final_text = final_text.partition(THINK_END)[2]
                output_image = plot_boxes_points_detections(image_path, final_text)
                if output_image is not None:
                    response_message.append({
                        "role": "assistant",
                        "content": gr.Image(output_image),
                    })
            except Exception as e:
                print(e)
        yield response_message, infer_history


def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
                       gr_counter: int, infer_history: list, if_thinking: bool,
                       temperature: float):
    """Chat over the webcam frames recorded since the previous turn.

    ``gr_counter`` remembers how many gallery frames earlier turns consumed,
    so only the new frames are sent to the model.
    """
    if not gr_webcam_images:
        gr_webcam_images = []
    gr_webcam_images = gr_webcam_images[gr_counter:]
    inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
    yield (f'received {len(gr_webcam_images)} new frames, processing...',
           gr_counter + len(gr_webcam_images), infer_history)
    for response_message, infer_history in general_chat(inputs, gr_history,
                                                        infer_history,
                                                        if_thinking,
                                                        temperature,
                                                        online=True):
        # gr.skip() leaves the frame counter untouched on streaming updates.
        yield response_message, gr.skip(), infer_history


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr_title = gr.Markdown('# Seed1.5-VL')
            with gr.Row():
                # NOTE(review): the badge HTML (anchors/images) was stripped
                # from this file; only the link text survives. Restore the
                # <a href=...> markup once the exact URLs are confirmed.
                gr.Markdown(
                    """
                    GitHub Seed1.5-VL Cookbook
                    """
                )
                gr.Markdown(
                    """
                    Paper Seed1.5-VL Paper
                    """,
                )
                # Empty spacers keep the badges packed to the left.
                gr.Markdown('')
                gr.Markdown('')
                gr.Markdown('')
        gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
                                       value="English",
                                       label="🌐 English Interface/中文界面",
                                       interactive=True,
                                       min_width=400,
                                       scale=0)

    with gr.Tabs():
        # ---------------- Offline tab: file-upload chat ----------------
        with gr.Tab("Offline") as gr_tab_ofl:
            gr_infer_history = gr.State([])
            # Hidden mirrors of the visible controls, passed as
            # additional_inputs so the streaming fn sees current values.
            gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
            gr_temperature_hidden = gr.Slider(minimum=0.0,
                                              maximum=2.0,
                                              step=0.1,
                                              value=0.0,
                                              interactive=True,
                                              visible=False)
            gr_chatinterface_ofl = gr.ChatInterface(
                fn=general_chat,
                type="messages",
                multimodal=True,
                chatbot=gr.Chatbot(height=600),
                textbox=gr.MultimodalTextbox(
                    file_count="multiple",
                    file_types=["image", ".mp4"],
                    sources=["upload"],
                    stop_btn=True,
                    placeholder=label_translations[
                        'gr_chatinterface_ofl.textbox.placeholder']['English'],
                ),
                additional_inputs=[
                    gr_infer_history, gr_thinking_hidden, gr_temperature_hidden
                ],
                additional_outputs=[gr_infer_history],
            )

            def add_escape_fn(inputs: dict):
                # Escape raw angle brackets before the message is displayed.
                if inputs and 'text' in inputs:
                    inputs['text'] = add_escape(inputs['text'])
                return inputs

            gr_chatinterface_ofl.textbox.submit(
                fn=add_escape_fn,
                inputs=[gr_chatinterface_ofl.saved_input],
                outputs=[gr_chatinterface_ofl.saved_input],
            )
            # Keep the model-side history in sync when the chat is cleared.
            gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                  fn=lambda: [],
                  outputs=[gr_infer_history])
            with gr.Row():
                gr_thinking_ofl = gr.Checkbox(
                    value=True,
                    label=label_translations['gr_thinking']['English'],
                )
                gr_thinking_ofl.change(lambda x: x,
                                       inputs=gr_thinking_ofl,
                                       outputs=gr_thinking_hidden)
                gr_temperature_ofl = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    value=0.0,
                    label=label_translations['gr_temperature']['English'],
                    interactive=True)
                gr_temperature_ofl.change(lambda x: x,
                                          inputs=gr_temperature_ofl,
                                          outputs=gr_temperature_hidden)
                gr_clear_button_ofl = gr.Button(
                    value=label_translations['gr_clear_button']['English'])

                def clear_history_ofl_fn():
                    # Reset conversation id, saved conversations, chatbot,
                    # chatbot state, and the model-side history.
                    return None, [], [], [], []

                gr_clear_button_ofl.click(
                    fn=clear_history_ofl_fn,
                    outputs=[
                        gr_chatinterface_ofl.conversation_id,
                        gr_chatinterface_ofl.saved_conversations,
                        gr_chatinterface_ofl.chatbot,
                        gr_chatinterface_ofl.chatbot_state,
                        gr_infer_history,
                    ],
                )
            with gr.Column(visible=True) as gr_examples_en:
                gr.Examples(
                    label=('7 Examples: text, image, video, multiple '
                           'images/videos, visual puzzle, points grounding, '
                           'open-vocabulary detection.'),
                    examples=[
                        {
                            "text": "Who are you?",
                            "files": []
                        },
                        {
                            "text": "Introduce this.",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text": """Find Curry's "Good Night" celebration time.""",
                            "files": ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text": "Share your feelings.",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "Look and answer.",
                            "files": ["examples/puzzle.jpg"]
                        },
                        {
                            # NOTE(review): point tags reconstructed; the
                            # original prompt's markers were stripped.
                            "text": "Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "x1 y1 x2 y2"}, {"category": category, "bbox": "x1 y1 x2 y2"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
            with gr.Column(visible=False) as gr_examples_cn:
                gr.Examples(
                    label='七个示例:文本,图像,视频,多个图像/视频,视觉解谜,坐标定位,开放式物体检测。',
                    examples=[
                        {
                            "text": "你是谁?",
                            "files": []
                        },
                        {
                            "text": "介绍一下。",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text": "找到库里的“晚安”庆祝时间段。",
                            "files": ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text": "你有什么感想?",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "看图回答。",
                            "files": ["examples/puzzle.jpg"]
                        },
                        {
                            # NOTE(review): point tags reconstructed.
                            "text": "请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表,就像:[{"category": 类别, "bbox": "x1 y1 x2 y2"}, {"category": 类别, "bbox": "x1 y1 x2 y2"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )

        # ---------------- Online tab: webcam streaming chat ----------------
        with gr.Tab("Online") as gr_tab_ol:
            with gr.Row():
                with gr.Column(scale=1):
                    gr_infer_history_ol = gr.State([])
                    # Renamed with an _ol suffix: the original reused the
                    # offline names, shadowing them after wiring (harmless but
                    # confusing).
                    gr_thinking_hidden_ol = gr.Checkbox(value=True,
                                                        visible=False)
                    gr_temperature_hidden_ol = gr.Slider(minimum=0.0,
                                                         maximum=2.0,
                                                         step=0.1,
                                                         value=1.0,
                                                         interactive=True,
                                                         visible=False)
                    with gr.Row():
                        with gr.Column(scale=1):
                            gr_webcam_image = gr.Image(
                                label=label_translations['gr_webcam_image']
                                ['English'],
                                sources="webcam",
                                height=250,
                                type='filepath')
                            gr_webcam_images = gr.Gallery(
                                label=label_translations['gr_webcam_images']
                                ['English'],
                                show_label=True,
                                format='webp',
                                columns=1,
                                height=250,
                                preview=True,
                                interactive=False)
                            # How many gallery frames have been consumed.
                            gr_counter = gr.Number(value=0, visible=False)
                        with gr.Column(scale=3):
                            gr_chatinterface_ol = gr.ChatInterface(
                                fn=online_record_chat,
                                type="messages",
                                multimodal=False,
                                chatbot=gr.Chatbot(height=600),
                                textbox=gr.Textbox(
                                    placeholder=label_translations[
                                        'gr_chatinterface_ol.textbox.placeholder']
                                    ['English'],
                                    submit_btn=True,
                                    stop_btn=True),
                                additional_inputs=[
                                    gr_webcam_images, gr_counter,
                                    gr_infer_history_ol,
                                    gr_thinking_hidden_ol,
                                    gr_temperature_hidden_ol
                                ],
                                additional_outputs=[
                                    gr_counter, gr_infer_history_ol
                                ],
                            )

                            def cache_webcam(recorded_image: str,
                                             recorded_images: list):
                                # The gallery is None before the first frame.
                                if not recorded_images:
                                    recorded_images = []
                                return recorded_images + [recorded_image]

                            gr_webcam_image.stream(
                                fn=cache_webcam,
                                inputs=[gr_webcam_image, gr_webcam_images],
                                outputs=[gr_webcam_images],
                                stream_every=1,
                                concurrency_limit=30,
                            )
                            with gr.Row():
                                gr_thinking_ol = gr.Checkbox(
                                    value=True,
                                    label=label_translations['gr_thinking']
                                    ['English'],
                                )
                                gr_thinking_ol.change(
                                    lambda x: x,
                                    inputs=gr_thinking_ol,
                                    outputs=gr_thinking_hidden_ol)
                                gr_temperature_ol = gr.Slider(
                                    minimum=0.0,
                                    maximum=2.0,
                                    step=0.1,
                                    value=1.0,
                                    label=label_translations['gr_temperature']
                                    ['English'],
                                    interactive=True)
                                gr_temperature_ol.change(
                                    lambda x: x,
                                    inputs=gr_temperature_ol,
                                    outputs=gr_temperature_hidden_ol)
                                gr_clear_button_ol = gr.Button(
                                    value=label_translations['gr_clear_button']
                                    ['English'])

                                def clear_history_ol_fn():
                                    return None, [], [], [], []

                                gr_clear_button_ol.click(
                                    fn=clear_history_ol_fn,
                                    outputs=[
                                        gr_chatinterface_ol.conversation_id,
                                        gr_chatinterface_ol.saved_conversations,
                                        gr_chatinterface_ol.chatbot,
                                        gr_chatinterface_ol.chatbot_state,
                                        gr_infer_history_ol,
                                    ],
                                )

    def update_lang(lang: str):
        """Relabel every language-dependent component in one shot.

        The tuple order must match the ``outputs`` list of the change handler
        below exactly.
        """
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(visible=lang == 'English'),
            gr.update(visible=lang != 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
        )

    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_thinking_ofl,
                                gr_thinking_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_examples_cn,
                                gr_webcam_image,
                                gr_webcam_images,
                                gr_clear_button_ofl,
                                gr_clear_button_ol,
                            ])

demo.queue(default_concurrency_limit=100,
           max_size=100).launch(share=True, max_threads=100, ssr_mode=False)