Spaces:

ByteDance-Seed
/

Seed1.5-VL

Running

File size: 24,429 Bytes

# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
import os
import re
import cv2
import json
import time
import numpy as np
import gradio as gr
from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN
from visualizer import draw_boxes_points_with_labels

# Shared inference client used by general_chat(); the model id and API key
# are read from the MODEL_ID and API_KEY environment variables (None if unset).
infer = SeedVLInfer(
    model_id=os.environ.get('MODEL_ID'),
    api_key=os.environ.get('API_KEY'),
)

# UI strings, keyed first by component name and then by interface language
# ("English" / "中文"). Looked up when components are built and again by
# update_lang() when the language selector changes.
label_translations = {
    # Chatbot panels and tab titles.
    "gr_chatinterface_ofl": {"English": "Chatbot", "中文": "对话界面"},
    "gr_chatinterface_ol": {"English": "Chatbot", "中文": "对话界面"},
    "gr_tab_ol": {"English": "Online", "中文": "在线模式"},
    "gr_tab_ofl": {"English": "Offline", "中文": "离线模式"},
    # Controls shared by both tabs.
    "gr_thinking": {
        "English": ConversationModeI18N.D,
        "中文": ConversationModeCN.D,
    },
    "gr_temperature": {"English": "Temperature", "中文": "温度系数"},
    # Webcam widgets of the Online tab.
    "gr_webcam_image": {"English": "🤳 Open Webcam", "中文": "🤳 打开摄像头"},
    "gr_webcam_images": {"English": "📹 Recorded Frames", "中文": "📹 录制的视频帧"},
    # Textbox placeholders.
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English": "Ask me anything. You can also drop in images and .mp4 videos.",
        "中文": "有什么想问的？支持上传图片和.mp4视频。",
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
        "中文": "有什么想问的？",
    },
    # Buttons.
    "gr_clear_button": {"English": "🧹 Clear History", "中文": "🧹 清除历史对话"},
}

def add_escape(text: str) -> str:
    r"""Return *text* with every ``<`` and ``>`` prefixed by a backslash.

    ``"<point>"`` becomes ``"\<point\>"``. ``remove_escape`` undoes this.
    Presumably this keeps tags such as ``<bbox>`` visible in the chat widget
    instead of being interpreted as markup -- confirm against the Chatbot
    rendering.
    """
    # '\\<' is the explicit spelling of backslash + '<'. The former '\<' literal
    # produced the same characters but is an invalid escape sequence, which
    # newer Python versions warn about.
    return text.replace('<', '\\<').replace('>', '\\>')

def remove_escape(text: str) -> str:
    r"""Return *text* with every ``\<`` turned back into ``<`` and ``\>`` into ``>``.

    Reverses ``add_escape``. Unescaped brackets are left untouched.
    """
    # '\\<' is the explicit spelling of backslash + '<'. The former '\<' literal
    # produced the same characters but is an invalid escape sequence, which
    # newer Python versions warn about.
    return text.replace('\\<', '<').replace('\\>', '>')

def plot_boxes_points_detections(image_path, message):
    """Draw the boxes or points mentioned in a model reply onto an image.

    The reply is scanned, in order of preference, for:

    1. JSON lists of ``{"category": ..., "bbox": "<bbox>x1 y1 x2 y2</bbox>"}``
       dicts (these also provide the category labels),
    2. bare ``<bbox>x1 y1 x2 y2</bbox>`` tags,
    3. ``<point>x y</point>`` tags.

    A later form is only tried when the earlier ones produced no boxes.
    Coordinates are divided by 1000 and multiplied by the image width (x) or
    height (y) before drawing.

    Args:
        image_path: Path of the image, loaded with ``cv2.imread``.
        message: Reply text to scan.

    Returns:
        Whatever ``draw_boxes_points_with_labels`` returns, or ``None`` when
        the reply contains no usable boxes or points.

    Raises:
        OSError: If boxes/points were found but the image cannot be read.
    """
    detection_pattern = r'\[\s*{.*?}\s*\]'
    bboxes, categories = [], []
    for match in re.finditer(detection_pattern, message, flags=re.DOTALL):
        # The pattern is only a heuristic for "JSON list of dicts". A candidate
        # that is not valid detection JSON is skipped as a whole, so that it
        # neither aborts the function nor contributes half of its entries.
        try:
            parsed = []
            for detection in json.loads(match.group(0)):
                cat, bbox_str = detection['category'], detection['bbox']
                # '</bbox' (no '>') covers a truncated closing tag.
                bbox_str = bbox_str.replace('<bbox>', '').replace('</bbox>', '').replace('</bbox', '')
                # split() tolerates repeated or surrounding whitespace.
                bbox = [float(value) for value in bbox_str.split()]
                if len(bbox) != 4:
                    raise ValueError(f'expected 4 bbox values, got {len(bbox)}')
                parsed.append((cat, bbox))
        except (ValueError, KeyError, TypeError, AttributeError):
            continue
        for cat, bbox in parsed:
            bboxes.append(bbox)
            categories.append(cat)

    if not bboxes:
        # Fallback: bare bbox tags; these carry no category labels.
        box_pattern = r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>'
        bboxes = [
            [float(match.group(1)), float(match.group(2)),
             float(match.group(3)), float(match.group(4))]
            for match in re.finditer(box_pattern, message)
        ]

    points = []
    if not bboxes:
        point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
        points = [
            [float(match.group(1)), float(match.group(2))]
            for match in re.finditer(point_pattern, message)
        ]

    if not bboxes and not points:
        return None

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable file by returning None; fail with a
        # message that names the file instead of an AttributeError on .shape.
        raise OSError(f'could not read image: {image_path!r}')
    h, w = image.shape[:2]

    # Scale from the 0-1000 coordinate range to pixels.
    bboxes = np.array(bboxes, dtype='float') / 1000
    points = np.array(points, dtype='float') / 1000
    if bboxes.size:
        bboxes[:, 0::2] *= w
        bboxes[:, 1::2] *= h
    if points.size:
        points[:, 0] *= w
        points[:, 1] *= h
    return draw_boxes_points_with_labels(image, bboxes, points, categories)

def general_chat(inputs: dict, gr_history: list, infer_history: list,
                 if_thinking: bool, temperature: float, online: bool = False):
    """Run one chat turn and stream the assistant messages for the UI.

    Args:
        inputs: Message dict. ``'text'`` (if present) is un-escaped in place
            before inference; ``'files'`` (if present) lists attached paths.
        gr_history: Chat history supplied by Gradio; not used here.
        infer_history: History passed to ``infer`` and replaced by the history
            it yields back.
        if_thinking: Selects ``ConversationModeI18N.D`` (otherwise ``.G``) and
            makes each reply get split at ``</think>`` into a "Thinking"
            message and an answer message.
        temperature: Forwarded to ``infer``.
        online: Forwarded to ``infer``.

    Yields:
        ``(response_message, infer_history)`` for every item produced by
        ``infer``. ``response_message`` is a list of assistant message dicts.
        On the finished item of a turn with exactly one non-``.mp4``
        attachment it may additionally contain a ``gr.Image`` with the
        boxes/points from the reply drawn on that attachment.
    """
    think_open, think_close = '<think>', '</think>'

    if 'text' in inputs:
        inputs['text'] = remove_escape(inputs['text'])
    mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
    stream = infer(inputs=inputs,
                   history=infer_history,
                   mode=mode,
                   temperature=temperature,
                   online=online)
    for response_text, infer_history, finished in stream:
        if if_thinking:
            # partition() never raises: with no closing tag everything so far
            # counts as reasoning, and a repeated tag splits at the first one.
            # The previous two-value unpacking of split() failed in both cases.
            reasoning_text, _, response_text = response_text.partition(think_close)
            # Remove only the literal opening tag. str.lstrip('<think>') strips
            # any leading run of the characters < t h i n k >, which also ate
            # the first letters of reasoning such as "this ..." or "hi".
            if reasoning_text.startswith(think_open):
                reasoning_text = reasoning_text[len(think_open):]
            response_message = [{
                "role": "assistant",
                "content": add_escape(reasoning_text),
                'metadata': {
                    'title': '🤔 Thinking'
                }
            }, {
                "role": "assistant",
                "content": add_escape(response_text)
            }]
        else:
            response_message = [{
                "role": "assistant",
                "content": add_escape(response_text)
            }]

        files = inputs.get('files') or []
        if finished and len(files) == 1 and not files[0].endswith('.mp4'):
            # Drawing detections is a best-effort extra: any failure is
            # reported on stdout and the text reply is still delivered.
            try:
                final_text = infer_history[-1]['content']
                if if_thinking:
                    # Only the part after the reasoning is searched; if there
                    # is no closing tag the whole text is used.
                    _, separator, answer = final_text.partition(think_close)
                    if separator:
                        final_text = answer
                output_image = plot_boxes_points_detections(files[0], final_text)
                if output_image is not None:
                    response_message.append({
                        "role": "assistant",
                        "content": gr.Image(output_image),
                    })
            except Exception as e:
                print(e)
        yield response_message, infer_history

def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
                       gr_counter: int, infer_history: list, if_thinking: bool,
                       temperature: float):
    """Chat about the webcam frames recorded since the previous turn.

    Frames before index ``gr_counter`` are skipped. The generator first
    yields a status line together with the counter advanced past the new
    frames, then relays every update of ``general_chat`` (called with
    ``online=True``) with ``gr.skip()`` in the counter position.

    Yields:
        ``(message, counter_or_skip, infer_history)`` tuples.
    """
    new_frames = (gr_webcam_images or [])[gr_counter:]
    inputs = {
        'text': text,
        'files': [path for path, _caption in new_frames],
    }

    status = f'received {len(new_frames)} new frames, processing...'
    yield status, gr_counter + len(new_frames), infer_history

    chat_stream = general_chat(inputs, gr_history, infer_history, if_thinking,
                               temperature, online=True)
    for response_message, infer_history in chat_stream:
        yield response_message, gr.skip(), infer_history

# Build the whole UI: a header (title, two links, language selector) and two
# tabs, "Offline" (upload images/videos) and "Online" (webcam frames).
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr_title = gr.Markdown('# Seed1.5-VL')
            with gr.Row():
                # Link to the Seed1.5-VL cookbook repository on GitHub.
                gr.Markdown(
                    """
                    <div style="display:flex; flex-direction:column; gap:10px;">
                    <a
                        href="https://github.com/ByteDance-Seed/Seed1.5-VL"
                        target="_blank"
                        style="
                        display: inline-flex;
                        align-items: center;
                        gap: 8px;
                        white-space: nowrap;
                        text-decoration: none;
                        "
                    >
                        <img
                        src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/github/github-original.svg"
                        alt="GitHub"
                        width="24"
                        >
                        Seed1.5-VL Cookbook
                    </a>
                    </div>
                    """
                )
                # Link to the paper page on Hugging Face.
                gr.Markdown(
                    """
                    <div style="display:flex; flex-direction:column; gap:10px;">
                    <a
                        href="https://huggingface.co/papers/2505.07062"
                        target="_blank"
                        style="
                        display: inline-flex;
                        align-items: center;
                        gap: 8px;
                        white-space: nowrap;
                        text-decoration: none;
                        "
                    >
                        <img
                        src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
                        alt="Paper"
                        width="24"
                        >
                        Seed1.5-VL Paper
                    </a>
                    </div>
                    """,
                )
                # Empty cells; presumably spacers that keep the two links
                # narrow within the row -- confirm visually.
                gr.Markdown('')
                gr.Markdown('')
                gr.Markdown('')

        # Interface language; its change event is wired to update_lang() at
        # the end of this block.
        gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
            value="English",
            label="🌐 English Interface/中文界面",
            interactive=True,
            min_width=400,
            scale=0)

    with gr.Tabs():
        with gr.Tab("Offline") as gr_tab_ofl:
            # History exchanged with general_chat: it is both an additional
            # input and the additional output of the ChatInterface below.
            gr_infer_history = gr.State([])
            # Hidden controls that are actually passed to the chat function.
            # The visible checkbox/slider further down copy their value into
            # these on every change.
            gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
            gr_temperature_hidden = gr.Slider(minimum=0.0,
                                              maximum=2.0,
                                              step=0.1,
                                              value=0.0,
                                              interactive=True,
                                              visible=False)
            gr_chatinterface_ofl = gr.ChatInterface(
                fn=general_chat,
                type="messages",
                multimodal=True,
                chatbot=gr.Chatbot(height=600),
                textbox=gr.MultimodalTextbox(
                    file_count="multiple",
                    file_types=["image", ".mp4"],
                    sources=["upload"],
                    stop_btn=True,
                    placeholder=label_translations[
                        'gr_chatinterface_ofl.textbox.placeholder']['English'],
                ),
                additional_inputs=[
                    gr_infer_history, gr_thinking_hidden, gr_temperature_hidden
                ],
                additional_outputs=[gr_infer_history],
            )
            def add_escape_fn(inputs: dict):
                """Escape angle brackets in the ``'text'`` of a message dict.

                Returns the same (mutated) dict; empty input is passed through.
                """
                if inputs and 'text' in inputs:
                    inputs['text'] = add_escape(inputs['text'])
                return inputs
            # On submit, rewrite the ChatInterface's saved input with its
            # text escaped (general_chat un-escapes it again before inference).
            gr_chatinterface_ofl.textbox.submit(
                fn=add_escape_fn,
                inputs=[gr_chatinterface_ofl.saved_input],
                outputs=[gr_chatinterface_ofl.saved_input]
            )
            # Clearing the chatbot also empties the inference history.
            gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                  fn=lambda: [],
                  outputs=[gr_infer_history])
            with gr.Row():
                gr_thinking_ofl = gr.Checkbox(
                    value=True,
                    label=label_translations['gr_thinking']['English'],
                )
                # Mirror the visible checkbox into the hidden one.
                gr_thinking_ofl.change(lambda x: x,
                                        inputs=gr_thinking_ofl,
                                        outputs=gr_thinking_hidden)
                gr_temperature_ofl = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    value=0.0,
                    label=label_translations['gr_temperature']['English'],
                    interactive=True)
                # Mirror the visible slider into the hidden one.
                gr_temperature_ofl.change(lambda x: x,
                                            inputs=gr_temperature_ofl,
                                            outputs=gr_temperature_hidden)
                gr_clear_button_ofl = gr.Button(value=label_translations['gr_clear_button']['English'])
                def clear_history_fn():
                    """Return reset values, one per component in the click ``outputs`` below."""
                    return None, [], [], [], []
                gr_clear_button_ofl.click(
                    fn=clear_history_fn, 
                    outputs=[
                        gr_chatinterface_ofl.conversation_id, 
                        gr_chatinterface_ofl.saved_conversations, 
                        gr_chatinterface_ofl.chatbot,
                        gr_chatinterface_ofl.chatbot_state, 
                        gr_infer_history
                    ]
                )
            # English examples, visible by default; update_lang() swaps the
            # visibility of this column and the Chinese one.
            with gr.Column(visible=True) as gr_examples_en:
                gr.Examples(
                    label='7 Examples: text, image, video, multiple images/videos, visual puzzle, points grounding, open-vocabulary detection.',
                    examples=[
                        {
                            "text": "Who are you?",
                            "files": []
                        },
                        {
                            "text": "Introduce this.",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text":
                            """Find Curry's "Good Night" celebration time.""",
                            "files":
                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text":
                            "Share your feelings.",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "Look and answer.",
                            "files": ["examples/puzzle.jpg"]
                        },
                        {
                            "text": "Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
            # Chinese counterparts of the examples above (same files), hidden
            # by default.
            with gr.Column(visible=False) as gr_examples_cn:
                gr.Examples(
                    label='七个示例：文本，图像，视频，多个图像/视频，视觉解谜，坐标定位，开放式物体检测。',
                    examples=[
                        {
                            "text": "你是谁？",
                            "files": []
                        },
                        {
                            "text": "介绍一下。",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text":
                            "找到库里的“晚安”庆祝时间段。",
                            "files":
                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text":
                            "你有什么感想？",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "看图回答。",
                            "files": ["examples/puzzle.jpg"]
                        },
                        {
                            "text": "请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表，就像：[{"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
        with gr.Tab("Online") as gr_tab_ol:
            with gr.Row():
                with gr.Column(scale=1):
                    gr_infer_history_ol = gr.State([])
                    # NOTE: these two assignments rebind names already used in
                    # the Offline tab. The Offline components were handed to
                    # their ChatInterface and change handlers before this
                    # point, so only the code below sees the new objects.
                    gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
                    # The Online temperature defaults to 1.0 (Offline: 0.0).
                    gr_temperature_hidden = gr.Slider(minimum=0.0,
                                                      maximum=2.0,
                                                      step=0.1,
                                                      value=1.0,
                                                      interactive=True,
                                                      visible=False)
                    with gr.Row():
                        with gr.Column(scale=1):
                            gr_webcam_image = gr.Image(
                                label=label_translations['gr_webcam_image']
                                ['English'],
                                sources="webcam",
                                height=250,
                                type='filepath')
                            # Gallery that accumulates the streamed frames.
                            gr_webcam_images = gr.Gallery(
                                label=label_translations['gr_webcam_images']
                                ['English'],
                                show_label=True,
                                format='webp',
                                columns=1,
                                height=250,
                                preview=True,
                                interactive=False)
                            # Hidden index into the gallery: online_record_chat
                            # skips frames before it and returns it advanced
                            # past the frames it just consumed.
                            gr_counter = gr.Number(value=0, visible=False)
                        with gr.Column(scale=3):
                            gr_chatinterface_ol = gr.ChatInterface(
                                fn=online_record_chat,
                                type="messages",
                                multimodal=False,
                                chatbot=gr.Chatbot(height=600),
                                textbox=gr.
                                Textbox(placeholder=label_translations[
                                    'gr_chatinterface_ol.textbox.placeholder']
                                        ['English'],
                                        submit_btn=True,
                                        stop_btn=True),
                                additional_inputs=[
                                    gr_webcam_images, gr_counter,
                                    gr_infer_history_ol, gr_thinking_hidden,
                                    gr_temperature_hidden
                                ],
                                additional_outputs=[
                                    gr_counter, gr_infer_history_ol
                                ],
                            )

                            def cache_webcam(recorded_image: str,
                                             recorded_images: list):
                                """Return the recorded frames with the newest frame appended.

                                An empty or missing list is treated as ``[]``.
                                """
                                if not recorded_images:
                                    recorded_images = []
                                return recorded_images + [recorded_image]

                            # Feed each streamed webcam frame through
                            # cache_webcam into the gallery.
                            gr_webcam_image.stream(
                                fn=cache_webcam,
                                inputs=[gr_webcam_image, gr_webcam_images],
                                outputs=[gr_webcam_images],
                                stream_every=1,
                                concurrency_limit=30,
                            )
                            with gr.Row():
                                gr_thinking_ol = gr.Checkbox(
                                    value=True,
                                    label=label_translations['gr_thinking']
                                    ['English'],
                                )
                                # Mirror the visible checkbox into the hidden one.
                                gr_thinking_ol.change(
                                    lambda x: x,
                                    inputs=gr_thinking_ol,
                                    outputs=gr_thinking_hidden)
                                gr_temperature_ol = gr.Slider(
                                    minimum=0.0,
                                    maximum=2.0,
                                    step=0.1,
                                    value=1.0,
                                    label=label_translations['gr_temperature']
                                    ['English'],
                                    interactive=True)
                                # Mirror the visible slider into the hidden one.
                                gr_temperature_ol.change(
                                    lambda x: x,
                                    inputs=gr_temperature_ol,
                                    outputs=gr_temperature_hidden)
                                gr_clear_button_ol = gr.Button(value=label_translations['gr_clear_button']['English'])
                                def clear_history_fn():
                                    """Return reset values, one per component in the click ``outputs`` below."""
                                    return None, [], [], [], []
                                gr_clear_button_ol.click(
                                    fn=clear_history_fn, 
                                    outputs=[
                                        gr_chatinterface_ol.conversation_id, 
                                        gr_chatinterface_ol.saved_conversations, 
                                        gr_chatinterface_ol.chatbot,
                                        gr_chatinterface_ol.chatbot_state, 
                                        gr_infer_history_ol
                                    ]
                                )

    def update_lang(lang: str):
        """Return one ``gr.update`` per translated component for *lang*.

        *lang* must be a key of the ``label_translations`` entries
        ("English" or "中文"). The tuple order has to match the ``outputs``
        list given to ``gr_lang_selector.change`` below, element by element.
        """
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            # Show the example set that matches the language.
            gr.update(visible=lang == 'English'),
            gr.update(visible=lang != 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
        )

    # Re-label every translated component when the language changes; keep
    # this list in the same order as the tuple returned by update_lang().
    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_thinking_ofl,
                                gr_thinking_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_examples_cn,
                                gr_webcam_image,
                                gr_webcam_images,
                                gr_clear_button_ofl,
                                gr_clear_button_ol,
                            ])
# Enable request queuing on the app, then launch it.
demo.queue(
    default_concurrency_limit=100,
    max_size=100,
).launch(
    share=True,
    max_threads=100,
    ssr_mode=False,
)