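# Gradio demo for Fancy-MLLM/R1-OneVision-7B, a Qwen2.5-VL-based multimodal model:
# upload an image, type a question, and the model generates an answer.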
import gradio as gr
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Model path: the Hugging Face Hub repo id
local_path = "Fancy-MLLM/R1-OneVision-7B"
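# (from_pretrained also accepts a local directory containing downloaded weights)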

# Load the model and processor (device_map="cpu" pins everything to CPU; use "auto" to place weights on available GPUs)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    local_path, torch_dtype="auto", device_map="cpu"
)
processor = AutoProcessor.from_pretrained(local_path)
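# AutoProcessor bundles the tokenizer and image processor used to build multimodal inputs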

# Process the inputs and generate the model's response
def generate_output(image, text):
    if image is None:
        return "Error: No image uploaded!"
    
    # Build the chat messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image, 'min_pixels': 1003520, 'max_pixels': 12845056},
                {"type": "text", "text": text},
            ],
        }
    ]

    # Render the chat template and extract the vision inputs referenced in the messages
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)  # move tensors to the model's device (CPU or GPU)

    # Run generation synchronously to avoid threading issues; top_k=1 makes decoding effectively greedy
    output_tokens = model.generate(
        **inputs,
        max_new_tokens=4096,
        top_p=0.001,
        top_k=1,
        temperature=0.01,
        repetition_penalty=1.0,
    )
    
    # Strip the prompt tokens so only the newly generated text is decoded
    generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_tokens)]
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

# UI layout
with gr.Blocks() as demo:
    gr.HTML("""<center><font size=8>🦖 R1-OneVision Demo</center>""")

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Upload")  # pass the image to the model as a PIL object
            input_text = gr.Textbox(label="Input your question")
            with gr.Row():
                clear_btn = gr.ClearButton([input_image, input_text])
                submit_btn = gr.Button("Submit", variant="primary")

        with gr.Column():
            output_text = gr.Markdown(elem_id="qwen-md", container=True)

    # Wire the submit button to the generation function (synchronous, so no queue is needed)
    submit_btn.click(fn=generate_output, inputs=[input_image, input_text], outputs=output_text)

demo.launch(share=True)  # share=True also serves a temporary public gradio.live link