File size: 9,216 Bytes
67ae540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c6ecb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5be3d23
67ae540
 
2ba0a0c
 
67ae540
5be3d23
4c6ecb5
 
67ae540
 
 
 
 
 
6a423bd
67ae540
 
 
5be3d23
4c6ecb5
67ae540
 
 
 
 
 
 
4c6ecb5
67ae540
4c6ecb5
67ae540
4c6ecb5
5be3d23
4c6ecb5
5be3d23
 
 
 
67ae540
5be3d23
 
 
 
67ae540
4c6ecb5
67ae540
5be3d23
67ae540
 
5be3d23
67ae540
 
5be3d23
4c6ecb5
 
 
 
2ba0a0c
4c6ecb5
 
 
 
 
67ae540
4c6ecb5
2ba0a0c
67ae540
4c6ecb5
67ae540
4c6ecb5
67ae540
 
4c6ecb5
67ae540
 
4c6ecb5
efd5003
 
 
67ae540
4c6ecb5
 
67ae540
efd5003
 
4c6ecb5
 
 
 
 
 
67ae540
 
 
 
efd5003
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# import gradio as gr
# from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
# from threading import Thread
# from qwen_vl_utils import process_vision_info
# import torch
# import time

# # Check if a GPU is available
# device = "cuda" if torch.cuda.is_available() else "cpu"

# local_path = "Fancy-MLLM/R1-OneVision-7B"

# # Load the model on the appropriate device (GPU if available, otherwise CPU)
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     local_path, torch_dtype="auto", device_map=device
# )
# processor = AutoProcessor.from_pretrained(local_path)

# def generate_output(image, text, button_click):
#     # Prepare input data
#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "image", "image": image, 'min_pixels': 1003520, 'max_pixels': 12845056},
#                 {"type": "text", "text": text},
#             ],
#         }
#     ]
    
#     # Prepare inputs for the model
#     text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     image_inputs, video_inputs = process_vision_info(messages)
#     inputs = processor(
#         text=[text_input],
#         images=image_inputs,
#         videos=video_inputs,
#         padding=True,
#         return_tensors="pt",
#     )
    
#     # Move inputs to the same device as the model
#     inputs = inputs.to(model.device)

#     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
#     generation_kwargs = dict(
#         **inputs,
#         streamer=streamer,
#         max_new_tokens=4096,
#         top_p=0.001,
#         top_k=1,
#         temperature=0.01,
#         repetition_penalty=1.0,
#     )
    
#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
#     thread.start()
#     generated_text = ''
    
#     try:
#         for new_text in streamer:
#             generated_text += new_text
#             yield f"‎{generated_text}"
#     except Exception as e:
#         print(f"Error: {e}")
#         yield f"Error occurred: {str(e)}"

# Css = """
# #output-markdown {
#     overflow-y: auto;
#     white-space: pre-wrap; 
#     word-wrap: break-word;
# }
# #output-markdown .math {
#     overflow-x: auto;
#     max-width: 100%;
# }
# .markdown-text {
#     white-space: pre-wrap;
#     word-wrap: break-word;
# }
# .markdown-output {
#     min-height: 20vh;
#     max-width: 100%;
#     overflow-y: auto;
# }
# #qwen-md .katex-display { display: inline; }
# #qwen-md .katex-display>.katex { display: inline; }
# #qwen-md .katex-display>.katex>.katex-html { display: inline; }
# """

# with gr.Blocks(css=Css) as demo:
#     gr.HTML("""<center><font size=8>🦖 R1-OneVision Demo</center>""")

#     with gr.Row():
#         with gr.Column():
#             input_image = gr.Image(type="pil", label="Upload")  # switched back to PIL handling
#             input_text = gr.Textbox(label="Input your question")
#             with gr.Row():
#                 clear_btn = gr.ClearButton([input_image, input_text])
#                 submit_btn = gr.Button("Submit", variant="primary")

#         with gr.Column():
#             output_text = gr.Markdown(elem_id="qwen-md", container=True, elem_classes="markdown-output")

#     submit_btn.click(fn=generate_output, inputs=[input_image, input_text], outputs=output_text)

# demo.launch(share=False)


# import gradio as gr
# from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
# from transformers.image_utils import load_image
# from threading import Thread
# import time
# import torch
# import spaces

# MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
# processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     trust_remote_code=True,
#     torch_dtype=torch.bfloat16
# ).to("cuda").eval()

# @spaces.GPU(duration=200)
# def model_inference(input_dict, history):
#     text = input_dict["text"]
#     files = input_dict["files"]

#     # Load images if provided
#     if len(files) > 1:
#         images = [load_image(image) for image in files]
#     elif len(files) == 1:
#         images = [load_image(files[0])]
#     else:
#         images = []

#     # Validate input
#     if text == "" and not images:
#         gr.Error("Please input a query and optionally image(s).")
#         return
#     if text == "" and images:
#         gr.Error("Please input a text query along with the image(s).")
#         return

#     # Prepare messages for the model
#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 *[{"type": "image", "image": image} for image in images],
#                 {"type": "text", "text": text},
#             ],
#         }
#     ]

#     # Apply chat template and process inputs
#     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     inputs = processor(
#         text=[prompt],
#         images=images if images else None,
#         return_tensors="pt",
#         padding=True,
#     ).to("cuda")

#     # Set up streamer for real-time output
#     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
#     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)

#     # Start generation in a separate thread
#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
#     thread.start()

#     # Stream the output
#     buffer = ""
#     yield "Thinking..."
#     for new_text in streamer:
#         buffer += new_text
#         time.sleep(0.01)
#         yield buffer

# examples = [
#     [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
# ]

# demo = gr.ChatInterface(
#     fn=model_inference,
#     description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
#     examples=examples,
#     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
#     stop_btn="Stop Generation",
#     multimodal=True,
#     cache_examples=False,
# )

# demo.launch(debug=True)

import gradio as gr
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
from transformers.image_utils import load_image
from threading import Thread
import time
import torch

# Load the processor and the model once at import time; every request reuses
# these module-level objects.
MODEL_ID = "Fancy-MLLM/R1-OneVision-7B"
# Place the model on CUDA when a device is available, otherwise on the CPU,
# so importing this module does not fail on hosts without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
).to(DEVICE).eval()

def model_inference(input_dict, history):
    """Stream the model's answer to a multimodal chat message.

    Args:
        input_dict: Message mapping with a "text" entry (the query string)
            and a "files" entry (a list of image references, each passed to
            ``load_image``).
        history: Earlier conversation turns. Accepted so the function fits
            the chat-callback signature; it is not used.

    Yields:
        A placeholder status string first, then the accumulated response
        text after each newly decoded chunk.

    Raises:
        gr.Error: If the text query is empty (with or without images).
    """
    text = input_dict["text"]
    files = input_dict["files"]

    # Validate before decoding any image. gr.Error has to be *raised*:
    # returning it from a generator just ends the generator silently and the
    # user never sees the message.
    if not text:
        if files:
            raise gr.Error("Please input a text query along with the image(s).")
        raise gr.Error("Please input a query and optionally image(s).")

    # Load the images, if any were provided (empty list otherwise).
    images = [load_image(image) for image in files]

    # Build the single-turn chat message: all images first, then the text.
    messages = [
        {
            "role": "user",
            "content": [
                *[{"type": "image", "image": image} for image in images],
                {"type": "text", "text": text},
            ],
        }
    ]

    # Render the chat template and tensorize the inputs, then move them to
    # whichever device the model actually lives on.
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt],
        images=images if images else None,
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    # Cap the number of generated tokens to bound inference time; adjust as
    # needed.
    max_new_tokens = 1024

    # The streamer hands decoded text chunks from the generation thread to
    # this generator.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

    # Run generation in a background thread so this generator can yield
    # partial output while the model is still producing tokens.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Update the UI as text arrives.
    buffer = ""
    yield "Processing your request, please wait..."
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)  # brief pause so the UI can refresh smoothly
        yield buffer

    # The streamer is exhausted once generation has ended; reap the worker.
    thread.join()

# Example input: one sample that pairs a text prompt with a single image file.
examples = [
    [
        dict(
            text="Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?",
            files=["5.jpg"],
        )
    ]
]

# Build the Gradio UI.
#
# model_inference takes (message, history) and reads "text"/"files" from the
# message, which is the callback shape of a multimodal gr.ChatInterface, so a
# ChatInterface with a MultimodalTextbox is used here. The previous
# gr.Interface call referenced gr.Chatbox (no such component) and passed
# Interface-only options (live, layout, allow_flagging) that do not apply to
# a chat UI; those are dropped.
demo = gr.ChatInterface(
    fn=model_inference,
    title="Multimodal Inference with Fancy-MLLM",
    description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
    examples=examples,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
    multimodal=True,
    cache_examples=False,
)

demo.launch(debug=True)