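"""Gradio demo for image understanding with Qwen's QVQ-72B-Preview vision-language model."""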
import spaces
import gradio as gr
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Model and processor initialization
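# QVQ-72B-Preview is a 72B-parameter vision-language model; torch_dtype="auto" keeps
# the checkpoint's native dtype and device_map="auto" shards the weights across the
# available GPUs.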
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/QVQ-72B-Preview", 
    torch_dtype="auto", 
    device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")

# Footer
footer = """
<div style="text-align: center; margin-top: 20px;">
    <p>Powered by QVQ-72B Model</p>
</div>
"""

# Vision model function
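# @spaces.GPU() requests a GPU from Hugging Face Spaces (ZeroGPU) for the duration of each call.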
@spaces.GPU()
def process_image(image, text_input=None):
    try:
        # Convert to a PIL image if needed and normalize to RGB
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        image = image.convert("RGB")
        
        # Prepare messages
        if not text_input:
            text_input = "Please describe this image in detail."
            
        messages = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": text_input}
                ],
            }
        ]

        # Process inputs
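        # apply_chat_template renders the messages as the model's chat prompt;
        # process_vision_info collects the image inputs referenced in the messages.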
        text = processor.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")

        # Generate response
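        # QVQ reasons step by step, so responses can be long; the prompt tokens are
        # trimmed from each sequence before decoding so only the new text is returned.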
        generated_ids = model.generate(**inputs, max_new_tokens=8192)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, 
            skip_special_tokens=True, 
            clean_up_tokenization_spaces=False
        )[0]
        
        return output_text
    except Exception as e:
        return f"Error processing image: {str(e)}"

# CSS styling
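# Hide Gradio's built-in footer; the custom HTML footer defined above is rendered instead.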
css = """
footer {
    visibility: hidden;
}
"""

# Gradio interface
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
    with gr.Row():
        input_img = gr.Image(label="Input Image")
    with gr.Row():
        text_input = gr.Textbox(label="Question (Optional)")
    with gr.Row():
        submit_btn = gr.Button(value="Submit")
    with gr.Row():
        output_text = gr.Textbox(label="Response")

    submit_btn.click(process_image, [input_img, text_input], [output_text])
    
    gr.HTML(footer)

# Launch the app
demo.launch(debug=True)