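"""Gradio Space: extract blackjack game state as JSON with Qwen2-VL.

Loads the Qwen/Qwen2-VL-7B-Instruct base model, applies the
davidr99/qwen2.5-7b-instruct-blackjack adapter, and serves a single
image + question -> text demo on ZeroGPU hardware.
"""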
import spaces

import gradio as gr

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
MODEL_FINETUNE_ID = "davidr99/qwen2.5-7b-instruct-blackjack"

EXAMPLES = [
    "examples/black_jack_screenshot_1737088587.png",
    "examples/black_jack_screenshot_1737088629.png",
    "examples/black_jack_screenshot_1737088648.png",
    "examples/Screenshot 2024-12-06 220410.png",
]

# Load the base model once at startup, attach the blackjack adapter, and use
# the fine-tune's processor so the chat template matches training.
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto").to("cuda")
model.load_adapter(MODEL_FINETUNE_ID)
processor = AutoProcessor.from_pretrained(MODEL_FINETUNE_ID)

@spaces.GPU(duration=30)  # request a ZeroGPU slot for up to 30 s per call
def blackjack_ai(image, question):
    # Chat-style request: a fixed system prompt (the task the adapter was
    # fine-tuned for) plus the user's screenshot and question.
    messages = [
        {"role": "system", "content": [
            {"type": "text", "text": "You are a blackjack player.  Extract the image into json information."},
        ]},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": question},
        ]},
    ]

    # Preparation for inference: render the chat template, then tokenize the
    # prompt and preprocess the image together.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Generate, then trim the prompt tokens so only the new answer is decoded.
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # batch_decode returns one string per prompt; this app sends a single prompt.
    return output_text[0]
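
# Example (standalone, bypassing Gradio): blackjack_ai(EXAMPLES[0],
# "extract json from this image.") should return the table state as a JSON
# string, per the system prompt above.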

with gr.Blocks() as demo:

    image = gr.Image(type="filepath")
    question = gr.Textbox(value="extract json from this image.")
    submit = gr.Button("Submit")
    output = gr.TextArea()
    examples = gr.Examples(examples=EXAMPLES, inputs=[image])

    submit.click(blackjack_ai, inputs=[image, question], outputs=[output])
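
# If the Space sees concurrent traffic, calling demo.queue() before launch()
# serializes requests on the single GPU.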

demo.launch()