import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from PIL import Image
import re

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat-Int4", device_map="auto", trust_remote_code=True).eval()
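# Note: the Int4 checkpoint above is GPTQ-quantized; loading it typically
# requires the optimum and auto-gptq packages in addition to transformers
# (an assumption based on the usual requirements for GPTQ checkpoints), and
# device_map="auto" places the weights on a GPU when one is available.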

def generate_predictions(image_input, text_input, with_grounding):
    # Save the uploaded image to disk so the tokenizer can reference it by path.
    # gr.Image delivers a uint8 array in the 0-255 range, so it can be wrapped
    # directly; the original inversion/rescaling corrupted the pixel values.
    user_image_path = "/tmp/user_input_test_image.jpg"
    original_image = Image.fromarray(image_input.astype('uint8'))
    original_image.save(user_image_path)
    
    if with_grounding == "Yes":
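        # Appending "with grounding" asks Qwen-VL to localize what it describes,
        # emitting markup such as <ref>the dog</ref><box>(120,45),(380,310)</box>
        # (illustrative values) that can be rendered as bounding boxes below.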
        text_input += " with grounding"
    
    # Qwen-VL's remote-code tokenizer provides from_list_format, which
    # interleaves image paths and text into the model's chat prompt format.
    query = tokenizer.from_list_format([
        {'image': user_image_path},
        {'text': text_input},
    ])
    inputs = tokenizer(query, return_tensors='pt')
    inputs = inputs.to(model.device)
    
    pred = model.generate(**inputs)
    # Keep special tokens in the decoded text: draw_bbox_on_latest_picture
    # parses the <ref>/<box> markup, which skip_special_tokens=True would strip.
    full_response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
    
    # Strip the echoed prompt scaffolding ("Picture 1:", <ref>/<box>-style tags,
    # and the temp file path) so only the model's answer reaches the frontend.
    frontend_response = re.sub(r'Picture \d+:|<.*?>|/tmp/\S+\.jpg', '', full_response).replace(text_input, '').strip()
    print("Generated Caption:", frontend_response)  # Debugging line
    
    # draw_bbox_on_latest_picture returns a PIL image with the predicted boxes
    # rendered, or None when the response contains no box coordinates; fall
    # back to the original image so the interface always has something to show.
    # (This replaces a redundant coordinate regex check and a pointless
    # save-and-reload round trip, and avoids returning None to Gradio.)
    image_with_boxes = tokenizer.draw_bbox_on_latest_picture(full_response)
    if image_with_boxes is None:
        image_with_boxes = original_image
    
    return image_with_boxes, frontend_response

# gr.Image/gr.Textbox/gr.Radio replace the deprecated gr.inputs/gr.outputs API,
# which was removed in recent Gradio releases.
iface = gr.Interface(
    fn=generate_predictions,
    inputs=[
        gr.Image(type="numpy", label="Image Input"),
        gr.Textbox(value="Generate a caption for that image:", label="Prompt"),
        gr.Radio(choices=["No", "Yes"], value="No", label="With Grounding"),
    ],
    outputs=[
        gr.Image(type="pil", label="Image"),
        gr.Textbox(label="Generated"),
    ],
    title="Qwen-VL Demonstration",
    description="""
## Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud
**Space by [@Artificialguybr](https://twitter.com/artificialguybr)**

### Key Features:
- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks, including zero-shot captioning and VQA.
- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.
- **High Resolution**: Uses 448×448 input resolution for fine-grained recognition and understanding.
""",
)
iface.launch()
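
# To try the demo locally (assuming gradio, transformers, optimum and
# auto-gptq are installed), run this script and open the local URL that
# iface.launch() prints to the console.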