|
import os
import tempfile
from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
# Qwen-VL ships custom tokenizer/model code on the Hugging Face Hub,
# hence trust_remote_code=True on both loads.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)

# Fall back to CPU when no GPU is available so the demo still starts
# (slowly) on CPU-only hosts; identical to the old behavior on CUDA machines.
_device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map=_device, trust_remote_code=True).eval()
|
|
|
def generate_predictions(image_input, text_input):
    """Run Qwen-VL on an uploaded image plus a text prompt.

    Args:
        image_input: PIL image from the Gradio image widget
            (assumed PIL, since the original code calls .save() — TODO confirm
            the widget is configured with type="pil").
        text_input: user prompt string.

    Returns:
        Tuple of (image, response_text): the image annotated with any
        bounding boxes the model emitted (or the original image if none),
        and the raw decoded model output including special tokens.
    """
    # Qwen-VL's prompt format references images by file path, so the upload
    # must be persisted to disk first. Use the platform temp dir rather than
    # a hard-coded /tmp so this also works on Windows.
    user_image_path = os.path.join(tempfile.gettempdir(), "user_input_test_image.jpg")
    # JPEG cannot store an alpha channel; convert defensively so RGBA/PNG
    # uploads do not make .save() raise.
    image_input.convert("RGB").save(user_image_path)

    # Build the interleaved image+text prompt in Qwen-VL's expected format.
    query = tokenizer.from_list_format([
        {'image': user_image_path},
        {'text': text_input},
    ])
    inputs = tokenizer(query, return_tensors='pt').to(model.device)

    # Inference only — disable autograd to avoid building a useless graph.
    with torch.no_grad():
        pred = model.generate(**inputs)

    # Keep special tokens: the <box>...</box> markers are what
    # draw_bbox_on_latest_picture parses to locate the boxes.
    response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

    # draw_bbox_on_latest_picture returns None when the response contains no
    # box annotations; fall back to the original image so the Gradio image
    # output always receives a valid picture.
    image_with_boxes = tokenizer.draw_bbox_on_latest_picture(response)
    if image_with_boxes is None:
        image_with_boxes = Image.open(user_image_path)

    return image_with_boxes, response
|
|
|
|
|
# Wire the predictor into a simple web UI: an image and a text prompt in,
# the (possibly box-annotated) image and the raw model response out.
demo_inputs = ["image", "text"]
demo_outputs = ["image", "text"]

iface = gr.Interface(fn=generate_predictions, inputs=demo_inputs, outputs=demo_outputs)

iface.launch()
|
|