import re

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
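
# Load the Int4-quantized Qwen-VL-Chat model and its tokenizer; trust_remote_code is
# needed because Qwen-VL ships custom modeling and tokenizer code.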
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat-Int4", device_map="auto", trust_remote_code=True).eval()
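

# Run one captioning / grounding request: save the uploaded image, build the
# multimodal prompt, generate a response, and clean it up for display.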
def generate_predictions(image_input, text_input, with_grounding):
    # Persist the uploaded image so it can be referenced by file path in the prompt.
    # Gradio's Image component delivers a numpy array, which is converted to PIL here.
    user_image_path = "/tmp/user_input_test_image.jpg"
    original_image = Image.fromarray(image_input.astype("uint8"))
    original_image.save(user_image_path)

    if with_grounding == "Yes":
        text_input += " with grounding"

    # Build the multimodal prompt in Qwen-VL's list format (image path + text).
    query = tokenizer.from_list_format([
        {'image': user_image_path},
        {'text': text_input},
    ])
    inputs = tokenizer(query, return_tensors='pt')
    inputs = inputs.to(model.device)
    pred = model.generate(**inputs)
    full_response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

    # Strip the picture reference, special tokens, and the echoed prompt from the output.
    frontend_response = re.sub(r'Picture \d+:|<.*?>|/tmp/.*\.jpg', '', full_response).replace(text_input, '').strip()
print("Generated Caption:", frontend_response) # Debugging line

    # Draw any predicted bounding boxes; fall back to the original image when the
    # response contains no box coordinates.
    image_with_boxes = tokenizer.draw_bbox_on_latest_picture(full_response)
    if not re.search(r'\(\d+,\d+\),\(\d+,\d+\)', frontend_response):
        image_with_boxes = original_image
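
    # Save and reload the annotated image so a plain PIL image is returned to Gradio.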
    if image_with_boxes:
        temp_path = "/tmp/image_with_boxes.jpg"
        image_with_boxes.save(temp_path)
        image_with_boxes = Image.open(temp_path)

    return image_with_boxes, frontend_response
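

# Gradio UI: an image, a prompt, and a grounding toggle in; the (possibly
# annotated) image and the generated text out.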
iface = gr.Interface(
    fn=generate_predictions,
    inputs=[
        gr.Image(label="Image Input"),
        gr.Textbox(value="Generate a caption for that image:", label="Prompt"),
        gr.Radio(["No", "Yes"], label="With Grounding", value="No"),
    ],
    outputs=[
        gr.Image(type="pil", label="Image"),
        gr.Textbox(label="Generated"),
    ],
    title="Qwen-VL Demonstration",
    description="""
## Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud
**Space by [@Artificialguybr](https://twitter.com/artificialguybr)**
### Key Features:
- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks, including zero-shot captioning and VQA.
- **Multilingual Support**: Handles English, Chinese, and multilingual conversation.
- **High Resolution**: Uses 448x448 input resolution for fine-grained recognition and understanding.
""",
)

iface.launch()