Spaces:
Sleeping
Sleeping
File size: 4,578 Bytes
b27b0a2 311e9b0 1c21f60 b27b0a2 311e9b0 f16f78e c3f7520 311e9b0 b27b0a2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import gradio as gr
from models.vsa_model import VisionSearchAssistant
from models.vsa_prompt import COCO_CLASSES
# Bundled demo samples: image path -> (question, comma-separated grounding classes).
# Samples without a specific class of interest fall back to the full COCO list.
_ALL_COCO_CLASSES = ", ".join(COCO_CLASSES)

SAMPLES = {
    "images/iclr.jpg": (
        "What prize did this paper win in 2024?",
        _ALL_COCO_CLASSES,
    ),
    "images/tesla.jpg": (
        "What's the income of this company?",
        "car",
    ),
    "images/xiaomi.jpg": (
        "Provide information about the new products of this brand.",
        _ALL_COCO_CLASSES,
    ),
    "images/leshi.jpg": (
        "Provide information about new products of this brand of potato chips in 2024.",
        _ALL_COCO_CLASSES,
    ),
}

# Parallel lists in the dict's insertion order, used to pre-populate the "Samples" tab.
SAMPLE_IMAGES = list(SAMPLES)
SAMPLE_TEXTS = [question for question, _ in SAMPLES.values()]
SAMPLE_CLASSES = [classes for _, classes in SAMPLES.values()]
def process_inputs(image, text, ground_classes):
    """Run the assistant on one request and stream partial results.

    This is a generator. Each time ``vsa.app_run`` reports a stage
    (``'ground'``, ``'query'``, ``'search'`` or ``'answer'``) the
    corresponding slot is updated and the full 4-tuple is yielded again, so
    the UI can fill its output widgets progressively. Slots whose stage has
    not been reported yet are ``None``. Unrecognised stage names are ignored.

    Args:
        image: Input image value as delivered by the image component.
        text: The user's text prompt.
        ground_classes: Comma-separated class names. ``None``, an empty
            string, or a string containing only whitespace/commas means
            "no explicit classes"; ``None`` is then passed to
            ``vsa.app_run``.

    Yields:
        tuple: ``(ground_output, query_output, search_output, answer_output)``.
        The query and search slots are newline-terminated text with one
        ``[Area i]`` / ``[Context i]`` line per item.
    """
    # Be tolerant about what the textbox delivers: the value may be None
    # (not only ""), and users do not reliably type ", " between names.
    # Split on bare commas and trim each name instead of splitting on ", ".
    class_names = [name.strip() for name in (ground_classes or "").split(",")]
    class_names = [name for name in class_names if name]
    ground_classes = class_names or None

    ground_output = query_output = search_output = answer_output = None
    for output, output_type in vsa.app_run(image, text, ground_classes=ground_classes):
        if output_type == 'ground':
            ground_output = output
        elif output_type == 'query':
            query_output = ''.join(
                f'[Area {qid}] {query}\n' for qid, query in enumerate(output)
            )
        elif output_type == 'search':
            search_output = ''.join(
                f'[Context {cid}] {context}\n' for cid, context in enumerate(output)
            )
        elif output_type == 'answer':
            answer_output = output
        else:
            # Unknown stage: nothing changed, so there is nothing to push.
            continue
        yield ground_output, query_output, search_output, answer_output
def select_sample_inputs(sample):
    """Look up the preview values for the sample chosen in the dropdown.

    Args:
        sample: A key of ``SAMPLES`` (an image path), the sentinel string
            ``'none'``, or ``None`` when nothing is selected.

    Returns:
        tuple: ``(image, text, classes)`` for the sample, where ``image`` is
        the sample key itself. ``(None, None, None)`` when there is no
        selection.

    Raises:
        KeyError: If ``sample`` is a value that is not a key of ``SAMPLES``.
    """
    # Treat "no selection" (None) like the explicit 'none' sentinel instead
    # of letting it fall through to a KeyError on the lookup below.
    if sample is None or sample == 'none':
        return None, None, None
    text, classes = SAMPLES[sample]
    return sample, text, classes
def confirm_sample_inputs(image, text, classes):
    """Pass the previewed sample through unchanged.

    Used as the click handler that copies the sample preview widgets into
    the input widgets of the "Run" tab.

    Returns:
        tuple: ``(image, text, classes)`` exactly as received.
    """
    selection = (image, text, classes)
    return selection
# Build the Blocks interface: a "Run" tab for inference and a "Samples" tab
# for previewing the bundled examples and copying one into the "Run" tab.
with gr.Blocks() as app:
    with gr.Tab("Run"):
        with gr.Row():
            # Left column: inputs, submit button and the final answer.
            with gr.Column():
                with gr.Row():
                    image_input = gr.Image(
                        label="Input Image",
                        height=300,
                        width=300,
                    )
                    ground_output = gr.Image(
                        label="Grounding Output",
                        height=300,
                        width=300,
                        interactive=False,
                    )
                prompt_input = gr.Textbox(
                    label="Input Text Prompt",
                    lines=1,
                    max_lines=1,
                )
                ground_class_input = gr.Textbox(
                    label="Ground Classes",
                    placeholder="Defaultly, the model will use COCO classes.",
                    lines=1,
                    max_lines=1,
                )
                submit_button = gr.Button("Submit")
                answer_output = gr.Textbox(
                    label="Answer Output",
                    lines=4,
                    max_lines=4,
                    interactive=False,
                )
            # Right column: intermediate query and search text.
            with gr.Column():
                query_output = gr.Textbox(
                    label="Query Output",
                    lines=14,
                    max_lines=14,
                    interactive=False,
                )
                search_output = gr.Textbox(
                    label="Search Output",
                    lines=14,
                    max_lines=14,
                    interactive=False,
                )

    with gr.Tab("Samples"):
        sample_input = gr.Dropdown(
            label="Select One Sample",
            choices=SAMPLE_IMAGES,
        )
        with gr.Row():
            # The preview widgets start out showing the first sample.
            sample_image = gr.Image(
                label="Sample Input Image",
                height=300,
                interactive=False,
                value=SAMPLE_IMAGES[0],
            )
            with gr.Column():
                sample_text = gr.Textbox(
                    label="Sample Input Text",
                    lines=4,
                    max_lines=4,
                    interactive=False,
                    value=SAMPLE_TEXTS[0],
                )
                sample_classes = gr.Textbox(
                    label="Sample Input Classes",
                    lines=4,
                    max_lines=4,
                    interactive=False,
                    value=SAMPLE_CLASSES[0],
                )
        sample_button = gr.Button("Select This Sample")

    # Processing action: process_inputs is a generator, so the four output
    # widgets are refreshed every time it yields.
    submit_button.click(
        fn=process_inputs,
        inputs=[image_input, prompt_input, ground_class_input],
        outputs=[ground_output, query_output, search_output, answer_output],
        show_progress=True,
    )

    # Changing the dropdown refreshes the sample preview widgets.
    sample_input.change(
        fn=select_sample_inputs,
        inputs=[sample_input],
        outputs=[sample_image, sample_text, sample_classes],
    )

    # Confirming copies the previewed sample into the "Run" tab inputs.
    sample_button.click(
        fn=confirm_sample_inputs,
        inputs=[sample_image, sample_text, sample_classes],
        outputs=[image_input, prompt_input, ground_class_input],
    )
# Construct the assistant once at module level; process_inputs reads this
# global `vsa` when a request is submitted. Both device arguments point at
# "cuda:0", and vlm_load_4bit is enabled (presumably 4-bit weight loading for
# the VLM -- confirm against VisionSearchAssistant).
vsa = VisionSearchAssistant(
    ground_device="cuda:0",
    vlm_device="cuda:0",
    vlm_load_4bit=True,
)

# Launch the app
app.launch()