import gradio as gr
import supervision as sv
import torch
import spaces

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference, CHECKPOINTS
from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
    CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
    MORE_DETAILED_CAPTION_TASK_NAME

MARKDOWN = """
# Better Florence-2 Playground 🔥
Colab | Roboflow | arXiv | YouTube
""" OBJECT_DETECTION_EXAMPLES = [ ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"] ] CAPTION_EXAMPLES = [ ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"], ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"], ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"] ] DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") MODELS, PROCESSORS = load_models(DEVICE) @spaces.GPU def process(checkpoint_dropdown, task_dropdown, image_input): model = MODELS[checkpoint_dropdown] processor = PROCESSORS[checkpoint_dropdown] task = TASKS[task_dropdown] if task_dropdown == OBJECT_DETECTION_TASK_NAME: _, response = run_inference( model, processor, DEVICE, image_input, task) detections = sv.Detections.from_lmm( lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size) return annotate_with_boxes(image_input, detections) elif task_dropdown in CAPTION_TASK_NAMES: _, response = run_inference( model, processor, DEVICE, image_input, task) return response[task] image_output_component = None text_output_component = None with gr.Blocks() as demo: gr.Markdown(MARKDOWN) with gr.Row(): checkpoint_dropdown_component = gr.Dropdown( choices=CHECKPOINTS, value=CHECKPOINTS[0], label="Model", info="Select a Florence 2 model to use.") task_dropdown_component = gr.Dropdown( choices=TASK_NAMES, value=TASK_NAMES[0], label="Task", info="Select a task to perform with the model.") with gr.Row(): with gr.Column(): image_input_component = gr.Image(type='pil', label='Image Input') submit_button_component = gr.Button(value='Submit', variant='primary') with gr.Column(): @gr.render(inputs=task_dropdown_component) def show_output(text): if text == OBJECT_DETECTION_TASK_NAME: global image_output_component image_output_component = gr.Image(type='pil', label='Image Output') submit_button_component.click( fn=process, inputs=[ checkpoint_dropdown_component, task_dropdown_component, image_input_component ], outputs=image_output_component ) elif text in CAPTION_TASK_NAMES: global text_output_component text_output_component = gr.Textbox(label='Caption Output') submit_button_component.click( fn=process, inputs=[ checkpoint_dropdown_component, task_dropdown_component, image_input_component ], outputs=text_output_component ) @gr.render(inputs=task_dropdown_component) def show_examples(text): if text == OBJECT_DETECTION_TASK_NAME: global image_output_component gr.Examples( fn=process, examples=OBJECT_DETECTION_EXAMPLES, inputs=[ checkpoint_dropdown_component, task_dropdown_component, image_input_component ], outputs=image_output_component ) elif text in CAPTION_TASK_NAMES: global text_output_component gr.Examples( fn=process, examples=CAPTION_EXAMPLES, inputs=[ checkpoint_dropdown_component, task_dropdown_component, image_input_component ], outputs=text_output_component ) demo.launch(debug=False, show_error=True, max_threads=1)