Spaces:

SkalskiP
/

better-florence-2

Running on Zero

App Files Files Community

SkalskiP committed on Jul 4, 2024

Commit

3b99a8a

1 Parent(s): c3f2745

OCR tasks added

Browse files

Files changed (2) hide show

app.py +47 -9
utils/tasks.py +8 -2

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from utils.annotate import annotate_with_boxes
 from utils.models import load_models, run_inference, CHECKPOINTS
 from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
     CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
-    MORE_DETAILED_CAPTION_TASK_NAME
 MARKDOWN = """
 # Better Florence-2 Playground 🔥
@@ -25,6 +25,15 @@ MARKDOWN = """
         <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
     </a>
 </div>
 """
 OBJECT_DETECTION_EXAMPLES = [
@@ -35,6 +44,13 @@ CAPTION_EXAMPLES = [
     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"],
     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
 ]
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 MODELS, PROCESSORS = load_models(DEVICE)
@@ -45,13 +61,13 @@ def process(checkpoint_dropdown, task_dropdown, image_input):
     model = MODELS[checkpoint_dropdown]
     processor = PROCESSORS[checkpoint_dropdown]
     task = TASKS[task_dropdown]
-    if task_dropdown == OBJECT_DETECTION_TASK_NAME:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
         return annotate_with_boxes(image_input, detections)
-    elif task_dropdown in CAPTION_TASK_NAMES:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         return response[task]
@@ -81,8 +97,9 @@ with gr.Blocks() as demo:
         with gr.Column():
             @gr.render(inputs=task_dropdown_component)
             def show_output(text):
-                if text == OBJECT_DETECTION_TASK_NAME:
-                    global image_output_component
                     image_output_component = gr.Image(type='pil', label='Image Output')
                     submit_button_component.click(
                         fn=process,
@@ -93,8 +110,7 @@ with gr.Blocks() as demo:
                         ],
                         outputs=image_output_component
                     )
-                elif text in CAPTION_TASK_NAMES:
-                    global text_output_component
                     text_output_component = gr.Textbox(label='Caption Output')
                     submit_button_component.click(
                         fn=process,
@@ -108,8 +124,9 @@ with gr.Blocks() as demo:
     @gr.render(inputs=task_dropdown_component)
     def show_examples(text):
         if text == OBJECT_DETECTION_TASK_NAME:
-            global image_output_component
             gr.Examples(
                 fn=process,
                 examples=OBJECT_DETECTION_EXAMPLES,
@@ -121,7 +138,6 @@ with gr.Blocks() as demo:
                 outputs=image_output_component
             )
         elif text in CAPTION_TASK_NAMES:
-            global text_output_component
             gr.Examples(
                 fn=process,
                 examples=CAPTION_EXAMPLES,
@@ -132,5 +148,27 @@ with gr.Blocks() as demo:
                 ],
                 outputs=text_output_component
             )
 demo.launch(debug=False, show_error=True, max_threads=1)

 from utils.models import load_models, run_inference, CHECKPOINTS
 from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
     CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
+    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME
 MARKDOWN = """
 # Better Florence-2 Playground 🔥
         <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
     </a>
 </div>
+Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
+MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
+across tasks such as captioning, object detection, grounding, and segmentation.
+The model takes images and task prompts as input, generating the desired results in
+text format. It uses a DaViT vision encoder to convert images into visual token
+embeddings. These are then concatenated with BERT-generated text embeddings and
+processed by a transformer-based multi-modal encoder-decoder to generate the response.
 """
 OBJECT_DETECTION_EXAMPLES = [
     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"],
     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
 ]
+OCR_EXAMPLES = [
+    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
+]
+OCR_WITH_REGION_EXAMPLES = [
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg"]
+]
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 MODELS, PROCESSORS = load_models(DEVICE)
     model = MODELS[checkpoint_dropdown]
     processor = PROCESSORS[checkpoint_dropdown]
     task = TASKS[task_dropdown]
+    if task_dropdown in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
         return annotate_with_boxes(image_input, detections)
+    elif task_dropdown in CAPTION_TASK_NAMES or task_dropdown == OCR_TASK_NAME:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         return response[task]
         with gr.Column():
             @gr.render(inputs=task_dropdown_component)
             def show_output(text):
+                global image_output_component
+                global text_output_component
+                if text in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
                     image_output_component = gr.Image(type='pil', label='Image Output')
                     submit_button_component.click(
                         fn=process,
                         ],
                         outputs=image_output_component
                     )
+                elif text in CAPTION_TASK_NAMES or text == OCR_TASK_NAME:
                     text_output_component = gr.Textbox(label='Caption Output')
                     submit_button_component.click(
                         fn=process,
     @gr.render(inputs=task_dropdown_component)
     def show_examples(text):
+        global image_output_component
+        global text_output_component
         if text == OBJECT_DETECTION_TASK_NAME:
             gr.Examples(
                 fn=process,
                 examples=OBJECT_DETECTION_EXAMPLES,
                 outputs=image_output_component
             )
         elif text in CAPTION_TASK_NAMES:
             gr.Examples(
                 fn=process,
                 examples=CAPTION_EXAMPLES,
                 ],
                 outputs=text_output_component
             )
+        elif text == OCR_TASK_NAME:
+            gr.Examples(
+                fn=process,
+                examples=OCR_EXAMPLES,
+                inputs=[
+                    checkpoint_dropdown_component,
+                    task_dropdown_component,
+                    image_input_component
+                ],
+                outputs=text_output_component
+            )
+        elif text == OCR_WITH_REGION_TASK_NAME:
+            gr.Examples(
+                fn=process,
+                examples=OCR_WITH_REGION_EXAMPLES,
+                inputs=[
+                    checkpoint_dropdown_component,
+                    task_dropdown_component,
+                    image_input_component
+                ],
+                outputs=image_output_component
+            )
 demo.launch(debug=False, show_error=True, max_threads=1)

utils/tasks.py CHANGED Viewed

@@ -2,18 +2,24 @@ OBJECT_DETECTION_TASK_NAME = "Object Detection"
 CAPTION_TASK_NAME = "Caption"
 DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
 MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
 TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
-    MORE_DETAILED_CAPTION_TASK_NAME
 ]
 TASKS = {
     OBJECT_DETECTION_TASK_NAME: "<OD>",
     CAPTION_TASK_NAME: "<CAPTION>",
     DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
-    MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>"
 }
 CAPTION_TASK_NAMES = [
     CAPTION_TASK_NAME,

 CAPTION_TASK_NAME = "Caption"
 DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
 MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
+OCR_TASK_NAME = "OCR"
+OCR_WITH_REGION_TASK_NAME = "OCR with Region"
 TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME
 ]
 TASKS = {
     OBJECT_DETECTION_TASK_NAME: "<OD>",
     CAPTION_TASK_NAME: "<CAPTION>",
     DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
+    MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>",
+    OCR_TASK_NAME: "<OCR>",
+    OCR_WITH_REGION_TASK_NAME: "<OCR_WITH_REGION>"
 }
 CAPTION_TASK_NAMES = [
     CAPTION_TASK_NAME,