SkalskiP committed
Commit 3b99a8a · 1 Parent(s): c3f2745

OCR tasks added

Files changed (2):
  1. app.py +47 -9
  2. utils/tasks.py +8 -2
app.py CHANGED
@@ -7,7 +7,7 @@ from utils.annotate import annotate_with_boxes
 from utils.models import load_models, run_inference, CHECKPOINTS
 from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
     CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
-    MORE_DETAILED_CAPTION_TASK_NAME
+    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME
 
 MARKDOWN = """
 # Better Florence-2 Playground 🔥
@@ -25,6 +25,15 @@ MARKDOWN = """
     <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
     </a>
 </div>
+
+Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
+MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
+across tasks such as captioning, object detection, grounding, and segmentation.
+
+The model takes images and task prompts as input, generating the desired results in
+text format. It uses a DaViT vision encoder to convert images into visual token
+embeddings. These are then concatenated with BERT-generated text embeddings and
+processed by a transformer-based multi-modal encoder-decoder to generate the response.
 """
 
 OBJECT_DETECTION_EXAMPLES = [
@@ -35,6 +44,13 @@ CAPTION_EXAMPLES = [
     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"],
     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
 ]
+OCR_EXAMPLES = [
+    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
+]
+OCR_WITH_REGION_EXAMPLES = [
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg"]
+]
 
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 MODELS, PROCESSORS = load_models(DEVICE)
@@ -45,13 +61,13 @@ def process(checkpoint_dropdown, task_dropdown, image_input):
     model = MODELS[checkpoint_dropdown]
     processor = PROCESSORS[checkpoint_dropdown]
     task = TASKS[task_dropdown]
-    if task_dropdown == OBJECT_DETECTION_TASK_NAME:
+    if task_dropdown in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
         return annotate_with_boxes(image_input, detections)
-    elif task_dropdown in CAPTION_TASK_NAMES:
+    elif task_dropdown in CAPTION_TASK_NAMES or task_dropdown == OCR_TASK_NAME:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         return response[task]
@@ -81,8 +97,9 @@ with gr.Blocks() as demo:
         with gr.Column():
             @gr.render(inputs=task_dropdown_component)
             def show_output(text):
-                if text == OBJECT_DETECTION_TASK_NAME:
-                    global image_output_component
+                global image_output_component
+                global text_output_component
+                if text in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
                     image_output_component = gr.Image(type='pil', label='Image Output')
                     submit_button_component.click(
                         fn=process,
@@ -93,8 +110,7 @@ with gr.Blocks() as demo:
                         ],
                         outputs=image_output_component
                     )
-                elif text in CAPTION_TASK_NAMES:
-                    global text_output_component
+                elif text in CAPTION_TASK_NAMES or text == OCR_TASK_NAME:
                     text_output_component = gr.Textbox(label='Caption Output')
                     submit_button_component.click(
                         fn=process,
@@ -108,8 +124,9 @@ with gr.Blocks() as demo:
 
     @gr.render(inputs=task_dropdown_component)
    def show_examples(text):
+        global image_output_component
+        global text_output_component
         if text == OBJECT_DETECTION_TASK_NAME:
-            global image_output_component
             gr.Examples(
                 fn=process,
                 examples=OBJECT_DETECTION_EXAMPLES,
@@ -121,7 +138,6 @@ with gr.Blocks() as demo:
                 outputs=image_output_component
             )
         elif text in CAPTION_TASK_NAMES:
-            global text_output_component
             gr.Examples(
                 fn=process,
                 examples=CAPTION_EXAMPLES,
@@ -132,5 +148,27 @@ with gr.Blocks() as demo:
                 ],
                 outputs=text_output_component
             )
+        elif text == OCR_TASK_NAME:
+            gr.Examples(
+                fn=process,
+                examples=OCR_EXAMPLES,
+                inputs=[
+                    checkpoint_dropdown_component,
+                    task_dropdown_component,
+                    image_input_component
+                ],
+                outputs=text_output_component
+            )
+        elif text == OCR_WITH_REGION_TASK_NAME:
+            gr.Examples(
+                fn=process,
+                examples=OCR_WITH_REGION_EXAMPLES,
+                inputs=[
+                    checkpoint_dropdown_component,
+                    task_dropdown_component,
+                    image_input_component
+                ],
+                outputs=image_output_component
+            )
 
 demo.launch(debug=False, show_error=True, max_threads=1)
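
The MARKDOWN blurb added above describes the contract these new branches rely on: a task prompt token goes in together with the image, and a post-processed response keyed by that token comes back. For reference, here is a minimal standalone sketch of the two OCR paths; it is not part of this commit, it assumes run_inference in utils/models.py (not shown in this diff) wraps the standard Florence-2 generate/post-process loop, and it substitutes a generic supervision BoxAnnotator for annotate_with_boxes.

import requests
import torch
import supervision as sv
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CHECKPOINT = "microsoft/Florence-2-large-ft"

# Florence-2 ships custom modeling code, hence trust_remote_code=True
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT, trust_remote_code=True).to(DEVICE).eval()
processor = AutoProcessor.from_pretrained(CHECKPOINT, trust_remote_code=True)

image = Image.open(requests.get(
    "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg",
    stream=True).raw).convert("RGB")


def run_task(task: str) -> dict:
    # one prompt-token inference pass, roughly what run_inference is assumed to do
    inputs = processor(text=task, images=image, return_tensors="pt").to(DEVICE)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3)
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False)[0]
    return processor.post_process_generation(
        generated_text, task=task, image_size=image.size)


# "<OCR>" yields plain text, which process() returns to the Textbox output
print(run_task("<OCR>")["<OCR>"])

# "<OCR_WITH_REGION>" yields text regions, which process() converts to detections
# and draws on the image; a plain BoxAnnotator stands in for annotate_with_boxes here
response = run_task("<OCR_WITH_REGION>")
detections = sv.Detections.from_lmm(
    lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image.size)
annotated = sv.BoxAnnotator().annotate(scene=image.copy(), detections=detections)
annotated.save("ocr_with_region.jpg")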
utils/tasks.py CHANGED
@@ -2,18 +2,24 @@ OBJECT_DETECTION_TASK_NAME = "Object Detection"
 CAPTION_TASK_NAME = "Caption"
 DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
 MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
+OCR_TASK_NAME = "OCR"
+OCR_WITH_REGION_TASK_NAME = "OCR with Region"
 
 TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
-    MORE_DETAILED_CAPTION_TASK_NAME
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME
 ]
 TASKS = {
     OBJECT_DETECTION_TASK_NAME: "<OD>",
     CAPTION_TASK_NAME: "<CAPTION>",
     DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
-    MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>"
+    MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>",
+    OCR_TASK_NAME: "<OCR>",
+    OCR_WITH_REGION_TASK_NAME: "<OCR_WITH_REGION>"
 }
 CAPTION_TASK_NAMES = [
     CAPTION_TASK_NAME,
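
The two new constants keep the same contract the rest of the app relies on: the dropdown label is looked up in TASKS to obtain the Florence-2 prompt token, and the post-processed response is keyed by that same token, which is why process() can simply return response[task]. A quick illustration (standalone usage for clarity, not code from this commit):

from utils.tasks import TASKS, OCR_TASK_NAME, OCR_WITH_REGION_TASK_NAME

# dropdown label -> Florence-2 prompt token
assert TASKS[OCR_TASK_NAME] == "<OCR>"
assert TASKS[OCR_WITH_REGION_TASK_NAME] == "<OCR_WITH_REGION>"

# the post-processed model response is keyed by the same token,
# e.g. response["<OCR>"] holds the recognized text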