Ahmed007 committed on
Commit
dd2e01d
1 Parent(s): 1a16c2f

Upload processor

preprocessor_config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "auto_map": {
-    "AutoProcessor": "microsoft/Florence-2-base--processing_florence2.Florence2Processor"
+    "AutoProcessor": "processing_florence2.Florence2Processor"
   },
   "crop_size": {
     "height": 768,
processing_florence2.py ADDED
@@ -0,0 +1,1088 @@
# coding=utf-8
# Copyright 2024 Microsoft and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for Florence-2.
"""

import logging
import re
from typing import List, Optional, Union

import numpy as np
import torch

from transformers import BartTokenizer, BartTokenizerFast, T5Tokenizer, T5TokenizerFast
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput, is_valid_image
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import (
    PaddingStrategy,
    PreTokenizedInput,
    TextInput,
    TruncationStrategy,
)
from transformers.utils import TensorType


logger = logging.getLogger(__name__)


# Copied from transformers.models.idefics2.processing_idefics2.is_url
def is_url(val) -> bool:
    return isinstance(val, str) and val.startswith("http")


# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
def is_image_or_image_url(elem):
    return is_url(elem) or is_valid_image(elem)


def _is_str_or_image(elem):
    return isinstance(elem, str) or is_image_or_image_url(elem)


class Florence2Processor(ProcessorMixin):
    r"""
    Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single
    processor.

    [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
    [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.

    Args:
        image_processor ([`CLIPImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`BartTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(image_processor, "image_seq_length"):
            raise ValueError("Image processor is missing an `image_seq_length` attribute.")

        self.image_seq_length = image_processor.image_seq_length

        # Register the Florence-2 task vocabulary: detection/OCR tags, 1000 `<loc_{x}>`
        # coordinate-bin tokens, and the caption/grounding/segmentation tags.
        # Note: `<region_to_desciption>` is misspelled in the released vocabulary and
        # must be kept as-is for compatibility.
        tokens_to_add = {
            'additional_special_tokens':
                tokenizer.additional_special_tokens
                + ['<od>', '</od>', '<ocr>', '</ocr>']
                + [f'<loc_{x}>' for x in range(1000)]
                + ['<cap>', '</cap>', '<ncap>', '</ncap>', '<dcap>', '</dcap>',
                   '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>',
                   '<region_cap>', '</region_cap>', '<region_to_desciption>',
                   '</region_to_desciption>', '<proposal>', '</proposal>',
                   '<poly>', '</poly>', '<and>']
        }
        tokenizer.add_special_tokens(tokens_to_add)

        self.tasks_answer_post_processing_type = {
            '<OCR>': 'pure_text',
            '<OCR_WITH_REGION>': 'ocr',
            '<CAPTION>': 'pure_text',
            '<DETAILED_CAPTION>': 'pure_text',
            '<MORE_DETAILED_CAPTION>': 'pure_text',
            '<OD>': 'description_with_bboxes',
            '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
            '<CAPTION_TO_PHRASE_GROUNDING>': 'phrase_grounding',
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
            '<REGION_TO_SEGMENTATION>': 'polygons',
            '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
            '<REGION_TO_CATEGORY>': 'pure_text',
            '<REGION_TO_DESCRIPTION>': 'pure_text',
            '<REGION_TO_OCR>': 'pure_text',
            '<REGION_PROPOSAL>': 'bboxes',
        }

        self.task_prompts_without_inputs = {
            '<OCR>': 'What is the text in the image?',
            '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
            '<CAPTION>': 'What does the image describe?',
            '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
            '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
            '<OD>': 'Locate the objects with category name in the image.',
            '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
            '<REGION_PROPOSAL>': 'Locate the region proposals in the image.',
        }

        self.task_prompts_with_input = {
            '<CAPTION_TO_PHRASE_GROUNDING>': 'Locate the phrases in the caption: {input}',
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
            '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
            '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
            '<REGION_TO_CATEGORY>': 'What is the region {input}?',
            '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
            '<REGION_TO_OCR>': 'What text is in the region {input}?',
        }

        self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)

        super().__init__(image_processor, tokenizer)

    def _construct_prompts(self, text):
        # Replace task tokens with their natural-language task prompts when present.
        prompts = []
        for _text in text:
            # 1. fixed task prompts without additional inputs
            for task_token, task_prompt in self.task_prompts_without_inputs.items():
                if task_token in _text:
                    assert _text == task_token, f"Task token {task_token} should be the only token in the text."
                    _text = task_prompt
                    break
            # 2. task prompts with additional inputs
            for task_token, task_prompt in self.task_prompts_with_input.items():
                if task_token in _text:
                    _text = task_prompt.format(input=_text.replace(task_token, ''))
                    break
            prompts.append(_text)
        return prompts
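    # Example (illustrative): _construct_prompts(['<OD>']) returns
    # ['Locate the objects with category name in the image.'], and
    # _construct_prompts(['<CAPTION_TO_PHRASE_GROUNDING>A green car']) returns
    # ['Locate the phrases in the caption: A green car'].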

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        tokenize_newline_separately: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        do_resize: bool = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
        input_data_format: Optional[
            Union[str, "ChannelDimension"]  # noqa: F821
        ] = None,
        resample: "PILImageResampling" = None,  # noqa: F821
        do_convert_rgb: bool = None,
        do_thumbnail: bool = None,
        do_align_long_axis: bool = None,
        do_rescale: bool = None,
    ) -> BatchFeature:
        """
        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
        and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            tokenize_newline_separately (`bool`, defaults to `True`):
                Adds a separately tokenized '\n' at the end of the prompt.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
              is provided, the `input_ids` will also contain the suffix input ids.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **labels** -- Labels compatible with training if `suffix` is not `None`.
        """

        return_token_type_ids = False

        if images is None:
            raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
        if text is None:
            logger.warning("You are using Florence-2 without a text prompt.")
            text = ""

        if isinstance(text, list) and isinstance(images, list):
            if len(images) < len(text):
                raise ValueError(
                    f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
                )
        if _is_str_or_image(text):
            text = [text]
        elif isinstance(text, list) and _is_str_or_image(text[0]):
            pass

        pixel_values = self.image_processor(
            images,
            do_resize=do_resize,
            do_normalize=do_normalize,
            return_tensors=return_tensors,
            image_mean=image_mean,
            image_std=image_std,
            input_data_format=input_data_format,
            data_format=data_format,
            resample=resample,
            do_convert_rgb=do_convert_rgb,
        )["pixel_values"]

        if max_length is not None:
            max_length -= self.image_seq_length  # max_length has to account for the image tokens

        text = self._construct_prompts(text)

        inputs = self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            return_token_type_ids=return_token_type_ids,
        )

        return_data = {**inputs, "pixel_values": pixel_values}

        if return_token_type_ids:
            labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
            return_data.update({"labels": labels})
        return BatchFeature(data=return_data)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    def post_process_generation(self, text, task, image_size):
        """
        Post-process the output of the model into the corresponding task's output format.

        Args:
            text (`str`): The text to post-process.
            task (`str`): The task to post-process the text for.
            image_size (`Tuple[int, int]`): The size of the image, as (width, height).
        """

        task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
        task_answer = self.post_processor(
            text=text,
            image_size=image_size,
            parse_tasks=task_answer_post_processing_type,
        )[task_answer_post_processing_type]

        if task_answer_post_processing_type == 'pure_text':
            final_answer = task_answer
            # remove the special tokens
            final_answer = final_answer.replace('<s>', '').replace('</s>', '')
        elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
            od_instances = task_answer
            bboxes_od = [_od_instance['bbox'] for _od_instance in od_instances]
            labels_od = [str(_od_instance['cat_name']) for _od_instance in od_instances]
            final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
        elif task_answer_post_processing_type in ['ocr']:
            bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
            labels = [str(_od_instance['text']) for _od_instance in task_answer]
            final_answer = {'quad_boxes': bboxes, 'labels': labels}
        elif task_answer_post_processing_type in ['phrase_grounding']:
            bboxes = []
            labels = []
            for _grounded_phrase in task_answer:
                for _bbox in _grounded_phrase['bbox']:
                    bboxes.append(_bbox)
                    labels.append(_grounded_phrase['cat_name'])
            final_answer = {'bboxes': bboxes, 'labels': labels}
        elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
            labels = []
            polygons = []
            for result in task_answer:
                label = result['cat_name']
                _polygons = result['polygons']
                labels.append(label)
                polygons.append(_polygons)
            final_answer = {'polygons': polygons, 'labels': labels}
        elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
            bboxes = []
            bboxes_labels = []
            polygons = []
            polygons_labels = []
            for result in task_answer:
                label = result['cat_name']
                if 'polygons' in result:
                    _polygons = result['polygons']
                    polygons.append(_polygons)
                    polygons_labels.append(label)
                else:
                    _bbox = result['bbox']
                    bboxes.append(_bbox)
                    bboxes_labels.append(label)
            final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
        else:
            raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))

        final_answer = {task: final_answer}
        return final_answer
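
# Usage sketch (illustrative; `model` and `image` are assumed to be a loaded
# Florence-2 model and a PIL image, and are not defined in this file):
#   inputs = processor(text="<OD>", images=image, return_tensors="pt")
#   generated_ids = model.generate(
#       input_ids=inputs["input_ids"],
#       pixel_values=inputs["pixel_values"],
#       max_new_tokens=1024,
#   )
#   generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
#   parsed = processor.post_process_generation(
#       generated_text, task="<OD>", image_size=(image.width, image.height)
#   )
#   # -> {'<OD>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['car', ...]}}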

class BoxQuantizer(object):
    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, boxes: torch.Tensor, size):
        bins_w, bins_h = self.bins  # Quantization bins.
        size_w, size_h = size  # Original image size.
        size_per_bin_w = size_w / bins_w
        size_per_bin_h = size_h / bins_h
        xmin, ymin, xmax, ymax = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].

        if self.mode == 'floor':
            quantized_xmin = (xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
            quantized_ymin = (ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
            quantized_xmax = (xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
            quantized_ymax = (ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)

        elif self.mode == 'round':
            raise NotImplementedError()

        else:
            raise ValueError('Incorrect quantization type.')

        quantized_boxes = torch.cat(
            (quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1
        ).int()

        return quantized_boxes

    def dequantize(self, boxes: torch.Tensor, size):
        bins_w, bins_h = self.bins  # Quantization bins.
        size_w, size_h = size  # Original image size.
        size_per_bin_w = size_w / bins_w
        size_per_bin_h = size_h / bins_h
        xmin, ymin, xmax, ymax = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].

        if self.mode == 'floor':
            # Add 0.5 to use the center position of the bin as the coordinate.
            dequantized_xmin = (xmin + 0.5) * size_per_bin_w
            dequantized_ymin = (ymin + 0.5) * size_per_bin_h
            dequantized_xmax = (xmax + 0.5) * size_per_bin_w
            dequantized_ymax = (ymax + 0.5) * size_per_bin_h

        elif self.mode == 'round':
            raise NotImplementedError()

        else:
            raise ValueError('Incorrect quantization type.')

        dequantized_boxes = torch.cat(
            (dequantized_xmin, dequantized_ymin,
             dequantized_xmax, dequantized_ymax), dim=-1
        )

        return dequantized_boxes
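
# Worked example (illustrative): with bins=(1000, 1000) and size=(1000, 1000),
# each bin spans exactly one unit, so quantize() in 'floor' mode maps the box
# [12.7, 20.2, 300.9, 400.1] to bin indices [12, 20, 300, 400], and dequantize()
# returns the bin centers [12.5, 20.5, 300.5, 400.5]. These bin indices are what
# the `<loc_{bin}>` tokens encode; CoordinatesQuantizer below applies the same
# scheme to (x, y) points.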

class CoordinatesQuantizer(object):
    """
    Quantize coordinates (Nx2).
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, coordinates: torch.Tensor, size):
        bins_w, bins_h = self.bins  # Quantization bins.
        size_w, size_h = size  # Original image size.
        size_per_bin_w = size_w / bins_w
        size_per_bin_h = size_h / bins_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        x, y = coordinates.split(1, dim=-1)  # Shape: 2 * [N, 1].

        if self.mode == 'floor':
            quantized_x = (x / size_per_bin_w).floor().clamp(0, bins_w - 1)
            quantized_y = (y / size_per_bin_h).floor().clamp(0, bins_h - 1)

        elif self.mode == 'round':
            raise NotImplementedError()

        else:
            raise ValueError('Incorrect quantization type.')

        quantized_coordinates = torch.cat(
            (quantized_x, quantized_y), dim=-1
        ).int()

        return quantized_coordinates

    def dequantize(self, coordinates: torch.Tensor, size):
        bins_w, bins_h = self.bins  # Quantization bins.
        size_w, size_h = size  # Original image size.
        size_per_bin_w = size_w / bins_w
        size_per_bin_h = size_h / bins_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        x, y = coordinates.split(1, dim=-1)  # Shape: 2 * [N, 1].

        if self.mode == 'floor':
            # Add 0.5 to use the center position of the bin as the coordinate.
            dequantized_x = (x + 0.5) * size_per_bin_w
            dequantized_y = (y + 0.5) * size_per_bin_h

        elif self.mode == 'round':
            raise NotImplementedError()

        else:
            raise ValueError('Incorrect quantization type.')

        dequantized_coordinates = torch.cat(
            (dequantized_x, dequantized_y), dim=-1
        )

        return dequantized_coordinates

class Florence2PostProcesser(object):
    r"""
    Florence-2 post-processor for converting text predictions into per-task results.

    Args:
        config: A dict of configs.
        tokenizer: A tokenizer for decoding text to spans.
        sample config:
            UNIFIED_POST_PROCESS:
                # common configs
                NUM_BBOX_HEIGHT_BINS: 1000
                NUM_BBOX_WIDTH_BINS: 1000
                COORDINATES_HEIGHT_BINS: 1000
                COORDINATES_WIDTH_BINS: 1000
                # task specific configs, override the common configs
                PARSE_TASKS:
                    - TASK_NAME: 'video_dense_caption'
                      PATTERN: 'r<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
                      SCORE_MODE: 'avg_cat_name_scores'
                      NUM_BINS: 100
                    - TASK_NAME: 'od'
                      PATTERN: 'r<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
                      SCORE_MODE: 'avg_cat_name_scores'

    Returns:
        parsed_dict (dict): A dict of parsed results.
    """
    def __init__(
        self,
        tokenizer=None
    ):
        parse_tasks = []
        parse_task_configs = {}
        config = self._create_default_config()
        for task in config['PARSE_TASKS']:
            parse_tasks.append(task['TASK_NAME'])
            parse_task_configs[task['TASK_NAME']] = task

        self.config = config
        self.parse_tasks = parse_tasks
        self.parse_tasks_configs = parse_task_configs

        self.tokenizer = tokenizer
        if self.tokenizer is not None:
            self.all_special_tokens = set(self.tokenizer.all_special_tokens)

        self.init_quantizers()
        self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()

    def _create_black_list_of_phrase_grounding(self):
        black_list = {}

        if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
            black_list = set(
                ['it', 'I', 'me', 'mine',
                 'you', 'your', 'yours',
                 'he', 'him', 'his',
                 'she', 'her', 'hers',
                 'they', 'them', 'their', 'theirs',
                 'one', 'oneself',
                 'we', 'us', 'our', 'ours', 'its',
                 'myself', 'yourself', 'himself', 'herself', 'itself',
                 'ourselves', 'yourselves', 'themselves',
                 'this', 'that',
                 'these', 'those',
                 'who', 'whom', 'whose', 'which', 'what',
                 'all', 'another', 'any', 'anybody', 'anyone', 'anything',
                 'each', 'everybody', 'everyone', 'everything',
                 'few', 'many', 'nobody', 'none', 'several',
                 'some', 'somebody', 'someone', 'something',
                 'each other', 'one another',
                 'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
                 'other objects', 'lots', 'a set',
                 ]
            )

        return black_list

    def _create_default_config(self):
        config = {
            'NUM_BBOX_HEIGHT_BINS': 1000,
            'NUM_BBOX_WIDTH_BINS': 1000,
            'BOX_QUANTIZATION_MODE': 'floor',
            'COORDINATES_HEIGHT_BINS': 1000,
            'COORDINATES_WIDTH_BINS': 1000,
            'COORDINATES_QUANTIZATION_MODE': 'floor',
            'PARSE_TASKS': [
                {
                    'TASK_NAME': 'od',
                    'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
                },
                {
                    'TASK_NAME': 'ocr',
                    'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
                    'AREA_THRESHOLD': 0.00
                },
                {
                    'TASK_NAME': 'phrase_grounding',
                    'FILTER_BY_BLACK_LIST': True
                },
                {
                    'TASK_NAME': 'pure_text',
                },
                {
                    'TASK_NAME': 'description_with_bboxes',
                },
                {
                    'TASK_NAME': 'description_with_polygons',
                },
                {
                    'TASK_NAME': 'polygons',
                },
                {
                    'TASK_NAME': 'bboxes',
                },
                {
                    'TASK_NAME': 'description_with_bboxes_or_polygons',
                }
            ]
        }

        return config

    def init_quantizers(self):
        # We have a box_quantizer (od, grounding) and a coordinates_quantizer (ocr, referring_segmentation).
        num_bbox_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
        num_bbox_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
        box_quantization_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
        self.box_quantizer = BoxQuantizer(
            box_quantization_mode,
            (num_bbox_width_bins, num_bbox_height_bins),
        )

        num_bbox_height_bins = self.config.get('COORDINATES_HEIGHT_BINS', self.config.get('NUM_BBOX_HEIGHT_BINS', 1000))
        num_bbox_width_bins = self.config.get('COORDINATES_WIDTH_BINS', self.config.get('NUM_BBOX_WIDTH_BINS', 1000))
        box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE', self.config.get('BOX_QUANTIZATION_MODE', 'floor'))
        self.coordinates_quantizer = CoordinatesQuantizer(
            box_quantization_mode,
            (num_bbox_width_bins, num_bbox_height_bins),
        )

    def decode_with_spans(self, tokenizer, token_ids):
        filtered_tokens = tokenizer.convert_ids_to_tokens(
            token_ids, skip_special_tokens=False)
        assert len(filtered_tokens) == len(token_ids)

        # To avoid mixing byte-level and unicode for byte-level BPE,
        # we need to build the string separately for added tokens and byte-level tokens.
        # cf. https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        for token in filtered_tokens:
            if token in self.all_special_tokens:
                sub_texts.append(token)
            else:
                if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
                    sub_text = tokenizer.convert_tokens_to_string([token])
                elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
                    # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
                    # Note: Do not strip sub_text as it may have functional whitespace.
                    sub_text = token.replace('▁', ' ')
                else:
                    raise ValueError(f'type {type(tokenizer)} not supported')
                sub_texts.append(sub_text)

        text = ''
        spans = []
        for sub_text in sub_texts:
            span = (len(text), len(text) + len(sub_text))  # [start index, end index).
            text += sub_text
            spans.append(span)

        # Text format:
        # 1. T5Tokenizer/T5TokenizerFast:
        #      "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_1><loc_2><loc_3><loc_4> cat</s>"
        #    Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False,
        #    clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
        # 2. BartTokenizer (need to double check):
        #      "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_1><loc_2><loc_3><loc_4>cat</s>"
        #    Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False,
        #    clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
        return text, spans

    def parse_od_from_text_and_spans(
        self,
        text,
        pattern,
        image_size,
        phrase_centric=False
    ):
        parsed = list(re.finditer(pattern, text))

        instances = []
        for i in range(len(parsed)):
            # Prepare instance.
            instance = {}

            if phrase_centric:
                bbox_bins = [int(parsed[i].group(j)) for j in range(2, 6)]
            else:
                bbox_bins = [int(parsed[i].group(j)) for j in range(1, 5)]
            instance['bbox'] = self.box_quantizer.dequantize(
                boxes=torch.tensor(bbox_bins),
                size=image_size
            ).tolist()

            if phrase_centric:
                instance['cat_name'] = parsed[i].group(1).lower().strip()
            else:
                instance['cat_name'] = parsed[i].group(5).lower().strip()
            instances.append(instance)

        return instances

    def parse_ocr_from_text_and_spans(self,
                                      text,
                                      pattern,
                                      image_size,
                                      area_threshold=-1.0,
                                      ):
        bboxes = []
        labels = []
        text = text.replace('<s>', '')
        # ocr with regions
        parsed = re.findall(pattern, text)
        instances = []
        image_width, image_height = image_size

        for ocr_line in parsed:
            ocr_content = ocr_line[0]
            quad_box = ocr_line[1:]
            quad_box = [int(i) for i in quad_box]
            quad_box = self.coordinates_quantizer.dequantize(
                torch.tensor(np.array(quad_box).reshape(-1, 2)),
                size=image_size
            ).reshape(-1).tolist()

            if area_threshold > 0:
                x_coords = quad_box[0::2]
                y_coords = quad_box[1::2]

                # Apply the shoelace formula over the closed quadrilateral
                # (indices wrap around so the last vertex pairs with the first).
                area = 0.5 * abs(sum(
                    x_coords[i] * y_coords[(i + 1) % 4] - x_coords[(i + 1) % 4] * y_coords[i]
                    for i in range(4)
                ))

                if area < (image_width * image_height) * area_threshold:
                    continue

            bboxes.append(quad_box)
            labels.append(ocr_content)
            instances.append({
                'quad_box': quad_box,
                'text': ocr_content,
            })
        return instances

    def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
        # ignore <s>, </s> and <pad>
        cur_span = 0
        if text.startswith('<s>'):
            cur_span += 3

        text = text.replace('<s>', '')
        text = text.replace('</s>', '')
        text = text.replace('<pad>', '')

        pattern = r"([^<]+(?:<loc_\d+>){4,})"
        phrases = re.findall(pattern, text)

        # pattern should be text pattern and od pattern
        pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
        box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

        instances = []
        for phrase_text in phrases:
            phrase_text_strip = phrase_text.replace('<ground>', '', 1)
            phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)

            if phrase_text_strip == '':
                cur_span += len(phrase_text)
                continue

            # Prepare instance.
            instance = {}

            # parse phrase, get string
            phrase = re.search(pattern, phrase_text_strip)
            if phrase is None:
                cur_span += len(phrase_text)
                continue

            # parse bboxes by box_pattern
            bboxes_parsed = list(re.finditer(box_pattern, phrase_text))
            if len(bboxes_parsed) == 0:
                cur_span += len(phrase_text)
                continue

            phrase = phrase.group()
            # remove leading and trailing spaces
            phrase = phrase.strip()

            if phrase in self.black_list_of_phrase_grounding:
                cur_span += len(phrase_text)
                continue

            # a list of lists
            bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
            instance['bbox'] = self.box_quantizer.dequantize(
                boxes=torch.tensor(bbox_bins),
                size=image_size
            ).tolist()

            # exclude non-ascii characters
            phrase = phrase.encode('ascii', errors='ignore').decode('ascii')
            instance['cat_name'] = phrase

            instances.append(instance)

        return instances

    def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
        # temporary parse solution: split by '.'
        # ignore <s>, </s> and <pad>

        text = text.replace('<s>', '')
        text = text.replace('</s>', '')
        text = text.replace('<pad>', '')

        if allow_empty_phrase:
            pattern = r"(?:(?:<loc_\d+>){4,})"
        else:
            pattern = r"([^<]+(?:<loc_\d+>){4,})"
        phrases = re.findall(pattern, text)

        # pattern should be text pattern and od pattern
        pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
        box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

        instances = []
        for phrase_text in phrases:
            phrase_text_strip = phrase_text.replace('<ground>', '', 1)
            phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)

            if phrase_text_strip == '' and not allow_empty_phrase:
                continue

            # parse phrase, get string
            phrase = re.search(pattern, phrase_text_strip)
            if phrase is None:
                continue

            phrase = phrase.group()
            # remove leading and trailing spaces
            phrase = phrase.strip()

            # parse bboxes by box_pattern
            bboxes_parsed = list(re.finditer(box_pattern, phrase_text))
            if len(bboxes_parsed) == 0:
                continue

            # a list of lists
            bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]

            bboxes = self.box_quantizer.dequantize(
                boxes=torch.tensor(bbox_bins),
                size=image_size
            ).tolist()

            # exclude non-ascii characters
            phrase = phrase.encode('ascii', errors='ignore').decode('ascii')
            for _bboxes in bboxes:
                # Prepare instance.
                instance = {}
                instance['bbox'] = _bboxes
                instance['cat_name'] = phrase
                instances.append(instance)

        return instances
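    # Example (illustrative): 'two cats<loc_10><loc_10><loc_400><loc_400><loc_500><loc_500><loc_900><loc_900>'
    # parses into two instances that share cat_name 'two cats', one per run of four
    # <loc_*> tokens, each dequantized to pixel coordinates via BoxQuantizer.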

    def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
                                                            allow_empty_phrase=False,
                                                            polygon_sep_token='<sep>',
                                                            polygon_start_token='<poly>',
                                                            polygon_end_token='</poly>',
                                                            with_box_at_start=False,
                                                            ):

        # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
        # ignore <s>, </s> and <pad>

        text = text.replace('<s>', '')
        text = text.replace('</s>', '')
        text = text.replace('<pad>', '')

        if allow_empty_phrase:
            pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
        else:
            # [^<]+ matches one or more characters that are not the '<' symbol
            # (the ^ inside the square brackets is a negation).
            pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
        phrases = re.findall(pattern, text)

        phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
        box_pattern = rf'((?:<loc_\d+>)+)(?:{re.escape(polygon_sep_token)}|$)'

        # one polygons instance is separated by polygon_start_token and polygon_end_token
        polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'

        instances = []
        for phrase_text in phrases:

            # Exclude a leading 'loc_\d+>' fragment.
            # (Spans would be needed here if category scores are ever included.)
            phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)

            if phrase_text_strip == '' and not allow_empty_phrase:
                continue

            # parse phrase, get string
            phrase = re.search(phrase_string_pattern, phrase_text_strip)
            if phrase is None:
                continue
            phrase = phrase.group()
            # remove leading and trailing spaces
            phrase = phrase.strip()

            # Split by polygon_start_token and polygon_end_token first, using polygons_instance_pattern.
            if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
                polygons_instances_parsed = list(re.finditer(polygons_instance_pattern, phrase_text))
            else:
                polygons_instances_parsed = [phrase_text]

            for _polygons_instances_parsed in polygons_instances_parsed:
                # Prepare instance.
                instance = {}

                # parse polygons by box_pattern
                if isinstance(_polygons_instances_parsed, str):
                    polygons_parsed = list(re.finditer(box_pattern, _polygons_instances_parsed))
                else:
                    polygons_parsed = list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
                if len(polygons_parsed) == 0:
                    continue

                # a list of lists (polygons)
                bbox = []
                polygons = []
                for _polygon_parsed in polygons_parsed:
                    # group 1: the whole run of <loc_\d+> tokens
                    _polygon = _polygon_parsed.group(1)
                    # parse into a list of ints
                    _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r'<loc_(\d+)>', _polygon)]
                    if with_box_at_start and len(bbox) == 0:
                        if len(_polygon) > 4:
                            # The first four coordinates are the box prediction.
                            bbox = _polygon[:4]
                            _polygon = _polygon[4:]
                        else:
                            # No valid bbox prediction.
                            bbox = [0, 0, 0, 0]
                    # abandon the last element if it is not paired
                    if len(_polygon) % 2 == 1:
                        _polygon = _polygon[:-1]

                    # reshape into (n, 2) for dequantization, then flatten back
                    _polygon = self.coordinates_quantizer.dequantize(
                        torch.tensor(np.array(_polygon).reshape(-1, 2)),
                        size=image_size
                    ).reshape(-1).tolist()
                    polygons.append(_polygon)

                instance['cat_name'] = phrase
                instance['polygons'] = polygons
                if len(bbox) != 0:
                    instance['bbox'] = self.box_quantizer.dequantize(
                        boxes=torch.tensor([bbox]),
                        size=image_size
                    ).tolist()[0]

                instances.append(instance)

        return instances

    def __call__(
        self,
        text=None,
        image_size=None,
        parse_tasks=None,
    ):
        """
        Args:
            text: model outputs.
            image_size: (width, height).
            parse_tasks: a list of tasks to parse; if None, parse all tasks.
        """
        if parse_tasks is not None:
            if isinstance(parse_tasks, str):
                parse_tasks = [parse_tasks]
            for _parse_task in parse_tasks:
                assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'

        # text should be provided
        assert text is not None, 'text should be provided'

        parsed_dict = {
            'text': text
        }

        for task in self.parse_tasks:
            if parse_tasks is not None and task not in parse_tasks:
                continue

            pattern = self.parse_tasks_configs[task].get('PATTERN', None)

            if task == 'ocr':
                instances = self.parse_ocr_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                    area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.0),
                )
                parsed_dict['ocr'] = instances
            elif task == 'phrase_grounding':
                instances = self.parse_phrase_grounding_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                )
                parsed_dict['phrase_grounding'] = instances
            elif task == 'pure_text':
                parsed_dict['pure_text'] = text
            elif task == 'description_with_bboxes':
                instances = self.parse_description_with_bboxes_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                )
                parsed_dict['description_with_bboxes'] = instances
            elif task == 'description_with_polygons':
                instances = self.parse_description_with_polygons_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                )
                parsed_dict['description_with_polygons'] = instances
            elif task == 'polygons':
                instances = self.parse_description_with_polygons_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                    allow_empty_phrase=True,
                )
                parsed_dict['polygons'] = instances
            elif task == 'bboxes':
                instances = self.parse_description_with_bboxes_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                    allow_empty_phrase=True,
                )
                parsed_dict['bboxes'] = instances
            elif task == 'description_with_bboxes_or_polygons':
                if '<poly>' in text:
                    # only supports either polygons or bboxes, not both at the same time
                    instances = self.parse_description_with_polygons_from_text_and_spans(
                        text,
                        pattern=pattern,
                        image_size=image_size,
                    )
                else:
                    instances = self.parse_description_with_bboxes_from_text_and_spans(
                        text,
                        pattern=pattern,
                        image_size=image_size,
                    )
                parsed_dict['description_with_bboxes_or_polygons'] = instances
            else:
                raise ValueError("task {} is not supported".format(task))

        return parsed_dict
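
# Example (illustrative): parsing a raw generation directly with the post-processor.
#   pp = Florence2PostProcesser(tokenizer=tokenizer)
#   out = pp(text='A green car<loc_52><loc_333><loc_932><loc_774>',
#            image_size=(768, 768), parse_tasks='description_with_bboxes')
#   out['description_with_bboxes']
#   # -> [{'bbox': [40.3, 256.1, 716.2, 594.8], 'cat_name': 'A green car'}]  (approx.)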
processor_config.json ADDED
@@ -0,0 +1,6 @@
{
  "auto_map": {
    "AutoProcessor": "processing_florence2.Florence2Processor"
  },
  "processor_class": "Florence2Processor"
}
special_tokens_map.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer.json CHANGED
@@ -1,14 +1,7 @@
 {
   "version": "1.0",
   "truncation": null,
-  "padding": {
-    "strategy": "BatchLongest",
-    "direction": "Right",
-    "pad_to_multiple_of": null,
-    "pad_id": 1,
-    "pad_type_id": 0,
-    "pad_token": "<pad>"
-  },
+  "padding": null,
   "added_tokens": [
     {
       "id": 0,
tokenizer_config.json CHANGED
@@ -9258,16 +9258,1047 @@
   "</proposal>",
   "<poly>",
   "</poly>",
-  "<and>"
   ],
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
   "cls_token": "<s>",
   "eos_token": "</s>",
   "errors": "replace",
   "mask_token": "<mask>",
   "model_max_length": 1024,
   "pad_token": "<pad>",
   "processor_class": "Florence2Processor",
   "sep_token": "</s>",
   "tokenizer_class": "BartTokenizer",

   "</proposal>",
   "<poly>",
   "</poly>",
+  "<and>",
+  "<od>",
+  "</od>",
+  "<ocr>",
+  "</ocr>",
+  "<loc_0>",
+  "<loc_1>",
+  "<loc_2>",
+  "<loc_3>",
+  "<loc_4>",
+  "<loc_5>",
+  "<loc_6>",
+  "<loc_7>",
+  "<loc_8>",
+  "<loc_9>",
+  "<loc_10>",
+  "<loc_11>",
+  "<loc_12>",
+  "<loc_13>",
+  "<loc_14>",
+  "<loc_15>",
+  "<loc_16>",
+  "<loc_17>",
+  "<loc_18>",
+  "<loc_19>",
+  "<loc_20>",
+  "<loc_21>",
+  "<loc_22>",
+  "<loc_23>",
+  "<loc_24>",
+  "<loc_25>",
+  "<loc_26>",
+  "<loc_27>",
+  "<loc_28>",
+  "<loc_29>",
+  "<loc_30>",
+  "<loc_31>",
+  "<loc_32>",
+  "<loc_33>",
+  "<loc_34>",
+  "<loc_35>",
+  "<loc_36>",
+  "<loc_37>",
+  "<loc_38>",
+  "<loc_39>",
+  "<loc_40>",
+  "<loc_41>",
+  "<loc_42>",
+  "<loc_43>",
+  "<loc_44>",
+  "<loc_45>",
+  "<loc_46>",
+  "<loc_47>",
+  "<loc_48>",
+  "<loc_49>",
+  "<loc_50>",
+  "<loc_51>",
+  "<loc_52>",
+  "<loc_53>",
+  "<loc_54>",
+  "<loc_55>",
+  "<loc_56>",
+  "<loc_57>",
+  "<loc_58>",
+  "<loc_59>",
+  "<loc_60>",
+  "<loc_61>",
+  "<loc_62>",
+  "<loc_63>",
+  "<loc_64>",
+  "<loc_65>",
+  "<loc_66>",
+  "<loc_67>",
+  "<loc_68>",
+  "<loc_69>",
+  "<loc_70>",
+  "<loc_71>",
+  "<loc_72>",
+  "<loc_73>",
+  "<loc_74>",
+  "<loc_75>",
+  "<loc_76>",
+  "<loc_77>",
+  "<loc_78>",
+  "<loc_79>",
+  "<loc_80>",
+  "<loc_81>",
+  "<loc_82>",
+  "<loc_83>",
+  "<loc_84>",
+  "<loc_85>",
+  "<loc_86>",
+  "<loc_87>",
+  "<loc_88>",
+  "<loc_89>",
+  "<loc_90>",
+  "<loc_91>",
+  "<loc_92>",
+  "<loc_93>",
+  "<loc_94>",
+  "<loc_95>",
+  "<loc_96>",
+  "<loc_97>",
+  "<loc_98>",
+  "<loc_99>",
+  "<loc_100>",
+  "<loc_101>",
+  "<loc_102>",
+  "<loc_103>",
+  "<loc_104>",
+  "<loc_105>",
+  "<loc_106>",
+  "<loc_107>",
+  "<loc_108>",
+  "<loc_109>",
+  "<loc_110>",
+  "<loc_111>",
+  "<loc_112>",
+  "<loc_113>",
+  "<loc_114>",
+  "<loc_115>",
+  "<loc_116>",
+  "<loc_117>",
+  "<loc_118>",
+  "<loc_119>",
+  "<loc_120>",
+  "<loc_121>",
+  "<loc_122>",
+  "<loc_123>",
+  "<loc_124>",
+  "<loc_125>",
+  "<loc_126>",
+  "<loc_127>",
+  "<loc_128>",
+  "<loc_129>",
+  "<loc_130>",
+  "<loc_131>",
+  "<loc_132>",
+  "<loc_133>",
+  "<loc_134>",
+  "<loc_135>",
+  "<loc_136>",
+  "<loc_137>",
+  "<loc_138>",
+  "<loc_139>",
+  "<loc_140>",
+  "<loc_141>",
+  "<loc_142>",
+  "<loc_143>",
+  "<loc_144>",
+  "<loc_145>",
+  "<loc_146>",
+  "<loc_147>",
+  "<loc_148>",
+  "<loc_149>",
+  "<loc_150>",
+  "<loc_151>",
+  "<loc_152>",
+  "<loc_153>",
+  "<loc_154>",
+  "<loc_155>",
+  "<loc_156>",
+  "<loc_157>",
+  "<loc_158>",
+  "<loc_159>",
+  "<loc_160>",
+  "<loc_161>",
+  "<loc_162>",
+  "<loc_163>",
+  "<loc_164>",
+  "<loc_165>",
+  "<loc_166>",
+  "<loc_167>",
+  "<loc_168>",
+  "<loc_169>",
+  "<loc_170>",
+  "<loc_171>",
+  "<loc_172>",
+  "<loc_173>",
+  "<loc_174>",
+  "<loc_175>",
+  "<loc_176>",
+  "<loc_177>",
+  "<loc_178>",
+  "<loc_179>",
+  "<loc_180>",
+  "<loc_181>",
+  "<loc_182>",
+  "<loc_183>",
+  "<loc_184>",
+  "<loc_185>",
+  "<loc_186>",
+  "<loc_187>",
+  "<loc_188>",
+  "<loc_189>",
+  "<loc_190>",
+  "<loc_191>",
+  "<loc_192>",
+  "<loc_193>",
+  "<loc_194>",
+  "<loc_195>",
+  "<loc_196>",
+  "<loc_197>",
+  "<loc_198>",
+  "<loc_199>",
+  "<loc_200>",
+  "<loc_201>",
+  "<loc_202>",
+  "<loc_203>",
+  "<loc_204>",
+  "<loc_205>",
+  "<loc_206>",
+  "<loc_207>",
+  "<loc_208>",
+  "<loc_209>",
+  "<loc_210>",
+  "<loc_211>",
+  "<loc_212>",
+  "<loc_213>",
+  "<loc_214>",
+  "<loc_215>",
+  "<loc_216>",
+  "<loc_217>",
+  "<loc_218>",
+  "<loc_219>",
+  "<loc_220>",
+  "<loc_221>",
+  "<loc_222>",
+  "<loc_223>",
+  "<loc_224>",
+  "<loc_225>",
+  "<loc_226>",
+  "<loc_227>",
+  "<loc_228>",
+  "<loc_229>",
+  "<loc_230>",
+  "<loc_231>",
+  "<loc_232>",
+  "<loc_233>",
+  "<loc_234>",
+  "<loc_235>",
+  "<loc_236>",
+  "<loc_237>",
+  "<loc_238>",
+  "<loc_239>",
+  "<loc_240>",
+  "<loc_241>",
+  "<loc_242>",
+  "<loc_243>",
+  "<loc_244>",
+  "<loc_245>",
+  "<loc_246>",
+  "<loc_247>",
+  "<loc_248>",
+  "<loc_249>",
+  "<loc_250>",
+  "<loc_251>",
+  "<loc_252>",
+  "<loc_253>",
+  "<loc_254>",
+  "<loc_255>",
+  "<loc_256>",
+  "<loc_257>",
+  "<loc_258>",
+  "<loc_259>",
+  "<loc_260>",
+  "<loc_261>",
+  "<loc_262>",
+  "<loc_263>",
+  "<loc_264>",
+  "<loc_265>",
+  "<loc_266>",
+  "<loc_267>",
+  "<loc_268>",
+  "<loc_269>",
+  "<loc_270>",
+  "<loc_271>",
+  "<loc_272>",
+  "<loc_273>",
+  "<loc_274>",
+  "<loc_275>",
+  "<loc_276>",
+  "<loc_277>",
+  "<loc_278>",
+  "<loc_279>",
+  "<loc_280>",
+  "<loc_281>",
+  "<loc_282>",
+  "<loc_283>",
+  "<loc_284>",
+  "<loc_285>",
+  "<loc_286>",
+  "<loc_287>",
+  "<loc_288>",
+  "<loc_289>",
+  "<loc_290>",
+  "<loc_291>",
+  "<loc_292>",
+  "<loc_293>",
+  "<loc_294>",
+  "<loc_295>",
+  "<loc_296>",
+  "<loc_297>",
+  "<loc_298>",
+  "<loc_299>",
+  "<loc_300>",
+  "<loc_301>",
+  "<loc_302>",
+  "<loc_303>",
+  "<loc_304>",
+  "<loc_305>",
+  "<loc_306>",
+  "<loc_307>",
+  "<loc_308>",
+  "<loc_309>",
+  "<loc_310>",
+  "<loc_311>",
+  "<loc_312>",
+  "<loc_313>",
+  "<loc_314>",
+  "<loc_315>",
+  "<loc_316>",
+  "<loc_317>",
+  "<loc_318>",
+  "<loc_319>",
+  "<loc_320>",
+  "<loc_321>",
+  "<loc_322>",
+  "<loc_323>",
+  "<loc_324>",
+  "<loc_325>",
+  "<loc_326>",
+  "<loc_327>",
+  "<loc_328>",
+  "<loc_329>",
+  "<loc_330>",
+  "<loc_331>",
+  "<loc_332>",
+  "<loc_333>",
+  "<loc_334>",
+  "<loc_335>",
+  "<loc_336>",
+  "<loc_337>",
+  "<loc_338>",
+  "<loc_339>",
+  "<loc_340>",
+  "<loc_341>",
+  "<loc_342>",
+  "<loc_343>",
+  "<loc_344>",
+  "<loc_345>",
+  "<loc_346>",
+  "<loc_347>",
+  "<loc_348>",
+  "<loc_349>",
+  "<loc_350>",
+  "<loc_351>",
+  "<loc_352>",
+  "<loc_353>",
+  "<loc_354>",
+  "<loc_355>",
+  "<loc_356>",
+  "<loc_357>",
+  "<loc_358>",
+  "<loc_359>",
+  "<loc_360>",
+  "<loc_361>",
+  "<loc_362>",
+  "<loc_363>",
+  "<loc_364>",
+  "<loc_365>",
+  "<loc_366>",
+  "<loc_367>",
+  "<loc_368>",
+  "<loc_369>",
+  "<loc_370>",
+  "<loc_371>",
+  "<loc_372>",
+  "<loc_373>",
+  "<loc_374>",
+  "<loc_375>",
+  "<loc_376>",
+  "<loc_377>",
+  "<loc_378>",
+  "<loc_379>",
+  "<loc_380>",
+  "<loc_381>",
+  "<loc_382>",
+  "<loc_383>",
+  "<loc_384>",
+  "<loc_385>",
+  "<loc_386>",
+  "<loc_387>",
+  "<loc_388>",
+  "<loc_389>",
+  "<loc_390>",
+  "<loc_391>",
+  "<loc_392>",
+  "<loc_393>",
+  "<loc_394>",
+  "<loc_395>",
+  "<loc_396>",
+  "<loc_397>",
+  "<loc_398>",
+  "<loc_399>",
+  "<loc_400>",
+  "<loc_401>",
+  "<loc_402>",
+  "<loc_403>",
+  "<loc_404>",
+  "<loc_405>",
+  "<loc_406>",
+  "<loc_407>",
+  "<loc_408>",
+  "<loc_409>",
+  "<loc_410>",
+  "<loc_411>",
+  "<loc_412>",
+  "<loc_413>",
+  "<loc_414>",
+  "<loc_415>",
+  "<loc_416>",
+  "<loc_417>",
+  "<loc_418>",
+  "<loc_419>",
+  "<loc_420>",
+  "<loc_421>",
+  "<loc_422>",
+  "<loc_423>",
+  "<loc_424>",
+  "<loc_425>",
+  "<loc_426>",
+  "<loc_427>",
+  "<loc_428>",
+  "<loc_429>",
+  "<loc_430>",
+  "<loc_431>",
+  "<loc_432>",
+  "<loc_433>",
+  "<loc_434>",
+  "<loc_435>",
+  "<loc_436>",
+  "<loc_437>",
+  "<loc_438>",
+  "<loc_439>",
+  "<loc_440>",
+  "<loc_441>",
+  "<loc_442>",
+  "<loc_443>",
+  "<loc_444>",
+  "<loc_445>",
+  "<loc_446>",
+  "<loc_447>",
+  "<loc_448>",
+  "<loc_449>",
+  "<loc_450>",
+  "<loc_451>",
+  "<loc_452>",
+  "<loc_453>",
+  "<loc_454>",
+  "<loc_455>",
+  "<loc_456>",
+  "<loc_457>",
+  "<loc_458>",
+  "<loc_459>",
+  "<loc_460>",
+  "<loc_461>",
+  "<loc_462>",
+  "<loc_463>",
+  "<loc_464>",
+  "<loc_465>",
+  "<loc_466>",
+  "<loc_467>",
+  "<loc_468>",
+  "<loc_469>",
+  "<loc_470>",
+  "<loc_471>",
+  "<loc_472>",
+  "<loc_473>",
+  "<loc_474>",
+  "<loc_475>",
+  "<loc_476>",
+  "<loc_477>",
+  "<loc_478>",
+  "<loc_479>",
+  "<loc_480>",
+  "<loc_481>",
+  "<loc_482>",
+  "<loc_483>",
+  "<loc_484>",
+  "<loc_485>",
+  "<loc_486>",
+  "<loc_487>",
+  "<loc_488>",
+  "<loc_489>",
+  "<loc_490>",
+  "<loc_491>",
+  "<loc_492>",
+  "<loc_493>",
+  "<loc_494>",
+  "<loc_495>",
+  "<loc_496>",
+  "<loc_497>",
+  "<loc_498>",
+  "<loc_499>",
+  "<loc_500>",
+  "<loc_501>",
+  "<loc_502>",
+  "<loc_503>",
+  "<loc_504>",
+  "<loc_505>",
+  "<loc_506>",
+  "<loc_507>",
+  "<loc_508>",
+  "<loc_509>",
+  "<loc_510>",
+  "<loc_511>",
+  "<loc_512>",
+  "<loc_513>",
+  "<loc_514>",
+  "<loc_515>",
+  "<loc_516>",
+  "<loc_517>",
+  "<loc_518>",
+  "<loc_519>",
+  "<loc_520>",
+  "<loc_521>",
+  "<loc_522>",
+  "<loc_523>",
+  "<loc_524>",
+  "<loc_525>",
+  "<loc_526>",
+  "<loc_527>",
+  "<loc_528>",
+  "<loc_529>",
+  "<loc_530>",
+  "<loc_531>",
+  "<loc_532>",
+  "<loc_533>",
+  "<loc_534>",
+  "<loc_535>",
+  "<loc_536>",
+  "<loc_537>",
+  "<loc_538>",
+  "<loc_539>",
+  "<loc_540>",
+  "<loc_541>",
+  "<loc_542>",
+  "<loc_543>",
+  "<loc_544>",
+  "<loc_545>",
+  "<loc_546>",
+  "<loc_547>",
+  "<loc_548>",
+  "<loc_549>",
+  "<loc_550>",
+  "<loc_551>",
+  "<loc_552>",
+  "<loc_553>",
+  "<loc_554>",
+  "<loc_555>",
+  "<loc_556>",
+  "<loc_557>",
+  "<loc_558>",
+  "<loc_559>",
+  "<loc_560>",
+  "<loc_561>",
+  "<loc_562>",
+  "<loc_563>",
+  "<loc_564>",
+  "<loc_565>",
+  "<loc_566>",
+  "<loc_567>",
+  "<loc_568>",
+  "<loc_569>",
+  "<loc_570>",
+  "<loc_571>",
+  "<loc_572>",
+  "<loc_573>",
+  "<loc_574>",
+  "<loc_575>",
+  "<loc_576>",
+  "<loc_577>",
+  "<loc_578>",
+  "<loc_579>",
+  "<loc_580>",
+  "<loc_581>",
+  "<loc_582>",
+  "<loc_583>",
+  "<loc_584>",
+  "<loc_585>",
+  "<loc_586>",
+  "<loc_587>",
+  "<loc_588>",
+  "<loc_589>",
+  "<loc_590>",
+  "<loc_591>",
+  "<loc_592>",
+  "<loc_593>",
+  "<loc_594>",
+  "<loc_595>",
+  "<loc_596>",
+  "<loc_597>",
+  "<loc_598>",
+  "<loc_599>",
+  "<loc_600>",
+  "<loc_601>",
+  "<loc_602>",
+  "<loc_603>",
+  "<loc_604>",
+  "<loc_605>",
+  "<loc_606>",
+  "<loc_607>",
+  "<loc_608>",
+  "<loc_609>",
+  "<loc_610>",
+  "<loc_611>",
+  "<loc_612>",
+  "<loc_613>",
+  "<loc_614>",
+  "<loc_615>",
+  "<loc_616>",
+  "<loc_617>",
+  "<loc_618>",
+  "<loc_619>",
+  "<loc_620>",
+  "<loc_621>",
+  "<loc_622>",
+  "<loc_623>",
+  "<loc_624>",
+  "<loc_625>",
+  "<loc_626>",
+  "<loc_627>",
+  "<loc_628>",
+  "<loc_629>",
+  "<loc_630>",
+  "<loc_631>",
+  "<loc_632>",
+  "<loc_633>",
+  "<loc_634>",
+  "<loc_635>",
+  "<loc_636>",
+  "<loc_637>",
+  "<loc_638>",
+  "<loc_639>",
+  "<loc_640>",
+  "<loc_641>",
+  "<loc_642>",
+  "<loc_643>",
+  "<loc_644>",
+  "<loc_645>",
+  "<loc_646>",
+  "<loc_647>",
+  "<loc_648>",
+  "<loc_649>",
+  "<loc_650>",
+  "<loc_651>",
+  "<loc_652>",
+  "<loc_653>",
9920
+ "<loc_654>",
9921
+ "<loc_655>",
9922
+ "<loc_656>",
9923
+ "<loc_657>",
9924
+ "<loc_658>",
9925
+ "<loc_659>",
9926
+ "<loc_660>",
9927
+ "<loc_661>",
9928
+ "<loc_662>",
9929
+ "<loc_663>",
9930
+ "<loc_664>",
9931
+ "<loc_665>",
9932
+ "<loc_666>",
9933
+ "<loc_667>",
9934
+ "<loc_668>",
9935
+ "<loc_669>",
9936
+ "<loc_670>",
9937
+ "<loc_671>",
9938
+ "<loc_672>",
9939
+ "<loc_673>",
9940
+ "<loc_674>",
9941
+ "<loc_675>",
9942
+ "<loc_676>",
9943
+ "<loc_677>",
9944
+ "<loc_678>",
9945
+ "<loc_679>",
9946
+ "<loc_680>",
9947
+ "<loc_681>",
9948
+ "<loc_682>",
9949
+ "<loc_683>",
9950
+ "<loc_684>",
9951
+ "<loc_685>",
9952
+ "<loc_686>",
9953
+ "<loc_687>",
9954
+ "<loc_688>",
9955
+ "<loc_689>",
9956
+ "<loc_690>",
9957
+ "<loc_691>",
9958
+ "<loc_692>",
9959
+ "<loc_693>",
9960
+ "<loc_694>",
9961
+ "<loc_695>",
9962
+ "<loc_696>",
9963
+ "<loc_697>",
9964
+ "<loc_698>",
9965
+ "<loc_699>",
9966
+ "<loc_700>",
9967
+ "<loc_701>",
9968
+ "<loc_702>",
9969
+ "<loc_703>",
9970
+ "<loc_704>",
9971
+ "<loc_705>",
9972
+ "<loc_706>",
9973
+ "<loc_707>",
9974
+ "<loc_708>",
9975
+ "<loc_709>",
9976
+ "<loc_710>",
9977
+ "<loc_711>",
9978
+ "<loc_712>",
9979
+ "<loc_713>",
9980
+ "<loc_714>",
9981
+ "<loc_715>",
9982
+ "<loc_716>",
9983
+ "<loc_717>",
9984
+ "<loc_718>",
9985
+ "<loc_719>",
9986
+ "<loc_720>",
9987
+ "<loc_721>",
9988
+ "<loc_722>",
9989
+ "<loc_723>",
9990
+ "<loc_724>",
9991
+ "<loc_725>",
9992
+ "<loc_726>",
9993
+ "<loc_727>",
9994
+ "<loc_728>",
9995
+ "<loc_729>",
9996
+ "<loc_730>",
9997
+ "<loc_731>",
9998
+ "<loc_732>",
9999
+ "<loc_733>",
10000
+ "<loc_734>",
10001
+ "<loc_735>",
10002
+ "<loc_736>",
10003
+ "<loc_737>",
10004
+ "<loc_738>",
10005
+ "<loc_739>",
10006
+ "<loc_740>",
10007
+ "<loc_741>",
10008
+ "<loc_742>",
10009
+ "<loc_743>",
10010
+ "<loc_744>",
10011
+ "<loc_745>",
10012
+ "<loc_746>",
10013
+ "<loc_747>",
10014
+ "<loc_748>",
10015
+ "<loc_749>",
10016
+ "<loc_750>",
10017
+ "<loc_751>",
10018
+ "<loc_752>",
10019
+ "<loc_753>",
10020
+ "<loc_754>",
10021
+ "<loc_755>",
10022
+ "<loc_756>",
10023
+ "<loc_757>",
10024
+ "<loc_758>",
10025
+ "<loc_759>",
10026
+ "<loc_760>",
10027
+ "<loc_761>",
10028
+ "<loc_762>",
10029
+ "<loc_763>",
10030
+ "<loc_764>",
10031
+ "<loc_765>",
10032
+ "<loc_766>",
10033
+ "<loc_767>",
10034
+ "<loc_768>",
10035
+ "<loc_769>",
10036
+ "<loc_770>",
10037
+ "<loc_771>",
10038
+ "<loc_772>",
10039
+ "<loc_773>",
10040
+ "<loc_774>",
10041
+ "<loc_775>",
10042
+ "<loc_776>",
10043
+ "<loc_777>",
10044
+ "<loc_778>",
10045
+ "<loc_779>",
10046
+ "<loc_780>",
10047
+ "<loc_781>",
10048
+ "<loc_782>",
10049
+ "<loc_783>",
10050
+ "<loc_784>",
10051
+ "<loc_785>",
10052
+ "<loc_786>",
10053
+ "<loc_787>",
10054
+ "<loc_788>",
10055
+ "<loc_789>",
10056
+ "<loc_790>",
10057
+ "<loc_791>",
10058
+ "<loc_792>",
10059
+ "<loc_793>",
10060
+ "<loc_794>",
10061
+ "<loc_795>",
10062
+ "<loc_796>",
10063
+ "<loc_797>",
10064
+ "<loc_798>",
10065
+ "<loc_799>",
10066
+ "<loc_800>",
10067
+ "<loc_801>",
10068
+ "<loc_802>",
10069
+ "<loc_803>",
10070
+ "<loc_804>",
10071
+ "<loc_805>",
10072
+ "<loc_806>",
10073
+ "<loc_807>",
10074
+ "<loc_808>",
10075
+ "<loc_809>",
10076
+ "<loc_810>",
10077
+ "<loc_811>",
10078
+ "<loc_812>",
10079
+ "<loc_813>",
10080
+ "<loc_814>",
10081
+ "<loc_815>",
10082
+ "<loc_816>",
10083
+ "<loc_817>",
10084
+ "<loc_818>",
10085
+ "<loc_819>",
10086
+ "<loc_820>",
10087
+ "<loc_821>",
10088
+ "<loc_822>",
10089
+ "<loc_823>",
10090
+ "<loc_824>",
10091
+ "<loc_825>",
10092
+ "<loc_826>",
10093
+ "<loc_827>",
10094
+ "<loc_828>",
10095
+ "<loc_829>",
10096
+ "<loc_830>",
10097
+ "<loc_831>",
10098
+ "<loc_832>",
10099
+ "<loc_833>",
10100
+ "<loc_834>",
10101
+ "<loc_835>",
10102
+ "<loc_836>",
10103
+ "<loc_837>",
10104
+ "<loc_838>",
10105
+ "<loc_839>",
10106
+ "<loc_840>",
10107
+ "<loc_841>",
10108
+ "<loc_842>",
10109
+ "<loc_843>",
10110
+ "<loc_844>",
10111
+ "<loc_845>",
10112
+ "<loc_846>",
10113
+ "<loc_847>",
10114
+ "<loc_848>",
10115
+ "<loc_849>",
10116
+ "<loc_850>",
10117
+ "<loc_851>",
10118
+ "<loc_852>",
10119
+ "<loc_853>",
10120
+ "<loc_854>",
10121
+ "<loc_855>",
10122
+ "<loc_856>",
10123
+ "<loc_857>",
10124
+ "<loc_858>",
10125
+ "<loc_859>",
10126
+ "<loc_860>",
10127
+ "<loc_861>",
10128
+ "<loc_862>",
10129
+ "<loc_863>",
10130
+ "<loc_864>",
10131
+ "<loc_865>",
10132
+ "<loc_866>",
10133
+ "<loc_867>",
10134
+ "<loc_868>",
10135
+ "<loc_869>",
10136
+ "<loc_870>",
10137
+ "<loc_871>",
10138
+ "<loc_872>",
10139
+ "<loc_873>",
10140
+ "<loc_874>",
10141
+ "<loc_875>",
10142
+ "<loc_876>",
10143
+ "<loc_877>",
10144
+ "<loc_878>",
10145
+ "<loc_879>",
10146
+ "<loc_880>",
10147
+ "<loc_881>",
10148
+ "<loc_882>",
10149
+ "<loc_883>",
10150
+ "<loc_884>",
10151
+ "<loc_885>",
10152
+ "<loc_886>",
10153
+ "<loc_887>",
10154
+ "<loc_888>",
10155
+ "<loc_889>",
10156
+ "<loc_890>",
10157
+ "<loc_891>",
10158
+ "<loc_892>",
10159
+ "<loc_893>",
10160
+ "<loc_894>",
10161
+ "<loc_895>",
10162
+ "<loc_896>",
10163
+ "<loc_897>",
10164
+ "<loc_898>",
10165
+ "<loc_899>",
10166
+ "<loc_900>",
10167
+ "<loc_901>",
10168
+ "<loc_902>",
10169
+ "<loc_903>",
10170
+ "<loc_904>",
10171
+ "<loc_905>",
10172
+ "<loc_906>",
10173
+ "<loc_907>",
10174
+ "<loc_908>",
10175
+ "<loc_909>",
10176
+ "<loc_910>",
10177
+ "<loc_911>",
10178
+ "<loc_912>",
10179
+ "<loc_913>",
10180
+ "<loc_914>",
10181
+ "<loc_915>",
10182
+ "<loc_916>",
10183
+ "<loc_917>",
10184
+ "<loc_918>",
10185
+ "<loc_919>",
10186
+ "<loc_920>",
10187
+ "<loc_921>",
10188
+ "<loc_922>",
10189
+ "<loc_923>",
10190
+ "<loc_924>",
10191
+ "<loc_925>",
10192
+ "<loc_926>",
10193
+ "<loc_927>",
10194
+ "<loc_928>",
10195
+ "<loc_929>",
10196
+ "<loc_930>",
10197
+ "<loc_931>",
10198
+ "<loc_932>",
10199
+ "<loc_933>",
10200
+ "<loc_934>",
10201
+ "<loc_935>",
10202
+ "<loc_936>",
10203
+ "<loc_937>",
10204
+ "<loc_938>",
10205
+ "<loc_939>",
10206
+ "<loc_940>",
10207
+ "<loc_941>",
10208
+ "<loc_942>",
10209
+ "<loc_943>",
10210
+ "<loc_944>",
10211
+ "<loc_945>",
10212
+ "<loc_946>",
10213
+ "<loc_947>",
10214
+ "<loc_948>",
10215
+ "<loc_949>",
10216
+ "<loc_950>",
10217
+ "<loc_951>",
10218
+ "<loc_952>",
10219
+ "<loc_953>",
10220
+ "<loc_954>",
10221
+ "<loc_955>",
10222
+ "<loc_956>",
10223
+ "<loc_957>",
10224
+ "<loc_958>",
10225
+ "<loc_959>",
10226
+ "<loc_960>",
10227
+ "<loc_961>",
10228
+ "<loc_962>",
10229
+ "<loc_963>",
10230
+ "<loc_964>",
10231
+ "<loc_965>",
10232
+ "<loc_966>",
10233
+ "<loc_967>",
10234
+ "<loc_968>",
10235
+ "<loc_969>",
10236
+ "<loc_970>",
10237
+ "<loc_971>",
10238
+ "<loc_972>",
10239
+ "<loc_973>",
10240
+ "<loc_974>",
10241
+ "<loc_975>",
10242
+ "<loc_976>",
10243
+ "<loc_977>",
10244
+ "<loc_978>",
10245
+ "<loc_979>",
10246
+ "<loc_980>",
10247
+ "<loc_981>",
10248
+ "<loc_982>",
10249
+ "<loc_983>",
10250
+ "<loc_984>",
10251
+ "<loc_985>",
10252
+ "<loc_986>",
10253
+ "<loc_987>",
10254
+ "<loc_988>",
10255
+ "<loc_989>",
10256
+ "<loc_990>",
10257
+ "<loc_991>",
10258
+ "<loc_992>",
10259
+ "<loc_993>",
10260
+ "<loc_994>",
10261
+ "<loc_995>",
10262
+ "<loc_996>",
10263
+ "<loc_997>",
10264
+ "<loc_998>",
10265
+ "<loc_999>",
10266
+ "<cap>",
10267
+ "</cap>",
10268
+ "<ncap>",
10269
+ "</ncap>",
10270
+ "<dcap>",
10271
+ "</dcap>",
10272
+ "<grounding>",
10273
+ "</grounding>",
10274
+ "<seg>",
10275
+ "</seg>",
10276
+ "<sep>",
10277
+ "<region_cap>",
10278
+ "</region_cap>",
10279
+ "<region_to_desciption>",
10280
+ "</region_to_desciption>",
10281
+ "<proposal>",
10282
+ "</proposal>",
10283
+ "<poly>",
10284
+ "</poly>",
10285
  "<and>"
10286
  ],
10287
+ "auto_map": {
10288
+ "AutoProcessor": "processing_florence2.Florence2Processor"
10289
+ },
10290
  "bos_token": "<s>",
10291
  "clean_up_tokenization_spaces": false,
10292
  "cls_token": "<s>",
10293
  "eos_token": "</s>",
10294
  "errors": "replace",
10295
  "mask_token": "<mask>",
10296
+ "max_length": null,
10297
  "model_max_length": 1024,
10298
+ "pad_to_multiple_of": null,
10299
  "pad_token": "<pad>",
10300
+ "pad_token_type_id": 0,
10301
+ "padding_side": "right",
10302
  "processor_class": "Florence2Processor",
10303
  "sep_token": "</s>",
10304
  "tokenizer_class": "BartTokenizer",