File size: 16,121 Bytes
18ddfe2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions used by target assigner."""

import tensorflow.compat.v1 as tf

from object_detection.utils import shape_utils


def image_shape_to_grids(height, width):
  """Builds per-pixel coordinate grids for an image of the given size.

  Args:
    height: The height of the image.
    width: The width of the image.

  Returns:
    A (y_grid, x_grid) tuple of float32 tensors, both with shape
    [height, width]:
      y_grid: Holds the y-coordinate (row index) of every pixel.
      x_grid: Holds the x-coordinate (column index) of every pixel.
  """
  row_positions = tf.range(tf.cast(height, tf.float32), dtype=tf.float32)
  col_positions = tf.range(tf.cast(width, tf.float32), dtype=tf.float32)
  # With 'xy' indexing the outputs are shaped
  # [len(second input), len(first input)], i.e. [height, width] here.
  x_grid, y_grid = tf.meshgrid(col_positions, row_positions, indexing='xy')
  return y_grid, x_grid


def coordinates_to_heatmap(y_grid,
                           x_grid,
                           y_coordinates,
                           x_coordinates,
                           sigma,
                           channel_onehot,
                           channel_weights=None):
  """Renders point coordinates into per-channel Gaussian heatmap targets.

  Every point is splatted onto the output grid with a Gaussian kernel centered
  at the floor of its coordinates, and the per-point maps that share a channel
  are merged by taking their maximum. This function can be used by both object
  detection and keypoint estimation tasks. For object detection, the "channel"
  refers to the object class. For keypoint estimation, the "channel" refers to
  the keypoint type.

  Args:
    y_grid: A 2D tensor with shape [height, width] which contains the grid
      y-coordinates given in the (output) image dimensions.
    x_grid: A 2D tensor with shape [height, width] which contains the grid
      x-coordinates given in the (output) image dimensions.
    y_coordinates: A 1D tensor with shape [num_instances] representing the
      y-coordinates of the instances in the output space coordinates.
    x_coordinates: A 1D tensor with shape [num_instances] representing the
      x-coordinates of the instances in the output space coordinates.
    sigma: A 1D tensor with shape [num_instances] representing the standard
      deviation of the Gaussian kernel to be applied to the point.
    channel_onehot: A 2D tensor with shape [num_instances, num_channels]
      representing the one-hot encoded channel labels for each point.
    channel_weights: A 1D tensor with shape [num_instances] corresponding to the
      weight of each instance. If None, no weighting is applied.

  Returns:
    heatmap: A tensor of size [height, width, num_channels] representing the
      heatmap. Output (height, width) match the dimensions of the input grids.
  """
  onehot_shape = shape_utils.combined_static_and_dynamic_shape(channel_onehot)
  num_instances, num_channels = onehot_shape

  # Grids become [height, width, 1] so that they broadcast against the
  # [num_instances] coordinates, giving [height, width, num_instances].
  dx = tf.expand_dims(x_grid, 2) - tf.math.floor(x_coordinates)
  dy = tf.expand_dims(y_grid, 2) - tf.math.floor(y_coordinates)
  per_instance_gaussian = tf.exp(-(dx**2 + dy**2) / (2 * sigma * sigma))

  # Route each instance's Gaussian to its channel(s):
  # [height, width, num_instances, 1] * [1, 1, num_instances, num_channels].
  per_instance_per_channel = (
      tf.expand_dims(per_instance_gaussian, axis=-1) *
      tf.reshape(channel_onehot, (1, 1, num_instances, num_channels)))

  if channel_weights is not None:
    per_instance_per_channel = per_instance_per_channel * tf.reshape(
        channel_weights, (1, 1, num_instances, 1))

  # Collapse the "instance" dimension with a maximum so that instances of the
  # same channel share a single map.
  merged = tf.reduce_max(per_instance_per_channel, axis=2)

  # The maximum over zero instances is -inf; clamp so the result stays >= 0.
  return tf.maximum(merged, 0)


def _align_target_with_source(target, source, source_shape):
  """Returns `target` shaped like the source, or `source` if target is None.

  Args:
    target: A tensor of target coordinates, or None.
    source: The source coordinates tensor used when `target` is None.
    source_shape: The shape list of the source coordinates as returned by
      shape_utils.combined_static_and_dynamic_shape.

  Returns:
    `source` when `target` is None; `target` tiled along a new trailing
    neighbor dimension when the source is rank 2 and the target is rank 1;
    otherwise `target` unchanged.

  Raises:
    ValueError: if the target shape neither matches the source shape nor can be
      tiled to it.
  """
  if target is None:
    return source
  target_shape = shape_utils.combined_static_and_dynamic_shape(target)
  if len(source_shape) == 2 and len(target_shape) == 1:
    # One target per point: repeat it for every neighboring source pixel.
    _, num_neighbors = source_shape
    return tf.tile(tf.expand_dims(target, -1), multiples=[1, num_neighbors])
  if source_shape != target_shape:
    raise ValueError('Inconsistent source and target shape.')
  return target


def compute_floor_offsets_with_indices(y_source,
                                       x_source,
                                       y_target=None,
                                       x_target=None):
  """Computes offsets from floored source coordinates to target coordinates.

  This function computes the offsets from source coordinates ("floored" as if
  they were put on the grids) to target coordinates. Note that the input
  coordinates should be the "absolute" coordinates in terms of the output image
  dimensions as opposed to the normalized coordinates (i.e. values in [0, 1]).
  If the input y and x source have the second dimension (representing the
  neighboring pixels), then the offsets are computed from each of the
  neighboring pixels to their corresponding target (first dimension).

  Args:
    y_source: A tensor with shape [num_points] (or [num_points, num_neighbors])
      representing the absolute y-coordinates (in the output image space) of the
      source points.
    x_source: A tensor with shape [num_points] (or [num_points, num_neighbors])
      representing the absolute x-coordinates (in the output image space) of the
      source points.
    y_target: A tensor with shape [num_points] representing the absolute
      y-coordinates (in the output image space) of the target points. If not
      provided, then y_source is used as the targets.
    x_target: A tensor with shape [num_points] representing the absolute
      x-coordinates (in the output image space) of the target points. If not
      provided, then x_source is used as the targets.

  Returns:
    A tuple of two tensors:
      offsets: A tensor with shape [num_points, 2] (or
        [num_points, num_neighbors, 2]) representing the offsets of each input
        point.
      indices: A tensor with shape [num_points, 2] (or
        [num_points, num_neighbors, 2]) representing the indices of where the
        offsets should be retrieved in the output image dimension space.

  Raises:
    ValueError: source and target shapes have unexpected values.
  """
  y_source_floored = tf.floor(y_source)
  x_source_floored = tf.floor(x_source)

  # Each target falls back to its own source independently, so supplying only
  # one of y_target / x_target behaves as documented instead of failing on the
  # missing one. The shape of y_source is the reference for both axes.
  source_shape = shape_utils.combined_static_and_dynamic_shape(y_source)
  y_target = _align_target_with_source(y_target, y_source, source_shape)
  x_target = _align_target_with_source(x_target, x_source, source_shape)

  y_offset = y_target - y_source_floored
  x_offset = x_target - x_source_floored

  y_source_indices = tf.cast(y_source_floored, tf.int32)
  x_source_indices = tf.cast(x_source_floored, tf.int32)

  # The last dimension is ordered (y, x) for both outputs.
  indices = tf.stack([y_source_indices, x_source_indices], axis=-1)
  offsets = tf.stack([y_offset, x_offset], axis=-1)
  return offsets, indices


def get_valid_keypoint_mask_for_class(keypoint_coordinates,
                                      class_id,
                                      class_onehot,
                                      class_weights=None,
                                      keypoint_indices=None):
  """Builds a keypoint weight mask for one class and zeroes out NaN keypoints.

  For a given task, we may want to only consider a subset of instances or
  keypoints. The returned mask (expressed as weights) marks the elements that
  should be considered, based on the class of each instance and, optionally,
  the selected keypoint indices. Keypoints whose first coordinate is NaN are
  masked out as well.

  Args:
    keypoint_coordinates: A float tensor with shape [num_instances,
      num_keypoints, 2] which contains the coordinates of each keypoint.
    class_id: An integer representing the target class id to be selected.
    class_onehot: A 2D tensor of shape [num_instances, num_classes] that
      represents the onehot (or k-hot) encoding of the class for each instance.
    class_weights: A 1D tensor of shape [num_instances] that represents the
      weight of each instance. If not provided, all instances are weighted
      equally.
    keypoint_indices: A list of integers representing the keypoint indices used
      to select the values on the keypoint dimension. If provided, the output
      dimension will be [num_instances, len(keypoint_indices)]

  Returns:
    A tuple of tensors:
      mask: A float tensor of shape [num_instances, K], where K is num_keypoints
        or len(keypoint_indices) if provided. An entry is the instance's
        class_onehot value for class_id when the keypoint is not NaN and 0
        otherwise, multiplied by the instance's class weight when class_weights
        is provided.
      keypoints_nan_to_zeros: Same as input keypoints with the NaN values
        replaced by zeros and selected columns corresponding to the
        keypoint_indices (if provided). Its leading two dimensions match the
        output mask.
  """
  num_keypoints = tf.shape(keypoint_coordinates)[1]

  def _repeat_per_keypoint(per_instance_values):
    """Tiles a [num_instances] tensor to [num_instances, num_keypoints]."""
    return tf.tile(
        tf.expand_dims(per_instance_values, axis=-1),
        multiples=[1, num_keypoints])

  has_value = tf.math.logical_not(tf.math.is_nan(keypoint_coordinates))

  # Only the first coordinate of each keypoint is consulted for the NaN test.
  mask = (
      _repeat_per_keypoint(class_onehot[:, class_id]) *
      tf.cast(has_value[:, :, 0], dtype=tf.float32))
  if class_weights is not None:
    mask = mask * _repeat_per_keypoint(class_weights)

  keypoints_nan_to_zeros = tf.where(has_value, keypoint_coordinates,
                                    tf.zeros_like(keypoint_coordinates))

  if keypoint_indices is None:
    return mask, keypoints_nan_to_zeros

  # Keep only the requested keypoint columns in both outputs.
  selected_mask = tf.gather(mask, indices=keypoint_indices, axis=1)
  selected_keypoints = tf.gather(
      keypoints_nan_to_zeros, indices=keypoint_indices, axis=1)
  return selected_mask, selected_keypoints


def blackout_pixel_weights_by_box_regions(height, width, boxes, blackout):
  """Blackout the pixel weights in the target box regions.

  This function is used to generate the pixel weight mask (usually in the output
  image dimension). The mask is to ignore some regions when computing loss.

  Args:
    height: int, height of the (output) image.
    width: int, width of the (output) image.
    boxes: A float tensor with shape [num_instances, 4] holding, per box,
      [y_min, x_min, y_max, x_max] expressed in the same pixel coordinates as
      the [height, width] grid. Box boundaries are inclusive.
    blackout: A boolean tensor with shape [num_instances] indicating whether to
      blackout (zero-out) the weights within the box regions.

  Returns:
    A float tensor with shape [height, width] where all values within the
    regions of the blackout boxes are 0.0 and 1.0 else where.
  """
  num_instances, _ = shape_utils.combined_static_and_dynamic_shape(boxes)
  # If no annotation instance is provided, return all ones (instead of
  # unexpected values) to avoid NaN loss value.
  if num_instances == 0:
    return tf.ones([height, width], dtype=tf.float32)

  (y_grid, x_grid) = image_shape_to_grids(height, width)
  # Grids: [1, height, width]; box edges: [num_instances, 1, 1].
  y_grid = tf.expand_dims(y_grid, axis=0)
  x_grid = tf.expand_dims(x_grid, axis=0)
  y_min = tf.expand_dims(boxes[:, 0:1], axis=-1)
  x_min = tf.expand_dims(boxes[:, 1:2], axis=-1)
  y_max = tf.expand_dims(boxes[:, 2:3], axis=-1)
  x_max = tf.expand_dims(boxes[:, 3:], axis=-1)

  # Make the mask with all 1.0 in the box regions.
  # Shape: [num_instances, height, width]
  in_boxes = tf.cast(
      tf.logical_and(
          tf.logical_and(y_grid >= y_min, y_grid <= y_max),
          tf.logical_and(x_grid >= x_min, x_grid <= x_max)),
      dtype=tf.float32)

  # Shape: [num_instances, height, width]
  blackout = tf.tile(
      tf.expand_dims(tf.expand_dims(blackout, axis=-1), axis=-1),
      [1, height, width])

  # Select only the boxes specified by blackout.
  selected_in_boxes = tf.where(blackout, in_boxes, tf.zeros_like(in_boxes))
  out_boxes = tf.reduce_max(selected_in_boxes, axis=0)
  # The Python-level check above only catches a statically known empty input.
  # When the instance count is only known at run time and happens to be zero,
  # the maximum over the empty instance axis is -inf; clamp it so the final
  # weights are all ones rather than +inf. For non-empty input the values are
  # already in {0, 1}, so this is a no-op.
  out_boxes = tf.maximum(out_boxes, 0.0)
  out_boxes = tf.ones_like(out_boxes) - out_boxes
  return out_boxes


def _get_yx_indices_offset_by_radius(radius):
  """Returns the integer (y, x) offsets lying within `radius` of the origin.

  Args:
    radius: int, the inclusive Euclidean radius, in pixels.

  Returns:
    A (y_offsets, x_offsets) tuple of 1D float32 tensors of equal length, listed
    in row-major order (y outermost, x innermost).
  """
  steps = range(-radius, radius + 1)
  max_squared_distance = radius ** 2
  offsets_in_disk = [(dy, dx)
                     for dy in steps
                     for dx in steps
                     if dx ** 2 + dy ** 2 <= max_squared_distance]
  y_offsets = [dy for dy, _ in offsets_in_disk]
  x_offsets = [dx for _, dx in offsets_in_disk]
  return (tf.constant(y_offsets, dtype=tf.float32),
          tf.constant(x_offsets, dtype=tf.float32))


def get_surrounding_grids(height, width, y_coordinates, x_coordinates, radius):
  """Gets the indices of the pixels surrounding the input y, x coordinates.

  For every input point, this returns the pixel at the floor of its coordinates
  together with all pixels whose offset from that pixel lies within `radius`.
  With radius 0 only the floor pixel itself is returned. For example, for the
  input coordinate [2.1, 3.5] and radius 1, five pixel indices are returned:
  [1, 3], [2, 2], [2, 3], [2, 4] and [3, 3]. Neighbors that fall outside of the
  valid image region are reported as [0, 0] with their "valid" entry set to
  False.

  Args:
    height: int, the height of the output image.
    width: int, the width of the output image.
    y_coordinates: A tensor with shape [num_points] representing the absolute
      y-coordinates (in the output image space) of the points.
    x_coordinates: A tensor with shape [num_points] representing the absolute
      x-coordinates (in the output image space) of the points.
    radius: int, the radius of the neighboring pixels to be considered and
      returned. If set to 0, then only the pixel indices corresponding to the
      floor of the input coordinates will be returned.

  Returns:
    A tuple of three tensors:
      y_indices: A [num_points, num_neighbors] float tensor representing the
        pixel y indices corresponding to the input points within radius. The
        "num_neighbors" is determined by the size of the radius.
      x_indices: A [num_points, num_neighbors] float tensor representing the
        pixel x indices corresponding to the input points within radius. The
        "num_neighbors" is determined by the size of the radius.
      valid: A [num_points, num_neighbors] boolean tensor representing whether
        each returned index is in valid image region or not.
  """
  relative_y, relative_x = _get_yx_indices_offset_by_radius(radius)

  # [num_points, 1] floor pixels + [1, num_neighbors] offsets broadcast to
  # [num_points, num_neighbors].
  neighbor_y = (
      tf.expand_dims(tf.math.floor(y_coordinates), axis=-1) +
      tf.expand_dims(relative_y, axis=0))
  neighbor_x = (
      tf.expand_dims(tf.math.floor(x_coordinates), axis=-1) +
      tf.expand_dims(relative_x, axis=0))

  inside_columns = tf.logical_and(neighbor_x >= 0, neighbor_x < width)
  inside_rows = tf.logical_and(neighbor_y >= 0, neighbor_y < height)
  valid = tf.logical_and(inside_columns, inside_rows)

  # Out-of-image neighbors are replaced by index 0 on both axes.
  fallback = tf.zeros_like(neighbor_y)
  return (tf.where(valid, neighbor_y, fallback),
          tf.where(valid, neighbor_x, fallback),
          valid)