|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
r"""Provides DeepLab model definition and helper functions. |
|
|
|
DeepLab is a deep learning system for semantic image segmentation with |
|
the following features: |
|
|
|
(1) Atrous convolution to explicitly control the resolution at which |
|
feature responses are computed within Deep Convolutional Neural Networks. |
|
|
|
(2) Atrous spatial pyramid pooling (ASPP) to robustly segment objects at |
|
multiple scales with filters at multiple sampling rates and effective |
|
fields-of-views. |
|
|
|
(3) ASPP module augmented with image-level feature and batch normalization. |
|
|
|
(4) A simple yet effective decoder module to recover the object boundaries. |
|
|
|
See the following papers for more details: |
|
|
|
"Encoder-Decoder with Atrous Separable Convolution for Semantic Image |
|
Segmentation" |
|
Liang-Chieh Chen, Yukun Zhu, George Papandreou, Florian Schroff, Hartwig Adam. |
|
(https://arxiv.org/abs/1802.02611)
|
|
|
"Rethinking Atrous Convolution for Semantic Image Segmentation," |
|
Liang-Chieh Chen, George Papandreou, Florian Schroff, Hartwig Adam |
|
(https://arxiv.org/abs/1706.05587) |
|
|
|
"DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, |
|
Atrous Convolution, and Fully Connected CRFs", |
|
Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy, |
|
Alan L Yuille (* equal contribution) |
|
(https://arxiv.org/abs/1606.00915) |
|
|
|
"Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected |
|
CRFs" |
|
Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy, |
|
Alan L. Yuille (* equal contribution) |
|
(https://arxiv.org/abs/1412.7062) |
|
""" |
|
import collections |
|
import tensorflow as tf |
|
|
|
from deeplab import model |
|
from feelvos import common |
|
from feelvos.utils import embedding_utils |
|
from feelvos.utils import train_utils |
|
|
|
# Shorthand for the TF-Slim library shipped with TensorFlow 1.x contrib.
slim = tf.contrib.slim


# Re-export DeepLab's model helpers so downstream FEELVOS code can access
# them through this module without importing deeplab.model directly.
get_branch_logits = model.get_branch_logits
get_extra_layer_scopes = model.get_extra_layer_scopes
multi_scale_logits_v2 = model.multi_scale_logits
refine_by_decoder = model.refine_by_decoder
scale_dimension = model.scale_dimension
split_separable_conv2d = model.split_separable_conv2d

# Variable/name scopes shared with the DeepLab implementation; reusing them
# keeps checkpoint variable names compatible between the two codebases.
MERGED_LOGITS_SCOPE = model.MERGED_LOGITS_SCOPE
IMAGE_POOLING_SCOPE = model.IMAGE_POOLING_SCOPE
ASPP_SCOPE = model.ASPP_SCOPE
CONCAT_PROJECTION_SCOPE = model.CONCAT_PROJECTION_SCOPE
|
|
|
|
|
def predict_labels(images,
                   model_options,
                   image_pyramid=None,
                   reference_labels=None,
                   k_nearest_neighbors=1,
                   embedding_dimension=None,
                   use_softmax_feedback=False,
                   initial_softmax_feedback=None,
                   embedding_seg_feature_dimension=256,
                   embedding_seg_n_layers=4,
                   embedding_seg_kernel_size=7,
                   embedding_seg_atrous_rates=None,
                   also_return_softmax_probabilities=False,
                   num_frames_per_video=None,
                   normalize_nearest_neighbor_distances=False,
                   also_attend_to_previous_frame=False,
                   use_local_previous_frame_attention=False,
                   previous_frame_attention_window_size=9,
                   use_first_frame_matching=True,
                   also_return_embeddings=False,
                   ref_embeddings=None):
  """Predicts segmentation labels.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: An InternalModelOptions instance to configure models.
    image_pyramid: Input image scales for multi-scale feature extraction.
    reference_labels: A tensor of size [batch, height, width, 1].
      ground truth labels used to perform a nearest neighbor query
    k_nearest_neighbors: Integer, the number of neighbors to use for nearest
      neighbor queries.
    embedding_dimension: Integer, the dimension used for the learned embedding.
    use_softmax_feedback: Boolean, whether to give the softmax predictions of
      the last frame as additional input to the segmentation head.
    initial_softmax_feedback: Float32 tensor, or None. Can be used to
      initialize the softmax predictions used for the feedback loop.
      Typically only useful for inference. Only has an effect if
      use_softmax_feedback is True.
    embedding_seg_feature_dimension: Integer, the dimensionality used in the
      segmentation head layers.
    embedding_seg_n_layers: Integer, the number of layers in the segmentation
      head.
    embedding_seg_kernel_size: Integer, the kernel size used in the
      segmentation head.
    embedding_seg_atrous_rates: List of integers of length
      embedding_seg_n_layers, the atrous rates to use for the segmentation head.
    also_return_softmax_probabilities: Boolean, if true, additionally return
      the softmax probabilities as second return value.
    num_frames_per_video: Integer, the number of frames per video.
    normalize_nearest_neighbor_distances: Boolean, whether to normalize the
      nearest neighbor distances to [0,1] using sigmoid, scale and shift.
    also_attend_to_previous_frame: Boolean, whether to also use nearest
      neighbor attention with respect to the previous frame.
    use_local_previous_frame_attention: Boolean, whether to restrict the
      previous frame attention to a local search window.
      Only has an effect, if also_attend_to_previous_frame is True.
    previous_frame_attention_window_size: Integer, the window size used for
      local previous frame attention, if use_local_previous_frame_attention
      is True.
    use_first_frame_matching: Boolean, whether to extract features by matching
      to the reference frame. This should always be true except for ablation
      experiments.
    also_return_embeddings: Boolean, whether to return the embeddings as well.
    ref_embeddings: Tuple of
      (first_frame_embeddings, previous_frame_embeddings),
      each of shape [batch, height, width, embedding_dimension], or None.

  Returns:
    A dictionary with keys specifying the output_type (e.g., semantic
    prediction) and values storing Tensors representing predictions (argmax
    over channels). Each prediction has size [batch, height, width].
    If also_return_softmax_probabilities is True, the second return value are
    the softmax probabilities.
    If also_return_embeddings is True, it will also return an embeddings
    tensor of shape [batch, height, width, embedding_dimension].

  Raises:
    ValueError: If classification_loss is not softmax, softmax_with_attention,
    nor triplet.
  """
  # Triplet loss assigns labels via nearest neighbors against the reference
  # frame, so the reference labels are mandatory in that mode.
  if (model_options.classification_loss == 'triplet' and
      reference_labels is None):
    raise ValueError('Need reference_labels for triplet loss')

  if model_options.classification_loss == 'softmax_with_attention':
    if embedding_dimension is None:
      raise ValueError('Need embedding_dimension for softmax_with_attention '
                       'loss')
    if reference_labels is None:
      raise ValueError('Need reference_labels for softmax_with_attention loss')
    # Build logits with nearest-neighbor matching against the reference (and
    # optionally previous) frame. Inference settings: is_training=False,
    # clone_batch_size=1, no neighbor subsampling (max_neighbors_per_object=0).
    res = (
        multi_scale_logits_with_nearest_neighbor_matching(
            images,
            model_options=model_options,
            image_pyramid=image_pyramid,
            is_training=False,
            reference_labels=reference_labels,
            clone_batch_size=1,
            num_frames_per_video=num_frames_per_video,
            embedding_dimension=embedding_dimension,
            max_neighbors_per_object=0,
            k_nearest_neighbors=k_nearest_neighbors,
            use_softmax_feedback=use_softmax_feedback,
            initial_softmax_feedback=initial_softmax_feedback,
            embedding_seg_feature_dimension=embedding_seg_feature_dimension,
            embedding_seg_n_layers=embedding_seg_n_layers,
            embedding_seg_kernel_size=embedding_seg_kernel_size,
            embedding_seg_atrous_rates=embedding_seg_atrous_rates,
            normalize_nearest_neighbor_distances=
            normalize_nearest_neighbor_distances,
            also_attend_to_previous_frame=also_attend_to_previous_frame,
            use_local_previous_frame_attention=
            use_local_previous_frame_attention,
            previous_frame_attention_window_size=
            previous_frame_attention_window_size,
            use_first_frame_matching=use_first_frame_matching,
            also_return_embeddings=also_return_embeddings,
            ref_embeddings=ref_embeddings
        ))
    # The helper returns a (logits, embeddings) pair only when
    # also_return_embeddings is set; unpack accordingly.
    if also_return_embeddings:
      outputs_to_scales_to_logits, embeddings = res
    else:
      outputs_to_scales_to_logits = res
      embeddings = None
  else:
    # Plain DeepLab multi-scale logits (no attention / matching).
    outputs_to_scales_to_logits = multi_scale_logits_v2(
        images,
        model_options=model_options,
        image_pyramid=image_pyramid,
        is_training=False,
        fine_tune_batch_norm=False)

  predictions = {}
  for output in sorted(outputs_to_scales_to_logits):
    scales_to_logits = outputs_to_scales_to_logits[output]
    original_logits = scales_to_logits[MERGED_LOGITS_SCOPE]
    # Merged logits may be wrapped in a single-element list (see the
    # single-scale path of the matching helper); unwrap it.
    if isinstance(original_logits, list):
      assert len(original_logits) == 1
      original_logits = original_logits[0]
    # Upsample logits back to the input image resolution before the argmax.
    logits = tf.image.resize_bilinear(original_logits, tf.shape(images)[1:3],
                                      align_corners=True)
    if model_options.classification_loss in ('softmax',
                                             'softmax_with_attention'):
      predictions[output] = tf.argmax(logits, 3)
    elif model_options.classification_loss == 'triplet':
      # In triplet mode the network output is an embedding, not class logits.
      # Labels are assigned by a k-nearest-neighbor query against the
      # reference frame's embedding.
      embeddings = original_logits
      # Downsample reference labels to the embedding resolution with
      # nearest-neighbor interpolation to keep them integral.
      reference_labels_logits_size = tf.squeeze(
          tf.image.resize_nearest_neighbor(
              reference_labels[tf.newaxis],
              train_utils.resolve_shape(embeddings)[1:3],
              align_corners=True), axis=0)
      # embeddings[0] is the reference frame; the rest are query frames.
      nn_labels = embedding_utils.assign_labels_by_nearest_neighbors(
          embeddings[0], embeddings[1:], reference_labels_logits_size,
          k_nearest_neighbors)
      # NOTE(review): this branch keys predictions by common.OUTPUT_TYPE
      # rather than `output`; presumably triplet mode only ever has the
      # semantic output -- confirm against callers.
      predictions[common.OUTPUT_TYPE] = tf.image.resize_nearest_neighbor(
          nn_labels, tf.shape(images)[1:3], align_corners=True)
    else:
      raise ValueError(
          'Only support softmax, triplet, or softmax_with_attention for '
          'classification_loss.')

  # NOTE(review): `original_logits` (and in triplet mode `embeddings`) here
  # refer to the values from the last iteration of the loop above; with a
  # single output type (the common case) this is well defined.
  if also_return_embeddings:
    assert also_return_softmax_probabilities
    return predictions, tf.nn.softmax(original_logits, axis=-1), embeddings
  elif also_return_softmax_probabilities:
    return predictions, tf.nn.softmax(original_logits, axis=-1)
  else:
    return predictions
|
|
|
|
|
def multi_scale_logits_with_nearest_neighbor_matching(
    images,
    model_options,
    image_pyramid,
    clone_batch_size,
    reference_labels,
    num_frames_per_video,
    embedding_dimension,
    max_neighbors_per_object,
    weight_decay=0.0001,
    is_training=False,
    fine_tune_batch_norm=False,
    k_nearest_neighbors=1,
    use_softmax_feedback=False,
    initial_softmax_feedback=None,
    embedding_seg_feature_dimension=256,
    embedding_seg_n_layers=4,
    embedding_seg_kernel_size=7,
    embedding_seg_atrous_rates=None,
    normalize_nearest_neighbor_distances=False,
    also_attend_to_previous_frame=False,
    damage_initial_previous_frame_mask=False,
    use_local_previous_frame_attention=False,
    previous_frame_attention_window_size=9,
    use_first_frame_matching=True,
    also_return_embeddings=False,
    ref_embeddings=None):
  """Gets the logits for multi-scale inputs using nearest neighbor attention.

  Adjusted version of multi_scale_logits_v2 to support nearest neighbor
  attention and a variable number of classes for each element of the batch.
  The returned logits are all downsampled (due to max-pooling layers)
  for both training and evaluation.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    image_pyramid: Input image scales for multi-scale feature extraction.
    clone_batch_size: Integer, the number of videos on a batch.
    reference_labels: The segmentation labels of the reference frame on which
      attention is applied.
    num_frames_per_video: Integer, the number of frames per video.
    embedding_dimension: Integer, the dimension of the embedding.
    max_neighbors_per_object: Integer, the maximum number of candidates
      for the nearest neighbor query per object after subsampling.
      Can be 0 for no subsampling.
    weight_decay: The weight decay for model variables.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
    k_nearest_neighbors: Integer, the number of nearest neighbors to use.
    use_softmax_feedback: Boolean, whether to give the softmax predictions of
      the last frame as additional input to the segmentation head.
    initial_softmax_feedback: List of Float32 tensors, or None.
      Can be used to initialize the softmax predictions used for the feedback
      loop. Only has an effect if use_softmax_feedback is True.
    embedding_seg_feature_dimension: Integer, the dimensionality used in the
      segmentation head layers.
    embedding_seg_n_layers: Integer, the number of layers in the segmentation
      head.
    embedding_seg_kernel_size: Integer, the kernel size used in the
      segmentation head.
    embedding_seg_atrous_rates: List of integers of length
      embedding_seg_n_layers, the atrous rates to use for the segmentation head.
    normalize_nearest_neighbor_distances: Boolean, whether to normalize the
      nearest neighbor distances to [0,1] using sigmoid, scale and shift.
    also_attend_to_previous_frame: Boolean, whether to also use nearest
      neighbor attention with respect to the previous frame.
    damage_initial_previous_frame_mask: Boolean, whether to artificially damage
      the initial previous frame mask. Only has an effect if
      also_attend_to_previous_frame is True.
    use_local_previous_frame_attention: Boolean, whether to restrict the
      previous frame attention to a local search window.
      Only has an effect, if also_attend_to_previous_frame is True.
    previous_frame_attention_window_size: Integer, the window size used for
      local previous frame attention, if use_local_previous_frame_attention
      is True.
    use_first_frame_matching: Boolean, whether to extract features by matching
      to the reference frame. This should always be true except for ablation
      experiments.
    also_return_embeddings: Boolean, whether to return the embeddings as well.
    ref_embeddings: Tuple of
      (first_frame_embeddings, previous_frame_embeddings),
      each of shape [batch, height, width, embedding_dimension], or None.

  Returns:
    outputs_to_scales_to_logits: A map of maps from output_type (e.g.,
      semantic prediction) to a dictionary of multi-scale logits names to
      logits. For each output_type, the dictionary has keys which
      correspond to the scales and values which correspond to the logits.
      For example, if `scales` equals [1.0, 1.5], then the keys would
      include 'merged_logits', 'logits_1.00' and 'logits_1.50'.
    If also_return_embeddings is True, it will also return an embeddings
      tensor of shape [batch, height, width, embedding_dimension].

  Raises:
    ValueError: If model_options doesn't specify crop_size and its
      add_image_level_feature = True, since add_image_level_feature requires
      crop_size information.
  """
  # Setup default values.
  if not image_pyramid:
    image_pyramid = [1.0]
  # Fall back to the dynamic image shape when no static crop size is given.
  crop_height = (
      model_options.crop_size[0]
      if model_options.crop_size else tf.shape(images)[1])
  crop_width = (
      model_options.crop_size[1]
      if model_options.crop_size else tf.shape(images)[2])

  # Compute the height and width at which all per-scale logits are aligned
  # before merging: the largest pyramid scale divided by the output stride
  # (decoder stride takes precedence when a decoder is used).
  if model_options.decoder_output_stride:
    logits_output_stride = min(model_options.decoder_output_stride)
  else:
    logits_output_stride = model_options.output_stride
  logits_height = scale_dimension(
      crop_height,
      max(1.0, max(image_pyramid)) / logits_output_stride)
  logits_width = scale_dimension(
      crop_width,
      max(1.0, max(image_pyramid)) / logits_output_stride)

  # Maps output_type -> {scale name -> logits}.
  outputs_to_scales_to_logits = {
      k: {}
      for k in model_options.outputs_to_num_classes
  }

  for image_scale in image_pyramid:
    if image_scale != 1.0:
      scaled_height = scale_dimension(crop_height, image_scale)
      scaled_width = scale_dimension(crop_width, image_scale)
      scaled_crop_size = [scaled_height, scaled_width]
      scaled_images = tf.image.resize_bilinear(
          images, scaled_crop_size, align_corners=True)
      # Labels must stay integral, so use nearest neighbor interpolation.
      scaled_reference_labels = tf.image.resize_nearest_neighbor(
          reference_labels, scaled_crop_size, align_corners=True
      )
      # Without a static crop size the scaled size is dynamic too; with one,
      # pin the static shape so downstream ops can rely on it.
      if model_options.crop_size is None:
        scaled_crop_size = None
      if model_options.crop_size:
        scaled_images.set_shape([None, scaled_height, scaled_width, 3])
    else:
      scaled_crop_size = model_options.crop_size
      scaled_images = images
      scaled_reference_labels = reference_labels

    updated_options = model_options._replace(crop_size=scaled_crop_size)
    # AUTO_REUSE shares variables across pyramid scales.
    res = embedding_utils.get_logits_with_matching(
        scaled_images,
        updated_options,
        weight_decay=weight_decay,
        reuse=tf.AUTO_REUSE,
        is_training=is_training,
        fine_tune_batch_norm=fine_tune_batch_norm,
        reference_labels=scaled_reference_labels,
        batch_size=clone_batch_size,
        num_frames_per_video=num_frames_per_video,
        embedding_dimension=embedding_dimension,
        max_neighbors_per_object=max_neighbors_per_object,
        k_nearest_neighbors=k_nearest_neighbors,
        use_softmax_feedback=use_softmax_feedback,
        initial_softmax_feedback=initial_softmax_feedback,
        embedding_seg_feature_dimension=embedding_seg_feature_dimension,
        embedding_seg_n_layers=embedding_seg_n_layers,
        embedding_seg_kernel_size=embedding_seg_kernel_size,
        embedding_seg_atrous_rates=embedding_seg_atrous_rates,
        normalize_nearest_neighbor_distances=
        normalize_nearest_neighbor_distances,
        also_attend_to_previous_frame=also_attend_to_previous_frame,
        damage_initial_previous_frame_mask=damage_initial_previous_frame_mask,
        use_local_previous_frame_attention=use_local_previous_frame_attention,
        previous_frame_attention_window_size=
        previous_frame_attention_window_size,
        use_first_frame_matching=use_first_frame_matching,
        also_return_embeddings=also_return_embeddings,
        ref_embeddings=ref_embeddings
    )
    if also_return_embeddings:
      outputs_to_logits, embeddings = res
    else:
      outputs_to_logits = res
      embeddings = None

    # Resize the logits to have the same dimension before merging. Logits per
    # output may be a single tensor or a sequence of tensors (e.g. one per
    # video in the clone batch).
    for output in sorted(outputs_to_logits):
      # Bug fix: `collections.Sequence` was removed in Python 3.10; the ABC
      # lives in `collections.abc`.
      if isinstance(outputs_to_logits[output], collections.abc.Sequence):
        outputs_to_logits[output] = [tf.image.resize_bilinear(
            x, [logits_height, logits_width], align_corners=True)
                                     for x in outputs_to_logits[output]]
      else:
        outputs_to_logits[output] = tf.image.resize_bilinear(
            outputs_to_logits[output], [logits_height, logits_width],
            align_corners=True)

    # Return when only one input scale: the single scale's logits are the
    # merged logits.
    if len(image_pyramid) == 1:
      for output in sorted(model_options.outputs_to_num_classes):
        outputs_to_scales_to_logits[output][
            MERGED_LOGITS_SCOPE] = outputs_to_logits[output]
      if also_return_embeddings:
        return outputs_to_scales_to_logits, embeddings
      else:
        return outputs_to_scales_to_logits

    # Save logits to the output map for this scale.
    for output in sorted(model_options.outputs_to_num_classes):
      outputs_to_scales_to_logits[output][
          'logits_%.2f' % image_scale] = outputs_to_logits[output]

  # Merge the logits from all the multi-scale inputs.
  for output in sorted(model_options.outputs_to_num_classes):
    # Stack the multi-scale logits along a new axis 4, then reduce over it.
    all_logits = [
        [tf.expand_dims(l, axis=4)]
        for logits in outputs_to_scales_to_logits[output].values()
        for l in logits
    ]
    transposed = map(list, zip(*all_logits))
    all_logits = [tf.concat(t, 4) for t in transposed]
    merge_fn = (
        tf.reduce_max
        if model_options.merge_method == 'max' else tf.reduce_mean)
    outputs_to_scales_to_logits[output][MERGED_LOGITS_SCOPE] = [merge_fn(
        l, axis=4) for l in all_logits]

  if also_return_embeddings:
    return outputs_to_scales_to_logits, embeddings
  else:
    return outputs_to_scales_to_logits
|
|