|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Provides flags that are common to scripts. |
|
|
|
Common flags from train/vis_video.py are collected in this script. |
|
""" |
|
import tensorflow as tf |
|
|
|
from deeplab import common |
|
|
|
# Short alias for the TensorFlow flags module; every definition below goes
# through it.
flags = tf.app.flags

# Pixel classification loss and embedding settings.
flags.DEFINE_enum(
    'classification_loss',
    'softmax_with_attention',
    ['softmax', 'triplet', 'softmax_with_attention'],
    'Type of loss function used for classifying pixels, can be either '
    'softmax, softmax_with_attention, or triplet.')

flags.DEFINE_integer(
    'k_nearest_neighbors', 1, 'The number of nearest neighbors to use.')

flags.DEFINE_integer(
    'embedding_dimension', 100,
    'The dimension used for the learned embedding')

# Frame sampling and feedback of previous predictions.
flags.DEFINE_boolean(
    'use_softmax_feedback', True,
    'Whether to give the softmax predictions of the last frame as '
    'additional input to the segmentation head.')

flags.DEFINE_boolean(
    'sample_adjacent_and_consistent_query_frames', True,
    'If true, the query frames (all but the first frame which is the '
    'reference frame) will be sampled such that they are adjacent video '
    'frames and have the same crop coordinates and flip augmentation. '
    'Note that if use_softmax_feedback is True, this option will '
    'automatically be activated.')

# Segmentation head architecture.
flags.DEFINE_integer(
    'embedding_seg_feature_dimension', 256,
    'The dimensionality used in the segmentation head layers.')

flags.DEFINE_integer(
    'embedding_seg_n_layers', 4,
    'The number of layers in the segmentation head.')

flags.DEFINE_integer(
    'embedding_seg_kernel_size', 7,
    'The kernel size used in the segmentation head.')

flags.DEFINE_multi_integer(
    'embedding_seg_atrous_rates', [],
    'The atrous rates to use for the segmentation head.')

# Nearest neighbor matching / attention behaviour.
flags.DEFINE_boolean(
    'normalize_nearest_neighbor_distances', True,
    'Whether to normalize the nearest neighbor distances to [0,1] using '
    'sigmoid, scale and shift.')

flags.DEFINE_boolean(
    'also_attend_to_previous_frame', True,
    'Whether to also use nearest neighbor attention with respect to the '
    'previous frame.')

flags.DEFINE_bool(
    'use_local_previous_frame_attention', True,
    'Whether to restrict the previous frame attention to a local search '
    'window. Only has an effect, if also_attend_to_previous_frame is True.')

flags.DEFINE_integer(
    'previous_frame_attention_window_size', 15,
    'The window size used for local previous frame attention, if '
    'use_local_previous_frame_attention is True.')

flags.DEFINE_boolean(
    'use_first_frame_matching', True,
    'Whether to extract features by matching to the reference frame. This '
    'should always be true except for ablation experiments.')

# Handle through which the parsed flag values are read.
FLAGS = flags.FLAGS
|
|
|
|
|
|
|
|
|
# Names re-exported unchanged from deeplab.common, so that callers can read
# them from this module instead.
OUTPUT_TYPE = common.OUTPUT_TYPE
LABELS_CLASS = common.LABELS_CLASS
IMAGE = common.IMAGE
HEIGHT = common.HEIGHT
WIDTH = common.WIDTH
IMAGE_NAME = common.IMAGE_NAME
LABEL = common.LABEL
ORIGINAL_IMAGE = common.ORIGINAL_IMAGE
TEST_SET = common.TEST_SET

# String keys that are defined locally in this module rather than taken from
# deeplab.common.
SOURCE_ID = 'source_id'
VIDEO_ID = 'video_id'
PRECEDING_FRAME_LABEL = 'preceding_frame_label'
OBJECT_LABEL = 'object_label'
|
|
|
|
|
class VideoModelOptions(common.ModelOptions):
  """Internal model options holder that also records the classification loss.

  Extends common.ModelOptions: construction is delegated to the parent class
  and the current value of the `classification_loss` flag is then stored on
  the new instance.
  """

  def __new__(cls,
              outputs_to_num_classes,
              crop_size=None,
              atrous_rates=None,
              output_stride=8):
    """Builds the options object and attaches the classification loss.

    Args:
      outputs_to_num_classes: A dictionary mapping each output type to its
        number of classes. E.g. for semantic segmentation with 21 semantic
        classes, outputs_to_num_classes['semantic'] = 21.
      crop_size: A tuple [crop_height, crop_width].
      atrous_rates: A list of atrous convolution rates for ASPP.
      output_stride: The ratio of input to output spatial resolution.

    Returns:
      A new VideoModelOptions instance.
    """
    options = super(VideoModelOptions, cls).__new__(
        cls, outputs_to_num_classes, crop_size, atrous_rates, output_stride)

    # The flag is read once, at construction time, and kept on the instance.
    options.classification_loss = FLAGS.classification_loss

    return options
|
|
|
|
|
def parse_decoder_output_stride():
  """Reads the single decoder output stride from the flags.

  FEELVOS assumes decoder_output_stride = 4, so exactly one stride value is
  expected in FLAGS.decoder_output_stride; this helper extracts it.

  Returns:
    The decoder output stride as an integer.

  Raises:
    ValueError: If the decoder_output_stride flag is unset (or empty), or if
      it holds more than one element.
  """
  raw_strides = FLAGS.decoder_output_stride
  if not raw_strides:
    raise ValueError('Expect flag decoder output stride not to be None.')

  # Convert every entry before checking the count, so that a malformed entry
  # is reported ahead of the length error.
  strides = [int(stride) for stride in raw_strides]
  if len(strides) != 1:
    raise ValueError('Expect decoder output stride has only one element.')

  return strides[0]
|
|