"""Interface for different embedders for modalities.""" |
import abc |
import numpy as np |
import tensorflow as tf |
import preprocessing |
from tensorflow.contrib.slim.nets import resnet_v2 |
slim = tf.contrib.slim |
class Embedder(object):
    """Abstract interface shared by all modality embedders.

    Each modality (for example semantic segmentation, depth channel or object
    detection) needs its own embedder, which provides a `build` method.
    """
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def build(self, observation):
        """Constructs the model that embeds one modality's observation.

        Args:
          observation: tensor holding the raw observation of the modality.

        Returns:
          The embedding tensor computed from `observation`.

        Raises:
          NotImplementedError: always, in this base implementation.
        """
        message = 'Needs to be implemented as part of Embedder Interface'
        raise NotImplementedError(message)
class DetectionBoxEmbedder(Embedder):
    """Represents the model that encodes the detection boxes from images.

    The boxes of each image are fed as a sequence to an LSTM and the output at
    the last valid box is used as the embedding of that image.
    """

    def __init__(self, rnn_state_size, scope=None):
        """Stores the configuration of the embedder.

        Args:
          rnn_state_size: number of units of the LSTM cell; it is also the size
            of the embedding produced for every image.
          scope: optional name of the variable scope the model is built in.
        """
        self._rnn_state_size = rnn_state_size
        self._scope = scope

    def build(self, observations):
        """Builds the model to embed object detection observations.

        Args:
          observations: a tuple of (dets, det_num).
            dets is a tensor of BxTxLxE that has the detection boxes in all the
              images of the batch. B is the batch size, T is the maximum length
              of episode, L is the maximum number of detections per image in
              the batch and E is the size of each detection embedding.
            det_num is a tensor of BxT that contains the number of detected
              boxes of each image of each sequence in the batch.

        Returns:
          A tensor of shape BxTxrnn_state_size holding, for each image in the
          batch, the accumulative embedding of all the detection boxes in that
          image. Images without any detection get the LSTM output that
          `dynamic_rnn` reports for an empty sequence.
        """
        with tf.variable_scope(self._scope, default_name=''):
            shape = observations[0].shape
            # Fold batch and time together so that every image becomes one
            # LSTM sequence over its detection boxes.
            dets = tf.reshape(observations[0], [-1, shape[-2], shape[-1]])
            det_num = tf.reshape(observations[1], [-1])
            lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._rnn_state_size)
            batch_size = tf.shape(dets)[0]
            lstm_outputs, _ = tf.nn.dynamic_rnn(
                cell=lstm_cell,
                inputs=dets,
                sequence_length=det_num,
                initial_state=lstm_cell.zero_state(
                    batch_size, dtype=tf.float32),
                dtype=tf.float32)
            # Position of the last valid box of every image. It is clamped at
            # zero because an image with no detections would otherwise yield
            # the index -1, which tf.gather_nd treats as out of bounds.
            last_step = tf.maximum(det_num - 1, 0)
            batch_range = tf.range(batch_size)
            indices = tf.stack([batch_range, last_step], axis=1)
            last_lstm_outputs = tf.gather_nd(lstm_outputs, indices)
            # Restore the separate batch and time dimensions.
            last_lstm_outputs = tf.reshape(
                last_lstm_outputs, [-1, shape[1], self._rnn_state_size])
        return last_lstm_outputs
class ResNet(Embedder):
    """Residual net embedder for image data.

    The network is configured through `params`, from which this class reads
    `use_bottleneck`, `num_residual_units`, `relu_leakiness`, `is_train`,
    `weight_decay_rate` and `batch_size`.
    """

    def __init__(self, params, *args, **kwargs):
        """Constructs the embedder.

        Args:
          params: object carrying the hyper-parameters listed in the class
            docstring.
          *args: positional arguments for the super constructor.
          **kwargs: keyword arguments for the super constructor.
        """
        super(ResNet, self).__init__(*args, **kwargs)
        self._params = params
        # Ops updating the batch norm moving statistics; filled while building.
        self._extra_train_ops = []

    def build(self, images):
        """Builds the network that embeds `images`.

        Args:
          images: a rank 4 tensor of images, or a rank 5 tensor whose two
            leading dimensions are merged before the network is applied and
            restored afterwards. The two leading dimensions of a rank 5 input
            must be statically known.

        Returns:
          The embedding with one vector per image; for a rank 5 input the two
          leading dimensions of the input are kept.
        """
        shape = images.get_shape().as_list()
        is_sequence = len(shape) == 5
        if is_sequence:
            images = tf.reshape(
                images, [shape[0] * shape[1], shape[2], shape[3], shape[4]])
        embedding = self._build_model(images)
        if is_sequence:
            embedding = tf.reshape(embedding, [shape[0], shape[1], -1])
        return embedding

    @property
    def extra_train_ops(self):
        """List of moving average update ops collected by `_batch_norm`."""
        return self._extra_train_ops

    def _build_model(self, images):
        """Builds the model.

        Args:
          images: a rank 4 tensor of images with a static batch dimension.

        Returns:
          A rank 2 tensor with the spatially averaged features of every image.
        """
        images = tf.to_float(images)
        bs = images.get_shape().as_list()[0]
        # Standardize every image on its own. Only the batch axis is squeezed
        # so that images having a size one height, width or channel dimension
        # keep the rank 3 that per_image_standardization needs.
        images = [
            tf.image.per_image_standardization(tf.squeeze(i, axis=[0]))
            for i in tf.split(images, bs)
        ]
        images = tf.concat([tf.expand_dims(i, axis=0) for i in images], axis=0)

        with tf.variable_scope('init'):
            x = self._conv('init_conv', images, 3, 3, 16, self._stride_arr(1))

        strides = [1, 2, 2]
        activate_before_residual = [True, False, False]
        if self._params.use_bottleneck:
            res_func = self._bottleneck_residual
            filters = [16, 64, 128, 256]
        else:
            res_func = self._residual
            filters = [16, 16, 32, 128]

        # Three stages: the first unit of a stage changes the depth (and
        # possibly the resolution), the remaining units keep both.
        for stage in range(1, 4):
            with tf.variable_scope('unit_%d_0' % stage):
                x = res_func(x, filters[stage - 1], filters[stage],
                             self._stride_arr(strides[stage - 1]),
                             activate_before_residual[stage - 1])
            for i in range(1, self._params.num_residual_units):
                with tf.variable_scope('unit_%d_%d' % (stage, i)):
                    x = res_func(x, filters[stage], filters[stage],
                                 self._stride_arr(1), False)

        with tf.variable_scope('unit_last'):
            x = self._batch_norm('final_bn', x)
            x = self._relu(x, self._params.relu_leakiness)

        with tf.variable_scope('pool_logit'):
            x = self._global_avg_pooling(x)
        return x

    def _stride_arr(self, stride):
        """Returns the 4 element strides list for a spatial `stride`."""
        return [1, stride, stride, 1]

    def _batch_norm(self, name, x):
        """Batch norm implementation.

        When `params.is_train` is set, the moments of `x` over its first three
        axes are used and ops updating the moving statistics are appended to
        `extra_train_ops`; otherwise the stored moving statistics are used.

        Args:
          name: name of the variable scope of the layer.
          x: tensor to normalize; its last dimension is the feature depth.

        Returns:
          The normalized tensor, with the same static shape as `x`.
        """
        with tf.variable_scope(name):
            params_shape = [x.shape[-1]]
            beta = tf.get_variable(
                'beta',
                params_shape,
                tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32))
            gamma = tf.get_variable(
                'gamma',
                params_shape,
                tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32))
            # The moving statistics are created identically in both modes.
            moving_mean = tf.get_variable(
                'moving_mean',
                params_shape,
                tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32),
                trainable=False)
            moving_variance = tf.get_variable(
                'moving_variance',
                params_shape,
                tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32),
                trainable=False)

            if self._params.is_train:
                mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')
                # NOTE(review): confirm that the installed TensorFlow exposes
                # `tf.assign_moving_average`; the helper usually lives in
                # tensorflow.python.training.moving_averages.
                self._extra_train_ops.append(
                    tf.assign_moving_average(moving_mean, mean, 0.9))
                self._extra_train_ops.append(
                    tf.assign_moving_average(moving_variance, variance, 0.9))
            else:
                mean = moving_mean
                variance = moving_variance
                tf.summary.histogram(mean.op.name, mean)
                tf.summary.histogram(variance.op.name, variance)

            y = tf.nn.batch_normalization(
                x, mean, variance, beta, gamma, 0.001)
            y.set_shape(x.shape)
            return y

    def _residual(self,
                  x,
                  in_filter,
                  out_filter,
                  stride,
                  activate_before_residual=False):
        """Residual unit with 2 sub layers.

        Args:
          x: a float rank 4 tensor, the input of the unit.
          in_filter: a python integer, the depth of the input.
          out_filter: a python integer, the depth of the output.
          stride: the 4 element strides list of the first convolution.
          activate_before_residual: a python boolean. If True, the batch
            normalized and activated input also feeds the shortcut branch;
            otherwise the shortcut uses the raw input.

        Returns:
          The rank 4 output tensor of the unit, of depth out_filter.
        """
        if activate_before_residual:
            with tf.variable_scope('shared_activation'):
                x = self._batch_norm('init_bn', x)
                x = self._relu(x, self._params.relu_leakiness)
                orig_x = x
        else:
            with tf.variable_scope('residual_only_activation'):
                orig_x = x
                x = self._batch_norm('init_bn', x)
                x = self._relu(x, self._params.relu_leakiness)

        with tf.variable_scope('sub1'):
            x = self._conv('conv1', x, 3, in_filter, out_filter, stride)

        with tf.variable_scope('sub2'):
            x = self._batch_norm('bn2', x)
            x = self._relu(x, self._params.relu_leakiness)
            x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])

        with tf.variable_scope('sub_add'):
            if in_filter != out_filter:
                # Match the shortcut to the residual branch: pool it down
                # spatially and zero pad its channels on both sides.
                orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
                channel_pad = (out_filter - in_filter) // 2
                orig_x = tf.pad(
                    orig_x,
                    [[0, 0], [0, 0], [0, 0], [channel_pad, channel_pad]])
            x += orig_x
        return x

    def _bottleneck_residual(self,
                             x,
                             in_filter,
                             out_filter,
                             stride,
                             activate_before_residual=False):
        """A residual convolutional layer with a bottleneck.

        The layer is a composite of three convolutional layers, each preceded
        by batch normalization and a ReLU non-linearity. The depth of the
        first and second convolution is out_filter // 4 (hence it is a
        bottleneck).

        Args:
          x: a float 4 rank Tensor representing the input to the layer.
          in_filter: a python integer representing depth of the input.
          out_filter: a python integer representing depth of the output.
          stride: the 4 element strides list of the first convolution and of
            the projection of the shortcut.
          activate_before_residual: a python boolean. If True, the batch
            normalized and activated input also feeds the shortcut branch;
            otherwise the shortcut uses the raw input.

        Returns:
          A 4 rank Tensor of depth out_filter.
        """
        if activate_before_residual:
            with tf.variable_scope('common_bn_relu'):
                x = self._batch_norm('init_bn', x)
                x = self._relu(x, self._params.relu_leakiness)
                orig_x = x
        else:
            with tf.variable_scope('residual_bn_relu'):
                orig_x = x
                x = self._batch_norm('init_bn', x)
                x = self._relu(x, self._params.relu_leakiness)

        # Floor division keeps the filter count an integer on Python 3 too.
        bottleneck_filter = out_filter // 4

        with tf.variable_scope('sub1'):
            x = self._conv('conv1', x, 1, in_filter, bottleneck_filter, stride)

        with tf.variable_scope('sub2'):
            x = self._batch_norm('bn2', x)
            x = self._relu(x, self._params.relu_leakiness)
            x = self._conv('conv2', x, 3, bottleneck_filter, bottleneck_filter,
                           [1, 1, 1, 1])

        with tf.variable_scope('sub3'):
            x = self._batch_norm('bn3', x)
            x = self._relu(x, self._params.relu_leakiness)
            x = self._conv('conv3', x, 1, bottleneck_filter, out_filter,
                           [1, 1, 1, 1])

        with tf.variable_scope('sub_add'):
            if in_filter != out_filter:
                orig_x = self._conv('project', orig_x, 1, in_filter,
                                    out_filter, stride)
            x += orig_x
        return x

    def _decay(self):
        """Returns the L2 weight decay cost.

        The cost sums the L2 loss of every trainable variable whose op name
        contains 'DW' past its first character, scaled by
        `params.weight_decay_rate`.
        """
        costs = []
        for var in tf.trainable_variables():
            if var.op.name.find(r'DW') > 0:
                costs.append(tf.nn.l2_loss(var))
        # tf.mul was renamed to tf.multiply in TensorFlow 1.0.
        return tf.multiply(self._params.weight_decay_rate, tf.add_n(costs))

    def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
        """Convolution.

        Args:
          name: name of the variable scope of the layer.
          x: the rank 4 input tensor.
          filter_size: a python integer, height and width of the kernel.
          in_filters: a python integer, depth of the input.
          out_filters: a python integer, depth of the output.
          strides: the 4 element strides list of the convolution.

        Returns:
          The output of the 'SAME' padded convolution; the layer has no bias.
        """
        with tf.variable_scope(name):
            # Standard deviation of the initializer is sqrt(2 / n).
            n = filter_size * filter_size * out_filters
            kernel = tf.get_variable(
                'DW', [filter_size, filter_size, in_filters, out_filters],
                tf.float32,
                initializer=tf.random_normal_initializer(
                    stddev=np.sqrt(2.0 / n)))
            return tf.nn.conv2d(x, kernel, strides, padding='SAME')

    def _relu(self, x, leakiness=0.0):
        """Leaky ReLU: negative inputs are scaled by `leakiness`."""
        return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')

    def _fully_connected(self, x, out_dim):
        """Fully connected layer over `x` flattened to params.batch_size rows.

        Args:
          x: the input tensor.
          out_dim: a python integer, the size of the output.

        Returns:
          A rank 2 tensor with out_dim columns.
        """
        x = tf.reshape(x, [self._params.batch_size, -1])
        w = tf.get_variable(
            'DW', [x.get_shape()[1], out_dim],
            initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
        b = tf.get_variable(
            'biases', [out_dim], initializer=tf.constant_initializer())
        return tf.nn.xw_plus_b(x, w, b)

    def _global_avg_pooling(self, x):
        """Averages the rank 4 tensor `x` over its two middle axes."""
        assert x.get_shape().ndims == 4
        return tf.reduce_mean(x, [1, 2])
class MLPEmbedder(Embedder):
    """Multi-layer perceptron embedder for vectorial data.

    A ReLU nonlinearity follows every layer but the final one.
    """

    def __init__(self, layers, *args, **kwargs):
        """Constructs MLPEmbedder.

        Args:
          layers: a list of python integers, the size of each layer.
          *args: positional arguments forwarded to the super constructor.
          **kwargs: keyword arguments forwarded to the super constructor.
        """
        super(MLPEmbedder, self).__init__(*args, **kwargs)
        self._layers = layers

    def build(self, features):
        """Builds the perceptron on top of `features`.

        Args:
          features: a rank 2 tensor of feature vectors, or a rank 3 tensor
            whose two leading dimensions are merged before the layers are
            applied and restored afterwards.

        Returns:
          A tensor whose last dimension is the size of the final layer.
        """
        shape = features.get_shape().as_list()
        is_sequence = len(shape) == 3
        if is_sequence:
            features = tf.reshape(features, [shape[0] * shape[1], shape[2]])

        last_index = len(self._layers) - 1
        hidden = features
        for index, width in enumerate(self._layers):
            with tf.variable_scope('layer_%i' % index):
                hidden = self._fully_connected(hidden, width)
                # The output layer stays linear.
                if index != last_index:
                    hidden = self._relu(hidden)

        if is_sequence:
            hidden = tf.reshape(
                hidden, [shape[0], shape[1], self._layers[-1]])
        return hidden

    def _fully_connected(self, x, out_dim):
        """Applies an affine layer of `out_dim` units to the rank 2 `x`."""
        in_dim = x.get_shape()[1]
        weights = tf.get_variable(
            'DW', [in_dim, out_dim],
            initializer=tf.variance_scaling_initializer(
                distribution='uniform'))
        biases = tf.get_variable(
            'biases', [out_dim], initializer=tf.constant_initializer())
        return tf.nn.xw_plus_b(x, weights, biases)

    def _relu(self, x, leakiness=0.0):
        """Leaky ReLU: negative inputs are scaled by `leakiness`."""
        is_negative = tf.less(x, 0.0)
        scaled = leakiness * x
        return tf.where(is_negative, scaled, x, name='leaky_relu')
class SmallNetworkEmbedder(Embedder):
    """Embedder for image like observations.

    A stack of conv layers followed by one fully connected layer. The number
    of conv layers and their parameters come from params.
    """

    def __init__(self, params, *args, **kwargs):
        """Constructs the small network.

        Args:
          params: hyper-parameters of the network. It provides the lists
            conv_sizes, conv_strides and conv_channels, holding one entry per
            conv layer, the boolean to_one_hot telling whether the input is
            converted to one hot (of depth one_hot_length) first,
            weight_decay_rate for the L2 regularizer, and embedding_size, the
            size of the fully connected layer.
          *args: positional arguments forwarded to the super constructor.
          **kwargs: keyword arguments forwarded to the super constructor.

        Raises:
          ValueError: If the lengths of params.conv_strides, params.conv_sizes
            and params.conv_channels are not all equal.
        """
        super(SmallNetworkEmbedder, self).__init__(*args, **kwargs)
        self._params = params

        num_sizes = len(params.conv_sizes)
        num_strides = len(params.conv_strides)
        if num_sizes != num_strides:
            raise ValueError(
                'Conv sizes and strides should have the same length: {} != {}'
                .format(num_sizes, num_strides))

        num_channels = len(params.conv_channels)
        if num_sizes != num_channels:
            raise ValueError(
                'Conv sizes and channels should have the same length: {} != {}'
                .format(num_sizes, num_channels))

    def build(self, images):
        """Builds the embedder with the given specification.

        Args:
          images: a tensor of input images of shape NxTxHxWxC where N is the
            batch size, T is the maximum length of the sequence, H and W are
            the height and width of the images and C is the number of
            channels.

        Returns:
          A tensor with the embedding of every image, with leading dimensions
          N and T.
        """
        params = self._params
        shape = images.get_shape().as_list()
        flat_shape = [shape[0] * shape[1], shape[2], shape[3], shape[4]]
        images = tf.reshape(images, flat_shape)

        shared_scope = slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            activation_fn=tf.nn.relu,
            weights_regularizer=slim.l2_regularizer(params.weight_decay_rate),
            biases_initializer=tf.zeros_initializer())
        with shared_scope:
            with slim.arg_scope([slim.conv2d], padding='SAME'):
                net = images
                if params.to_one_hot:
                    labels = tf.squeeze(tf.to_int32(images), axis=[-1])
                    net = tf.one_hot(labels, params.one_hot_length)

                # One conv layer per (size, stride, channels) triple.
                layer_specs = zip(params.conv_sizes, params.conv_strides,
                                  params.conv_channels)
                for layer_id, spec in enumerate(layer_specs, 1):
                    kernel_size, stride, channels = spec
                    net = slim.conv2d(
                        net,
                        channels, [kernel_size, kernel_size],
                        stride,
                        scope='conv_{}'.format(layer_id))

                net = slim.flatten(net)
                net = slim.fully_connected(
                    net, params.embedding_size, scope='fc')
                return tf.reshape(net, [shape[0], shape[1], -1])
class ResNet50Embedder(Embedder):
    """Uses ResNet50 to embed input images."""

    def build(self, images):
        """Builds a ResNet50 embedder for the input images.

        It assumes that the range of the pixel values in the images tensor is
        [0,255] and should be castable to tf.uint8.

        Args:
          images: a tensor that contains the input images which has the shape
            of NxTxHxWx3 where N is the batch size, T is the maximum length of
            the sequence, H and W are the height and width of the images and 3
            is the number of channels.

        Returns:
          The embedding of the input image with the shape of NxTxL where L is
          the embedding size of the output.

        Raises:
          ValueError: if the shape of the input does not agree with the
            expected shape explained in the Args section.
        """
        shape = images.get_shape().as_list()
        if len(shape) != 5:
            raise ValueError(
                'The tensor shape should have 5 elements, {} is provided'
                .format(len(shape)))
        if shape[4] != 3:
            raise ValueError('Three channels are expected for the input image')

        images = tf.cast(images, tf.uint8)
        # Merge batch and time so the network sees a plain batch of images.
        images = tf.reshape(
            images, [shape[0] * shape[1], shape[2], shape[3], shape[4]])
        # resize_bilinear takes a whole rank 4 batch, so all images are resized
        # to 299x299 in one op instead of one at a time through tf.map_fn.
        images = tf.image.resize_bilinear(
            images, [299, 299], align_corners=False)

        with slim.arg_scope(resnet_v2.resnet_arg_scope()):
            net, _ = resnet_v2.resnet_v2_50(
                images, is_training=False, global_pool=True)
        output = tf.reshape(net, [shape[0], shape[1], -1])
        return output
class IdentityEmbedder(Embedder):
    """Pass-through embedder whose output is its input.

    Meant for modalities that already are their own embedding, for example a
    one_hot goal.
    """

    def build(self, images):
        """Returns `images` unchanged."""
        embedding = images
        return embedding