|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Utilities and helper functions.""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import numpy as np |
|
import tensorflow as tf |
|
|
|
|
|
def make_finite(t, replacement):
    """Swaps every NaN/Inf entry of a tensor for a replacement entry.

    Args:
      t: the tensor to sanitize
      replacement: values used wherever `t` is not finite; passed directly
        to tf.where as the alternative branch, so it must be compatible
        with `t` under tf.where's shape rules

    Returns:
      A tensor equal to `t` where `t` is finite and to `replacement`
      elsewhere.
    """
    finite_mask = tf.is_finite(t)
    return tf.where(finite_mask, t, replacement)
|
|
|
|
|
def asinh(x):
    """Computes the inverse hyperbolic sine function (in tensorflow).

    Uses the odd symmetry asinh(x) = sign(x) * log(|x| + sqrt(1 + x^2)).
    Evaluating log(x + sqrt(1 + x^2)) directly loses all precision for
    large negative x, because x and sqrt(1 + x^2) cancel; in float32 the
    sum already rounds to 0 around x = -1e4, which yields log(0) = -inf.
    Working on |x| keeps the argument of the log >= 1.

    Args:
      x: a floating point tensor (or value convertible to one)

    Returns:
      A tensor with the elementwise inverse hyperbolic sine of x.
    """
    is_negative = tf.less(x, 0.)

    # |x| and the sign are built with tf.where rather than tf.abs/tf.sign:
    # those have a zero gradient at x == 0, whereas the derivative of asinh
    # at 0 is 1. Both tf.where branches are finite, so no NaNs leak into
    # the gradient either.
    magnitude = tf.where(is_negative, -x, x)
    log_term = tf.log(magnitude + tf.sqrt(1. + tf.square(magnitude)))
    return tf.where(is_negative, -log_term, log_term)
|
|
|
|
|
def affine(inputs, output_size, scope="Affine", scale=0.1, vec_mean=0.,
           include_bias=True, bias_init=0., random_seed=None):
    """Applies an affine map, inputs @ Matrix (+ Bias), to the inputs.

    The parameters are obtained through tf.get_variable under the given
    variable scope, as a "Matrix" variable and (optionally) a "Bias"
    variable.

    If the inputs are a list of tensors, they are concatenated along axis 1
    before the map is applied.

    The matrix is initialized from a Gaussian with mean `vec_mean` and
    standard deviation `scale / sqrt(input_size)`. The bias is initialized
    to the constant `bias_init`.

    Args:
      inputs: List of tensors with shape (batch_size, input_size). The size
        of axis 1 after concatenation is read from the static shape, so it
        must be statically known.
      output_size: Size (dimension) of the output
      scope: Variable scope for these parameters (default: "Affine")
      scale: Initial weight scale for the matrix parameters (default: 0.1);
        it is divided by the sqrt of the input size to get the std.
        deviation of the initial weights
      vec_mean: Mean of the Gaussian matrix initializer (default: 0.)
      include_bias: Whether to add the bias term (default: True). When
        False no "Bias" variable is requested and 0. is added instead.
      bias_init: Constant the bias is initialized to (default: 0.)
      random_seed: Seed for the random matrix initializer (default: None)

    Returns:
      output: Tensor with shape (batch_size, output_size)
    """
    x = tf.concat(inputs, 1)

    with tf.variable_scope(scope):
        input_size = x.get_shape().as_list()[1]

        weight_init = tf.random_normal_initializer(
            mean=vec_mean,
            stddev=scale / np.sqrt(input_size),
            seed=random_seed)
        matrix = tf.get_variable(
            "Matrix", [input_size, output_size],
            dtype=tf.float32, initializer=weight_init)

        # Without a bias, a plain 0. keeps the expression below uniform.
        bias = 0.
        if include_bias:
            bias = tf.get_variable(
                "Bias", [output_size], dtype=tf.float32,
                initializer=tf.constant_initializer(bias_init, tf.float32))

        output = tf.matmul(x, matrix) + bias

    return output
|
|
|
|
|
def project(inputs, weights, bias=0., activation=tf.identity):
    """Projects the inputs through a weight matrix, bias and activation.

    Args:
      inputs: matrix of inputs with shape [batch_size, dim]
      weights: weight matrix with shape [dim, output_dim]
      bias: bias vector with shape [output_dim] (default: 0)
      activation: callable applied to the projected values
        (default: tf.identity)

    Returns:
      outputs: an op which computes activation(inputs @ weights + bias)
    """
    pre_activation = tf.matmul(inputs, weights) + bias
    return activation(pre_activation)
|
|
|
|
|
def new_mean_squared(grad_vec, decay, ms):
    """Computes the updated running mean of the squared gradient.

    The result is
        (1 - decay) * (grad_vec^2 + 1e-12) + decay * ms
    except that when every entry of `ms` is exactly 0 the decay is replaced
    by zeros, so the result is just grad_vec^2 + 1e-12.

    Args:
      grad_vec: the vector for the current gradient
      decay: the decay term, a tensor; assertions that its entries lie in
        [0, 1] are attached to the squared-gradient op
      ms: the previous mean_squared value

    Returns:
      the new mean_squared value
    """
    num_decay_elems = decay.get_shape().num_elements()
    upper_bound_check = tf.assert_less_equal(
        decay, 1., summarize=num_decay_elems)
    lower_bound_check = tf.assert_greater_equal(
        decay, 0., summarize=num_decay_elems)

    # The range checks run whenever the squared gradient is evaluated.
    with tf.control_dependencies([upper_bound_check, lower_bound_check]):
        grad_squared = tf.square(grad_vec)

    # An all-zero previous value means there is no history to average with,
    # so the decay is dropped and the fresh squared gradient is used alone.
    ms_is_all_zero = tf.reduce_all(tf.equal(ms, 0.))
    effective_decay = tf.cond(
        ms_is_all_zero,
        lambda: tf.zeros_like(decay, dtype=tf.float32),
        lambda: decay)

    epsilon = 1e-12
    fresh_term = (1. - effective_decay) * (grad_squared + epsilon)
    return fresh_term + effective_decay * ms
|
|
|
|
|
def rms_scaling(gradient, decay, ms, update_ms=True):
    """Flattens a gradient and rescales it by its running RMS.

    The gradient is reshaped into a single column, divided elementwise by
    sqrt(ms + 1e-16) and squashed with asinh.

    Args:
      gradient: the current gradient
      decay: the current decay value.
      ms: the previous mean squared value
      update_ms: Whether to update the mean squared value before scaling
        (default: True)

    Returns:
      The scaled gradient (shape [-1, 1]) and the mean squared value that
      was used for scaling: the new value if update_ms is True, the old
      value otherwise.
    """
    grad_column = tf.reshape(gradient, [-1, 1])

    current_ms = (
        new_mean_squared(grad_column, decay, ms) if update_ms else ms)

    # The small constant keeps the division well defined when ms is 0.
    rms = tf.sqrt(current_ms + 1e-16)
    return asinh(grad_column / rms), current_ms
|
|
|
|
|
def accumulate_sparse_gradients(grad):
    """Sums the values of a sparse gradient that share the same index.

    Args:
      grad: a tf.IndexedSlices gradient (its `indices` and `values`
        attributes are read)

    Returns:
      grad_indices: unique indices
      grad_values: gradient values corresponding to the indices, with the
        values of repeated indices summed together
    """
    unique_indices, segment_ids = tf.unique(grad.indices)
    num_unique = tf.shape(unique_indices)[0]
    summed_values = tf.unsorted_segment_sum(
        grad.values, segment_ids, num_unique)
    return unique_indices, summed_values
|
|
|
|
|
def slice_tensor(dense_tensor, indices, head_dims):
    """Extracts slices from a partially flattened dense tensor.

    dense_tensor has shape [D_0, D_1, ...] where D_0 is the flattened form
    of head_dims, i.e. prod(head_dims) == D_0. The tensor is reshaped to
    head_dims + [D_1, ...], slices are gathered along the first dimension
    of head_dims, and the leading dimensions of the result are flattened
    again.

    Example:

    With head_dims = [100, 2] and a dense_tensor of shape [200, 3], the
    tensor is viewed as [100, 2, 3]. Gathering 4 indices gives [4, 2, 3],
    which is returned flattened as [8, 3].

    Args:
      dense_tensor: a N-D dense tensor. Shape: [D_0, D_1, ...]
      indices: a 1-D integer tensor indexing the first dimension of
        head_dims. Shape: [K]
      head_dims: True dimensions of the dense_tensor's first dimension.

    Returns:
      Extracted slices. Shape [K * prod(head_dims[1:]), D_1, ...]
    """
    tail_dims = tf.shape(dense_tensor)[1:]

    # Undo the flattening of the leading dimension so whole slices can be
    # picked out with a single gather.
    unflattened_shape = tf.concat([head_dims, tail_dims], 0)
    unflattened = tf.reshape(dense_tensor, unflattened_shape)

    gathered = tf.gather(unflattened, indices)

    flattened_shape = tf.concat([[-1], tail_dims], 0)
    return tf.reshape(gathered, flattened_shape)
|
|
|
|
|
def stack_tensor(slices, indices, dense_tensor, head_dims):
    """Reconstitutes a tensor from slices and corresponding indices.

    This is an inverse operation to slice_tensor. Missing slices are set
    to 0.

    Args:
      slices: a tensor holding the slices in flattened form; it is reshaped
        to [-1] + head_dims[1:] + [D_1, ...] so that there is one entry per
        index.
      indices: a 1-D integer tensor indexing the first dimension of
        head_dims (cast to int32 here). Shape: [K]
      dense_tensor: the original tensor the slices were taken
        from; only its shape is used. Shape: [D_0, D_1, ...]
      head_dims: True dimensions of the dense_tensor's first dimension.

    Returns:
      Reconstituted tensor. Shape: [D_0, D_1, ...]
    """
    indices = tf.cast(indices, tf.int32)

    tail_dims = tf.shape(dense_tensor)[1:]
    dense_shape = tf.concat([head_dims, tail_dims], 0)

    # One slice per index, each of shape dense_shape[1:].
    per_index_shape = tf.concat([[-1], dense_shape[1:]], 0)
    slices = tf.reshape(slices, per_index_shape)

    # scatter_nd expects one index vector per update, hence shape [K, 1].
    scatter_indices = tf.expand_dims(indices, -1)
    scattered = tf.scatter_nd(scatter_indices, slices, dense_shape)

    return tf.reshape(scattered, tf.shape(dense_tensor))
|
|
|
|
|
def update_slices(slices, indices, dense_tensor, head_dims):
    """Reconstitutes a tensor from slices and corresponding indices.

    Like stack_tensor, but instead of setting missing slices to 0, sets
    them to what they were in the original tensor. The return value is
    reshaped to be the same as dense_tensor.

    An index counts as updated if it occurs at least once in `indices`.
    NOTE(review): indices are presumably unique (e.g. the output of
    tf.unique); if one is repeated, its slices are summed by the scatter
    in stack_tensor.

    Args:
      slices: a tensor of slices in the flattened layout accepted by
        stack_tensor (one slice of shape head_dims[1:] + [D_1, ...] per
        index).
      indices: a 1-D integer tensor indexing the first dimension of
        head_dims (cast to int32 here). Shape: [K]
      dense_tensor: the original tensor the slices were taken
        from. Shape: [D_0, D_1, ...]
      head_dims: True dimensions of the dense_tensor's first dimension.

    Returns:
      Reconstituted tensor. Shape: [D_0, D_1, ...]
    """
    indices = tf.cast(indices, tf.int32)

    tail_dims = tf.shape(dense_tensor)[1:]
    dense_shape = tf.concat([head_dims, tail_dims], 0)

    # Count how often each position along the first head dimension is
    # referenced. scatter_nd accumulates duplicates, so the count may
    # exceed 1; comparing with "> 0" (rather than "== 1") keeps a repeated
    # index marked as updated instead of silently reverting that row to
    # the original tensor.
    update_counts = tf.scatter_nd(
        tf.expand_dims(indices, -1),
        tf.fill(tf.shape(indices), 1),
        head_dims[:1])
    update_mask = tf.greater(update_counts, 0)

    reshaped_dense_slices = tf.reshape(
        stack_tensor(slices, indices, dense_tensor, head_dims), dense_shape)
    reshaped_dense_tensor = tf.reshape(dense_tensor, dense_shape)

    # The rank-1 mask chooses, per position along the first dimension,
    # between the freshly scattered slice and the original content.
    merged = tf.where(
        update_mask, reshaped_dense_slices, reshaped_dense_tensor)
    return tf.reshape(merged, tf.shape(dense_tensor))
|
|