"""Layers for VatxtModel.""" |
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
|
|
|
|
from six.moves import xrange |
|
import tensorflow as tf |
|
K = tf.keras |
|
|
|
|
|
def cl_logits_subgraph(layer_sizes, input_size, num_classes, keep_prob=1.):
  """Construct multiple ReLU layers with dropout and a linear layer."""
  subgraph = K.models.Sequential(name='cl_logits')
  for i, layer_size in enumerate(layer_sizes):
    if i == 0:
      subgraph.add(
          K.layers.Dense(layer_size, activation='relu', input_dim=input_size))
    else:
      subgraph.add(K.layers.Dense(layer_size, activation='relu'))

    if keep_prob < 1.:
      subgraph.add(K.layers.Dropout(1. - keep_prob))
  subgraph.add(K.layers.Dense(1 if num_classes == 2 else num_classes))
  return subgraph
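# A minimal usage sketch (names and shapes here are hypothetical, not part of
# the model's API): a two-hidden-layer classifier head over 128-d features.
# With num_classes=2 the final Dense emits a single logit, to be paired with
# sigmoid cross-entropy (see classification_loss below).
#
#   cl_logits = cl_logits_subgraph(
#       [64, 32], input_size=128, num_classes=2, keep_prob=0.9)
#   logits = cl_logits(features)  # features: [batch, 128] float32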
|
|
class Embedding(K.layers.Layer):
  """Embedding layer with frequency-based normalization and dropout."""

  def __init__(self,
               vocab_size,
               embedding_dim,
               normalize=False,
               vocab_freqs=None,
               keep_prob=1.,
               **kwargs):
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.normalized = normalize
    self.keep_prob = keep_prob

    if normalize:
      assert vocab_freqs is not None
      self.vocab_freqs = tf.constant(
          vocab_freqs, dtype=tf.float32, shape=(vocab_size, 1))

    super(Embedding, self).__init__(**kwargs)

  def build(self, input_shape):
    with tf.device('/cpu:0'):
      self.var = self.add_weight(
          shape=(self.vocab_size, self.embedding_dim),
          initializer=tf.random_uniform_initializer(-1., 1.),
          name='embedding',
          dtype=tf.float32)

    if self.normalized:
      self.var = self._normalize(self.var)

    super(Embedding, self).build(input_shape)

  def call(self, x):
    embedded = tf.nn.embedding_lookup(self.var, x)
    if self.keep_prob < 1.:
      shape = embedded.get_shape().as_list()

      # Use the same dropout mask at each timestep by fixing noise_shape's
      # time dimension to 1 (cf. variational RNN dropout,
      # https://arxiv.org/abs/1512.05287). This slightly improves performance.
      embedded = tf.nn.dropout(
          embedded, self.keep_prob, noise_shape=(shape[0], 1, shape[2]))
    return embedded

  def _normalize(self, emb):
    weights = self.vocab_freqs / tf.reduce_sum(self.vocab_freqs)
    mean = tf.reduce_sum(weights * emb, 0, keep_dims=True)
    var = tf.reduce_sum(weights * tf.pow(emb - mean, 2.), 0, keep_dims=True)
    stddev = tf.sqrt(1e-6 + var)
    return (emb - mean) / stddev
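# The normalization in Embedding._normalize standardizes each embedding
# dimension under the unigram distribution: with frequencies f_i and
# w_i = f_i / sum_j f_j, it computes mean_d = sum_i w_i * E[i, d] and
# var_d = sum_i w_i * (E[i, d] - mean_d)^2, so frequent tokens dominate the
# statistics. A usage sketch with hypothetical sizes (vocab_freqs would be
# per-token counts from the training corpus):
#
#   emb = Embedding(vocab_size=10000, embedding_dim=256, normalize=True,
#                   vocab_freqs=vocab_freqs, keep_prob=0.9)
#   embedded = emb(token_ids)  # token_ids: [batch, timesteps] int32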
|
|
class LSTM(object):
  """LSTM layer using dynamic_rnn.

  Exposes variables in `trainable_weights` property.
  """

  def __init__(self, cell_size, num_layers=1, keep_prob=1., name='LSTM'):
    self.cell_size = cell_size
    self.num_layers = num_layers
    self.keep_prob = keep_prob
    self.reuse = None
    self.trainable_weights = None
    self.name = name

  def __call__(self, x, initial_state, seq_length):
    with tf.variable_scope(self.name, reuse=self.reuse) as vs:
      cell = tf.contrib.rnn.MultiRNNCell([
          tf.contrib.rnn.BasicLSTMCell(
              self.cell_size,
              forget_bias=0.0,
              reuse=tf.get_variable_scope().reuse)
          for _ in xrange(self.num_layers)
      ])

      # shape(x) = (batch_size, num_timesteps, embedding_dim)

      lstm_out, next_state = tf.nn.dynamic_rnn(
          cell, x, initial_state=initial_state, sequence_length=seq_length)

      # shape(lstm_out) = (batch_size, num_timesteps, cell_size)

      if self.keep_prob < 1.:
        lstm_out = tf.nn.dropout(lstm_out, self.keep_prob)

      if self.reuse is None:
        self.trainable_weights = vs.global_variables()

    self.reuse = True

    return lstm_out, next_state
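# Usage sketch (hypothetical shapes): the layer captures its variables on the
# first call and reuses them afterwards via variable-scope reuse, so one LSTM
# object can be applied to several inputs with shared weights.
#
#   lstm = LSTM(cell_size=512, num_layers=1, keep_prob=0.9)
#   lstm_out, next_state = lstm(embedded, initial_state, seq_length)
#   lstm_out2, _ = lstm(other_embedded, initial_state, seq_length)  # shared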
|
|
class SoftmaxLoss(K.layers.Layer):
  """Softmax xentropy loss with candidate sampling."""

  def __init__(self,
               vocab_size,
               num_candidate_samples=-1,
               vocab_freqs=None,
               **kwargs):
    self.vocab_size = vocab_size
    self.num_candidate_samples = num_candidate_samples
    self.vocab_freqs = vocab_freqs
    super(SoftmaxLoss, self).__init__(**kwargs)
    self.multiclass_dense_layer = K.layers.Dense(self.vocab_size)

  def build(self, input_shape):
    input_shape = input_shape[0].as_list()
    with tf.device('/cpu:0'):
      self.lin_w = self.add_weight(
          shape=(input_shape[-1], self.vocab_size),
          name='lm_lin_w',
          initializer=K.initializers.glorot_uniform())
      self.lin_b = self.add_weight(
          shape=(self.vocab_size,),
          name='lm_lin_b',
          initializer=K.initializers.glorot_uniform())
      self.multiclass_dense_layer.build(input_shape)

    super(SoftmaxLoss, self).build(input_shape)

  def call(self, inputs):
    x, labels, weights = inputs
    if self.num_candidate_samples > -1:
      assert self.vocab_freqs is not None
      labels_reshaped = tf.reshape(labels, [-1])
      labels_reshaped = tf.expand_dims(labels_reshaped, -1)
      sampled = tf.nn.fixed_unigram_candidate_sampler(
          true_classes=labels_reshaped,
          num_true=1,
          num_sampled=self.num_candidate_samples,
          unique=True,
          range_max=self.vocab_size,
          unigrams=self.vocab_freqs)
      inputs_reshaped = tf.reshape(x, [-1, int(x.get_shape()[2])])

      lm_loss = tf.nn.sampled_softmax_loss(
          weights=tf.transpose(self.lin_w),
          biases=self.lin_b,
          labels=labels_reshaped,
          inputs=inputs_reshaped,
          num_sampled=self.num_candidate_samples,
          num_classes=self.vocab_size,
          sampled_values=sampled)
      lm_loss = tf.reshape(
          lm_loss,
          [int(x.get_shape()[0]), int(x.get_shape()[1])])
    else:
      logits = self.multiclass_dense_layer(x)
      lm_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels)

    lm_loss = tf.identity(
        tf.reduce_sum(lm_loss * weights) / _num_labels(weights),
        name='lm_xentropy_loss')
    return lm_loss
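# Usage sketch (hypothetical values): language-model loss over LSTM outputs.
# With num_candidate_samples > -1 a fixed-unigram sampled softmax is used, so
# vocab_freqs (per-token counts or frequencies) must be provided; with -1 the
# full softmax is computed through the dense layer instead.
#
#   lm_loss = SoftmaxLoss(vocab_size=10000, num_candidate_samples=1024,
#                         vocab_freqs=vocab_freqs)
#   loss = lm_loss([lstm_out, next_token_ids, token_weights])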
|
|
def classification_loss(logits, labels, weights):
  """Computes cross entropy loss between logits and labels.

  Args:
    logits: 2-D [timesteps*batch_size, m] float tensor, where m=1 if
      num_classes=2, otherwise m=num_classes.
    labels: 1-D [timesteps*batch_size] integer tensor.
    weights: 1-D [timesteps*batch_size] float tensor.

  Returns:
    Loss scalar of type float.
  """
  inner_dim = logits.get_shape().as_list()[-1]
  with tf.name_scope('classifier_loss'):
    # Logistic loss for the single-logit binary head.
    if inner_dim == 1:
      loss = tf.nn.sigmoid_cross_entropy_with_logits(
          logits=tf.squeeze(logits, -1), labels=tf.cast(labels, tf.float32))
    # Softmax loss for the multi-class head.
    else:
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels)

    num_lab = _num_labels(weights)
    tf.summary.scalar('num_labels', num_lab)
    return tf.identity(
        tf.reduce_sum(weights * loss) / num_lab,
        name='classification_xentropy')
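# Example call (hypothetical tensors): with a single-logit binary head the
# labels are cast to float for sigmoid cross-entropy; with m > 1 logits they
# stay integer class ids for sparse softmax cross-entropy.
#
#   cl_loss = classification_loss(logits, labels, weights)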
|
|
def accuracy(logits, targets, weights):
  """Computes prediction accuracy.

  Args:
    logits: 2-D classifier logits [timesteps*batch_size, num_classes].
    targets: 1-D [timesteps*batch_size] integer tensor.
    weights: 1-D [timesteps*batch_size] float tensor.

  Returns:
    Accuracy: float scalar.
  """
  with tf.name_scope('accuracy'):
    eq = tf.cast(tf.equal(predictions(logits), targets), tf.float32)
    return tf.identity(
        tf.reduce_sum(weights * eq) / _num_labels(weights), name='accuracy')
|
|
def predictions(logits):
  """Class prediction from logits."""
  inner_dim = logits.get_shape().as_list()[-1]
  with tf.name_scope('predictions'):
    # Binary classification: predict the positive class iff the logit > 0.
    if inner_dim == 1:
      pred = tf.cast(tf.greater(tf.squeeze(logits, -1), 0.), tf.int64)
    # Multi-class classification: argmax over the class dimension. Logits are
    # 2-D [timesteps*batch_size, num_classes], so the reduction axis is 1.
    else:
      pred = tf.argmax(logits, 1)
  return pred
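# Worked example of the two branches (hypothetical values): a single logit of
# 0.3 yields prediction 1, since logit > 0 means p > 0.5; multi-class logits
# [[1.2, -0.5, 0.1]] yield class 0 via the argmax over axis 1.
#
#   preds = predictions(logits)
#   acc = accuracy(logits, targets, weights)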
|
|
def _num_labels(weights):
  """Sum of weights; returns 1. if the sum is 0, to avoid division by zero."""
  num_labels = tf.reduce_sum(weights)
  num_labels = tf.where(tf.equal(num_labels, 0.), 1., num_labels)
  return num_labels
|
|
def optimize(loss,
             global_step,
             max_grad_norm,
             lr,
             lr_decay,
             sync_replicas=False,
             replicas_to_aggregate=1,
             task_id=0):
  """Builds optimization graph.

  * Creates an optimizer, and optionally wraps with SyncReplicasOptimizer
  * Computes, clips, and applies gradients
  * Maintains moving averages for all trainable variables
  * Summarizes variables and gradients

  Args:
    loss: scalar loss to minimize.
    global_step: integer scalar Variable.
    max_grad_norm: float scalar. Grads will be clipped to this value.
    lr: float scalar, learning rate.
    lr_decay: float scalar, learning rate decay rate.
    sync_replicas: bool, whether to use SyncReplicasOptimizer.
    replicas_to_aggregate: int, number of replicas to aggregate when using
      SyncReplicasOptimizer.
    task_id: int, id of the current task; used to ensure proper initialization
      of SyncReplicasOptimizer.

  Returns:
    train_op
  """
  with tf.name_scope('optimization'):
    # Compute gradients.
    tvars = tf.trainable_variables()
    grads = tf.gradients(
        loss,
        tvars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)

    # Clip non-embedding grads; embedding grads are left unclipped.
    non_embedding_grads_and_vars = [(g, v) for (g, v) in zip(grads, tvars)
                                    if 'embedding' not in v.op.name]
    embedding_grads_and_vars = [(g, v) for (g, v) in zip(grads, tvars)
                                if 'embedding' in v.op.name]

    ne_grads, ne_vars = zip(*non_embedding_grads_and_vars)
    ne_grads, _ = tf.clip_by_global_norm(ne_grads, max_grad_norm)
    non_embedding_grads_and_vars = zip(ne_grads, ne_vars)

    grads_and_vars = embedding_grads_and_vars + list(
        non_embedding_grads_and_vars)

    # Summarize variables and gradients.
    _summarize_vars_and_grads(grads_and_vars)

    # Decaying learning rate.
    lr = tf.train.exponential_decay(
        lr, global_step, 1, lr_decay, staircase=True)
    tf.summary.scalar('learning_rate', lr)
    opt = tf.train.AdamOptimizer(lr)

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(0.999, global_step)

    # Apply gradients.
    if sync_replicas:
      opt = tf.train.SyncReplicasOptimizer(
          opt,
          replicas_to_aggregate,
          variable_averages=variable_averages,
          variables_to_average=tvars,
          total_num_replicas=replicas_to_aggregate)
      apply_gradient_op = opt.apply_gradients(
          grads_and_vars, global_step=global_step)
      with tf.control_dependencies([apply_gradient_op]):
        train_op = tf.no_op(name='train_op')

      # Initialization ops; both chief and workers register a local init op.
      tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                           opt.get_chief_queue_runner())
      if task_id == 0:  # Chief task.
        local_init_op = opt.chief_init_op
        tf.add_to_collection('chief_init_op', opt.get_init_tokens_op())
      else:
        local_init_op = opt.local_step_init_op
      tf.add_to_collection('local_init_op', local_init_op)
      tf.add_to_collection('ready_for_local_init_op',
                           opt.ready_for_local_init_op)
    else:
      # Non-sync optimizer: the moving-average update runs after the
      # gradient step, as part of train_op.
      apply_gradient_op = opt.apply_gradients(grads_and_vars, global_step)
      with tf.control_dependencies([apply_gradient_op]):
        train_op = variable_averages.apply(tvars)

  return train_op
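# Usage sketch (hyperparameters are illustrative): per-step exponential decay
# at rate lr_decay, gradient-norm clipping for non-embedding variables, and
# EMA-tracked weights applied as part of the returned train_op.
#
#   global_step = tf.train.get_or_create_global_step()
#   train_op = optimize(total_loss, global_step, max_grad_norm=1.0,
#                       lr=1e-3, lr_decay=0.9999)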
|
|
def _summarize_vars_and_grads(grads_and_vars):
  tf.logging.info('Trainable variables:')
  tf.logging.info('-' * 60)
  for grad, var in grads_and_vars:
    tf.logging.info(var)

    def tag(name, v=var):
      return v.op.name + '_' + name

    # Variable summaries.
    mean = tf.reduce_mean(var)
    tf.summary.scalar(tag('mean'), mean)
    with tf.name_scope(tag('stddev')):
      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
    tf.summary.scalar(tag('stddev'), stddev)
    tf.summary.scalar(tag('max'), tf.reduce_max(var))
    tf.summary.scalar(tag('min'), tf.reduce_min(var))
    tf.summary.histogram(tag('histogram'), var)

    # Gradient summaries.
    if grad is not None:
      if isinstance(grad, tf.IndexedSlices):
        grad_values = grad.values
      else:
        grad_values = grad

      tf.summary.histogram(tag('gradient'), grad_values)
      tf.summary.scalar(tag('gradient_norm'), tf.global_norm([grad_values]))
    else:
      tf.logging.info('Var %s has no gradient', var.op.name)