"""The Neural GPU Model.""" |
|
|
|
import time |
|
|
|
import numpy as np |
|
from six.moves import xrange |
|
import tensorflow as tf |
|
|
|
from tensorflow.python.framework import function |
|
import data_utils as data |
|
|
|
do_jit = False |
|
jit_scope = tf.contrib.compiler.jit.experimental_jit_scope |
|
|
|
|
|
def conv_linear(args, kw, kh, nin, nout, rate, do_bias, bias_start, prefix): |
|
"""Convolutional linear map.""" |
|
if not isinstance(args, (list, tuple)): |
|
args = [args] |
|
with tf.variable_scope(prefix): |
|
with tf.device("/cpu:0"): |
|
k = tf.get_variable("CvK", [kw, kh, nin, nout]) |
|
if len(args) == 1: |
|
arg = args[0] |
|
else: |
|
arg = tf.concat(axis=3, values=args) |
|
res = tf.nn.convolution(arg, k, dilation_rate=(rate, 1), padding="SAME") |
|
if not do_bias: return res |
|
with tf.device("/cpu:0"): |
|
bias_term = tf.get_variable( |
|
"CvB", [nout], initializer=tf.constant_initializer(bias_start)) |
|
bias_term = tf.reshape(bias_term, [1, 1, 1, nout]) |
|
return res + bias_term |
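# Shape sketch (inferred from the call sites in this file): each element of
# `args` is a [batch, width, height, c_i] tensor; the args are concatenated on
# the channel axis so that sum(c_i) == nin, convolved with a [kw, kh, nin, nout]
# kernel dilated by `rate` along the first spatial axis, and an optional
# per-channel bias initialized to bias_start is added.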
|
|
|
|
|
def sigmoid_cutoff(x, cutoff): |
|
"""Sigmoid with cutoff, e.g., 1.2sigmoid(x) - 0.1.""" |
|
y = tf.sigmoid(x) |
|
if cutoff < 1.01: return y |
|
d = (cutoff - 1.0) / 2.0 |
|
return tf.minimum(1.0, tf.maximum(0.0, cutoff * y - d), name="cutoff_min") |
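# Example: with cutoff=1.2 this computes min(1, max(0, 1.2 * sigmoid(x) - 0.1)),
# a sigmoid that actually reaches 0 and 1 for moderately large |x| instead of
# only approaching them asymptotically.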
|
|
|
|
|
@function.Defun(tf.float32, noinline=True) |
|
def sigmoid_cutoff_12(x): |
|
"""Sigmoid with cutoff 1.2, specialized for speed and memory use.""" |
|
y = tf.sigmoid(x) |
|
return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1), name="cutoff_min_12") |
|
|
|
|
|
@function.Defun(tf.float32, noinline=True) |
|
def sigmoid_hard(x): |
|
"""Hard sigmoid.""" |
|
return tf.minimum(1.0, tf.maximum(0.0, 0.25 * x + 0.5)) |
|
|
|
|
|
def place_at14(decided, selected, it): |
|
"""Place selected at it-th coordinate of decided, dim=1 of 4.""" |
|
slice1 = decided[:, :it, :, :] |
|
slice2 = decided[:, it + 1:, :, :] |
|
return tf.concat(axis=1, values=[slice1, selected, slice2]) |
|
|
|
|
|
def place_at13(decided, selected, it): |
|
"""Place selected at it-th coordinate of decided, dim=1 of 3.""" |
|
slice1 = decided[:, :it, :] |
|
slice2 = decided[:, it + 1:, :] |
|
return tf.concat(axis=1, values=[slice1, selected, slice2]) |
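# Usage sketch: with decided of shape [B, L, C] and selected of shape [B, 1, C],
# place_at13(decided, selected, it) returns decided with its it-th slice along
# dim 1 replaced by selected; place_at14 does the same for 4-D tensors.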
|
|
|
|
|
def tanh_cutoff(x, cutoff): |
|
"""Tanh with cutoff, e.g., 1.1tanh(x) cut to [-1. 1].""" |
|
y = tf.tanh(x) |
|
if cutoff < 1.01: return y |
|
d = (cutoff - 1.0) / 2.0 |
|
return tf.minimum(1.0, tf.maximum(-1.0, (1.0 + d) * y)) |
|
|
|
|
|
@function.Defun(tf.float32, noinline=True) |
|
def tanh_hard(x): |
|
"""Hard tanh.""" |
|
return tf.minimum(1.0, tf.maximum(0.0, x)) |
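# Note: despite the name, this clamps to [0, 1] rather than [-1, 1]; it is the
# saturation used for the GRU candidate in conv_gru below when cutoff > 10.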
|
|
|
|
|
def layer_norm(x, nmaps, prefix, epsilon=1e-5): |
|
"""Layer normalize the 4D tensor x, averaging over the last dimension.""" |
|
with tf.variable_scope(prefix): |
|
scale = tf.get_variable("layer_norm_scale", [nmaps], |
|
initializer=tf.ones_initializer()) |
|
bias = tf.get_variable("layer_norm_bias", [nmaps], |
|
initializer=tf.zeros_initializer()) |
|
mean, variance = tf.nn.moments(x, [3], keep_dims=True) |
|
norm_x = (x - mean) / tf.sqrt(variance + epsilon) |
|
return norm_x * scale + bias |
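# Per position, over the last (channel) dimension of size nmaps:
#   norm_x = (x - mean) / sqrt(variance + epsilon)
# followed by a learned per-channel scale and bias.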
|
|
|
|
|
def conv_gru(inpts, mem, kw, kh, nmaps, rate, cutoff, prefix, do_layer_norm, |
|
args_len=None): |
|
"""Convolutional GRU.""" |
|
def conv_lin(args, suffix, bias_start): |
|
total_args_len = args_len or len(args) * nmaps |
|
res = conv_linear(args, kw, kh, total_args_len, nmaps, rate, True, |
|
bias_start, prefix + "/" + suffix) |
|
if do_layer_norm: |
|
return layer_norm(res, nmaps, prefix + "/" + suffix) |
|
else: |
|
return res |
|
if cutoff == 1.2: |
|
reset = sigmoid_cutoff_12(conv_lin(inpts + [mem], "r", 1.0)) |
|
gate = sigmoid_cutoff_12(conv_lin(inpts + [mem], "g", 1.0)) |
|
elif cutoff > 10: |
|
reset = sigmoid_hard(conv_lin(inpts + [mem], "r", 1.0)) |
|
gate = sigmoid_hard(conv_lin(inpts + [mem], "g", 1.0)) |
|
else: |
|
reset = sigmoid_cutoff(conv_lin(inpts + [mem], "r", 1.0), cutoff) |
|
gate = sigmoid_cutoff(conv_lin(inpts + [mem], "g", 1.0), cutoff) |
|
if cutoff > 10: |
|
candidate = tanh_hard(conv_lin(inpts + [reset * mem], "c", 0.0)) |
|
else: |
|
|
|
candidate = tf.tanh(conv_lin(inpts + [reset * mem], "c", 0.0)) |
|
return gate * mem + (1 - gate) * candidate |
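# The update follows the usual convolutional GRU form, with `gate` acting as
# the "keep" gate and sigma being one of the cutoff sigmoids above:
#   r = sigma(conv(inpts + [mem]))          # reset gate
#   g = sigma(conv(inpts + [mem]))          # update (keep) gate
#   c = tanh(conv(inpts + [r * mem]))       # candidate state
#   mem_new = g * mem + (1 - g) * c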
|
|
|
|
|
CHOOSE_K = 256 |
|
|
|
|
|
def memory_call(q, l, nmaps, mem_size, vocab_size, num_gpus, update_mem): |
|
raise ValueError("Fill for experiments with additional memory structures.") |
|
|
|
|
|
def memory_run(step, nmaps, mem_size, batch_size, vocab_size, |
|
global_step, do_training, update_mem, decay_factor, num_gpus, |
|
target_emb_weights, output_w, gpu_targets_tn, it): |
|
"""Run memory.""" |
|
q = step[:, 0, it, :] |
|
mlabels = gpu_targets_tn[:, it, 0] |
|
res, mask, mem_loss = memory_call( |
|
q, mlabels, nmaps, mem_size, vocab_size, num_gpus, update_mem) |
|
res = tf.gather(target_emb_weights, res) * tf.expand_dims(mask[:, 0], 1) |
|
|
|
|
|
gold = tf.nn.dropout(tf.gather(target_emb_weights, mlabels), 0.7) |
|
use_gold = 1.0 - tf.cast(global_step, tf.float32) / (1000. * decay_factor) |
|
use_gold = tf.maximum(use_gold, 0.2) * do_training |
|
mem = tf.cond(tf.less(tf.random_uniform([]), use_gold), |
|
lambda: use_gold * gold + (1.0 - use_gold) * res, |
|
lambda: res) |
|
mem = tf.reshape(mem, [-1, 1, 1, nmaps]) |
|
return mem, mem_loss, update_mem |
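# Scheduled sampling for the memory read: with probability use_gold the
# returned memory is the mix use_gold * gold + (1 - use_gold) * res, otherwise
# it is the raw memory result res. use_gold decays linearly in global_step,
# is floored at 0.2, and is zeroed outside training by do_training.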
|
|
|
|
|
@tf.RegisterGradient("CustomIdG") |
|
def _custom_id_grad(_, grads): |
|
return grads |
|
|
|
|
|
def quantize(t, quant_scale, max_value=1.0): |
|
"""Quantize a tensor t with each element in [-max_value, max_value].""" |
|
t = tf.minimum(max_value, tf.maximum(t, -max_value)) |
|
big = quant_scale * (t + max_value) + 0.5 |
|
with tf.get_default_graph().gradient_override_map({"Floor": "CustomIdG"}): |
|
res = (tf.floor(big) / quant_scale) - max_value |
|
return res |
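# The gradient_override_map above makes Floor behave as the identity in the
# backward pass (a straight-through estimator), so values are quantized on the
# forward pass while gradients flow through unchanged.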
|
|
|
|
|
def quantize_weights_op(quant_scale, max_value): |
|
ops = [v.assign(quantize(v, quant_scale, float(max_value))) |
|
for v in tf.trainable_variables()] |
|
return tf.group(*ops) |
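# Example (assumed usage, matching the values used for self.quantize_op below):
# sess.run(quantize_weights_op(512, 8)) snaps every trainable variable onto a
# grid of step 1/512 inside [-8, 8].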
|
|
|
|
|
def autoenc_quantize(x, nbits, nmaps, do_training, layers=1): |
|
"""Autoencoder into nbits vectors of bits, using noise and sigmoids.""" |
|
enc_x = tf.reshape(x, [-1, nmaps]) |
|
for i in xrange(layers - 1): |
|
enc_x = tf.layers.dense(enc_x, nmaps, name="autoenc_%d" % i) |
|
enc_x = tf.layers.dense(enc_x, nbits, name="autoenc_%d" % (layers - 1)) |
|
noise = tf.truncated_normal(tf.shape(enc_x), stddev=2.0) |
|
dec_x = sigmoid_cutoff_12(enc_x + noise * do_training) |
|
dec_x = tf.reshape(dec_x, [-1, nbits]) |
|
for i in xrange(layers): |
|
dec_x = tf.layers.dense(dec_x, nmaps, name="autodec_%d" % i) |
|
return tf.reshape(dec_x, tf.shape(x)) |
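# Adding truncated-normal noise before the saturating sigmoid_cutoff_12 during
# training (the noise is gated by do_training) pushes the nbits code toward
# near-binary values; at evaluation the code is the deterministic saturated
# sigmoid of the encoder logits.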
|
|
|
|
|
def make_dense(targets, noclass, low_param): |
|
"""Move a batch of targets to a dense 1-hot representation.""" |
|
low = low_param / float(noclass - 1) |
|
high = 1.0 - low * (noclass - 1) |
|
targets = tf.cast(targets, tf.int64) |
|
return tf.one_hot(targets, depth=noclass, on_value=high, off_value=low) |
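# Example: make_dense(targets, noclass=5, low_param=0.1) yields smoothed one-hot
# rows with off-value 0.1 / 4 = 0.025 and on-value 1 - 4 * 0.025 = 0.9.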
|
|
|
|
|
def reorder_beam(beam_size, batch_size, beam_val, output, is_first, |
|
tensors_to_reorder): |
|
"""Reorder to minimize beam costs.""" |
|
|
|
|
|
|
|
  outputs = tf.split(axis=0, num_or_size_splits=beam_size,
                     value=tf.nn.log_softmax(output))
|
all_beam_vals, all_beam_idx = [], [] |
|
beam_range = 1 if is_first else beam_size |
|
for i in xrange(beam_range): |
|
top_out, top_out_idx = tf.nn.top_k(outputs[i], k=beam_size) |
|
cur_beam_val = beam_val[:, i] |
|
top_out = tf.Print(top_out, [top_out, top_out_idx, beam_val, i, |
|
cur_beam_val], "GREPO", summarize=8) |
|
all_beam_vals.append(top_out + tf.expand_dims(cur_beam_val, 1)) |
|
all_beam_idx.append(top_out_idx) |
|
  all_beam_idx = tf.reshape(
      tf.transpose(tf.concat(axis=1, values=all_beam_idx), [1, 0]), [-1])
|
  top_beam, top_beam_idx = tf.nn.top_k(
      tf.concat(axis=1, values=all_beam_vals), k=beam_size)
|
top_beam_idx = tf.Print(top_beam_idx, [top_beam, top_beam_idx], |
|
"GREP", summarize=8) |
|
reordered = [[] for _ in xrange(len(tensors_to_reorder) + 1)] |
|
top_out_idx = [] |
|
for i in xrange(beam_size): |
|
which_idx = top_beam_idx[:, i] * batch_size + tf.range(batch_size) |
|
top_out_idx.append(tf.gather(all_beam_idx, which_idx)) |
|
    which_beam = top_beam_idx[:, i] // beam_size  # integer division: source beam index
|
which_beam = which_beam * batch_size + tf.range(batch_size) |
|
reordered[0].append(tf.gather(output, which_beam)) |
|
for i, t in enumerate(tensors_to_reorder): |
|
reordered[i + 1].append(tf.gather(t, which_beam)) |
|
new_tensors = [tf.concat(axis=0, values=t) for t in reordered] |
|
top_out_idx = tf.concat(axis=0, values=top_out_idx) |
|
return (top_beam, new_tensors[0], top_out_idx, new_tensors[1:]) |
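# Returns (top_beam, reordered_output, top_out_idx, reordered_tensors): the
# beam_size best cumulative log-probabilities per example, the logits gathered
# into the new beam order, the symbol index chosen for each surviving beam, and
# each tensor in tensors_to_reorder gathered into the same order.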
|
|
|
|
|
class NeuralGPU(object): |
|
"""Neural GPU Model.""" |
|
|
|
def __init__(self, nmaps, vec_size, niclass, noclass, dropout, |
|
max_grad_norm, cutoff, nconvs, kw, kh, height, mem_size, |
|
learning_rate, min_length, num_gpus, num_replicas, |
|
grad_noise_scale, sampling_rate, act_noise=0.0, do_rnn=False, |
|
atrous=False, beam_size=1, backward=True, do_layer_norm=False, |
|
autoenc_decay=1.0): |
|
|
|
self.nmaps = nmaps |
|
if backward: |
|
self.global_step = tf.Variable(0, trainable=False, name="global_step") |
|
self.cur_length = tf.Variable(min_length, trainable=False) |
|
self.cur_length_incr_op = self.cur_length.assign_add(1) |
|
self.lr = tf.Variable(learning_rate, trainable=False) |
|
self.lr_decay_op = self.lr.assign(self.lr * 0.995) |
|
self.do_training = tf.placeholder(tf.float32, name="do_training") |
|
self.update_mem = tf.placeholder(tf.int32, name="update_mem") |
|
self.noise_param = tf.placeholder(tf.float32, name="noise_param") |
|
|
|
|
|
self.input = tf.placeholder(tf.int32, name="inp") |
|
self.target = tf.placeholder(tf.int32, name="tgt") |
|
self.prev_step = tf.placeholder(tf.float32, name="prev_step") |
|
gpu_input = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.input) |
|
gpu_target = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.target) |
|
gpu_prev_step = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.prev_step) |
|
batch_size = tf.shape(gpu_input[0])[0] |
|
|
|
if backward: |
|
adam_lr = 0.005 * self.lr |
|
adam = tf.train.AdamOptimizer(adam_lr, epsilon=1e-3) |
|
|
|
def adam_update(grads): |
|
return adam.apply_gradients(zip(grads, tf.trainable_variables()), |
|
global_step=self.global_step, |
|
name="adam_update") |
|
|
|
|
|
if backward: |
|
global_step_float = tf.cast(self.global_step, tf.float32) |
|
sampling_decay_exponent = global_step_float / 100000.0 |
|
sampling_decay = tf.maximum(0.05, tf.pow(0.5, sampling_decay_exponent)) |
|
self.sampling = sampling_rate * 0.05 / sampling_decay |
|
else: |
|
self.sampling = tf.constant(0.0) |
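    # The scheduled-sampling probability ramps up with training: sampling_decay
    # falls as 0.5**(step / 100000) until it hits its 0.05 floor, so
    # self.sampling grows from 0.05 * sampling_rate toward sampling_rate.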
|
|
|
|
|
if num_replicas > 1 or num_gpus > 1: |
|
with tf.device("/cpu:0"): |
|
caching_const = tf.constant(0) |
|
tf.get_variable_scope().set_caching_device(caching_const.op.device) |
|
|
|
|
|
|
|
def gpu_avg(l): |
|
if l[0] is None: |
|
for elem in l: |
|
assert elem is None |
|
return 0.0 |
|
if len(l) < 2: |
|
return l[0] |
|
return sum(l) / float(num_gpus) |
|
|
|
self.length_tensor = tf.placeholder(tf.int32, name="length") |
|
|
|
with tf.device("/cpu:0"): |
|
emb_weights = tf.get_variable( |
|
"embedding", [niclass, vec_size], |
|
initializer=tf.random_uniform_initializer(-1.7, 1.7)) |
|
if beam_size > 0: |
|
target_emb_weights = tf.get_variable( |
|
"target_embedding", [noclass, nmaps], |
|
initializer=tf.random_uniform_initializer(-1.7, 1.7)) |
|
e0 = tf.scatter_update(emb_weights, |
|
tf.constant(0, dtype=tf.int32, shape=[1]), |
|
tf.zeros([1, vec_size])) |
|
output_w = tf.get_variable("output_w", [nmaps, noclass], tf.float32) |
|
|
|
def conv_rate(layer): |
|
if atrous: |
|
return 2**layer |
|
return 1 |
|
|
|
|
|
def enc_step(step): |
|
"""Encoder step.""" |
|
if autoenc_decay < 1.0: |
|
quant_step = autoenc_quantize(step, 16, nmaps, self.do_training) |
|
if backward: |
|
exp_glob = tf.train.exponential_decay(1.0, self.global_step - 10000, |
|
1000, autoenc_decay) |
|
dec_factor = 1.0 - exp_glob |
|
dec_factor = tf.cond(tf.less(self.global_step, 10500), |
|
lambda: tf.constant(0.05), lambda: dec_factor) |
|
else: |
|
dec_factor = 1.0 |
|
cur = tf.cond(tf.less(tf.random_uniform([]), dec_factor), |
|
lambda: quant_step, lambda: step) |
|
else: |
|
cur = step |
|
if dropout > 0.0001: |
|
cur = tf.nn.dropout(cur, keep_prob) |
|
if act_noise > 0.00001: |
|
cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale |
|
|
|
if do_jit and tf.get_variable_scope().reuse: |
|
with jit_scope(): |
|
for layer in xrange(nconvs): |
|
cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer), |
|
cutoff, "ecgru_%d" % layer, do_layer_norm) |
|
else: |
|
for layer in xrange(nconvs): |
|
cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer), |
|
cutoff, "ecgru_%d" % layer, do_layer_norm) |
|
return cur |
|
|
|
zero_tgt = tf.zeros([batch_size, nmaps, 1]) |
|
zero_tgt.set_shape([None, nmaps, 1]) |
|
|
|
def dec_substep(step, decided): |
|
"""Decoder sub-step.""" |
|
cur = step |
|
if dropout > 0.0001: |
|
cur = tf.nn.dropout(cur, keep_prob) |
|
if act_noise > 0.00001: |
|
cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale |
|
|
|
if do_jit and tf.get_variable_scope().reuse: |
|
with jit_scope(): |
|
for layer in xrange(nconvs): |
|
cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer), |
|
cutoff, "dcgru_%d" % layer, do_layer_norm) |
|
else: |
|
for layer in xrange(nconvs): |
|
cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer), |
|
cutoff, "dcgru_%d" % layer, do_layer_norm) |
|
return cur |
|
|
|
|
|
def dec_step(step, it, it_int, decided, output_ta, tgts, |
|
mloss, nupd_in, out_idx, beam_cost): |
|
"""Decoder step.""" |
|
nupd, mem_loss = 0, 0.0 |
|
if mem_size > 0: |
|
it_incr = tf.minimum(it+1, length - 1) |
|
mem, mem_loss, nupd = memory_run( |
|
step, nmaps, mem_size, batch_size, noclass, self.global_step, |
|
self.do_training, self.update_mem, 10, num_gpus, |
|
target_emb_weights, output_w, gpu_targets_tn, it_incr) |
|
step = dec_substep(step, decided) |
|
output_l = tf.expand_dims(tf.expand_dims(step[:, it, 0, :], 1), 1) |
|
|
|
output = tf.reshape(output_l, [-1, nmaps]) |
|
|
|
output = tf.matmul(output, output_w) |
|
if beam_size > 1: |
|
beam_cost, output, out, reordered = reorder_beam( |
|
beam_size, batch_size, beam_cost, output, it_int == 0, |
|
[output_l, out_idx, step, decided]) |
|
[output_l, out_idx, step, decided] = reordered |
|
else: |
|
|
|
out = tf.multinomial(tf.stop_gradient(output), 1) |
|
out = tf.to_int32(tf.squeeze(out, [1])) |
|
out_write = output_ta.write(it, output_l[:batch_size, :, :, :]) |
|
output = tf.gather(target_emb_weights, out) |
|
output = tf.reshape(output, [-1, 1, nmaps]) |
|
output = tf.concat(axis=1, values=[output] * height) |
|
tgt = tgts[it, :, :, :] |
|
selected = tf.cond(tf.less(tf.random_uniform([]), self.sampling), |
|
lambda: output, lambda: tgt) |
|
|
|
dec_write = place_at14(decided, tf.expand_dims(selected, 1), it) |
|
out_idx = place_at13( |
|
out_idx, tf.reshape(out, [beam_size * batch_size, 1, 1]), it) |
|
if mem_size > 0: |
|
mem = tf.concat(axis=2, values=[mem] * height) |
|
dec_write = place_at14(dec_write, mem, it_incr) |
|
return (step, dec_write, out_write, mloss + mem_loss, nupd_in + nupd, |
|
out_idx, beam_cost) |
|
|
|
|
|
gpu_outputs = [] |
|
gpu_losses = [] |
|
gpu_grad_norms = [] |
|
grads_list = [] |
|
gpu_out_idx = [] |
|
self.after_enc_step = [] |
|
for gpu in xrange(num_gpus): |
|
length = self.length_tensor |
|
length_float = tf.cast(length, tf.float32) |
|
if gpu > 0: |
|
tf.get_variable_scope().reuse_variables() |
|
gpu_outputs.append([]) |
|
gpu_losses.append([]) |
|
gpu_grad_norms.append([]) |
|
with tf.name_scope("gpu%d" % gpu), tf.device("/gpu:%d" % gpu): |
|
|
|
data.print_out("Creating model.") |
|
start_time = time.time() |
|
|
|
|
|
with tf.device("/cpu:0"): |
|
tgt_shape = tf.shape(tf.squeeze(gpu_target[gpu], [1])) |
|
weights = tf.where(tf.squeeze(gpu_target[gpu], [1]) > 0, |
|
tf.ones(tgt_shape), tf.zeros(tgt_shape)) |
|
|
|
|
|
with tf.control_dependencies([e0]): |
|
start = tf.gather(emb_weights, gpu_input[gpu]) |
|
gpu_targets_tn = gpu_target[gpu] |
|
if beam_size > 0: |
|
embedded_targets_tn = tf.gather(target_emb_weights, |
|
gpu_targets_tn) |
|
embedded_targets_tn = tf.transpose( |
|
embedded_targets_tn, [2, 0, 1, 3]) |
|
embedded_targets_tn = tf.concat(axis=2, values=[embedded_targets_tn] * height) |
|
|
|
|
|
start = tf.transpose(start, [0, 2, 1, 3]) |
|
first = conv_linear(start, 1, 1, vec_size, nmaps, 1, True, 0.0, "input") |
|
first = layer_norm(first, nmaps, "input") |
|
|
|
|
|
keep_prob = dropout * 3.0 / tf.sqrt(length_float) |
|
keep_prob = 1.0 - self.do_training * keep_prob |
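        # Dropout is scaled down for longer inputs: the dropped fraction is
        # dropout * 3 / sqrt(length), and do_training == 0 at evaluation time
        # makes keep_prob exactly 1.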
|
act_noise_scale = act_noise * self.do_training |
|
|
|
|
|
step = conv_gru([gpu_prev_step[gpu]], first, |
|
kw, kh, nmaps, 1, cutoff, "first", do_layer_norm) |
|
|
|
|
|
if do_rnn: |
|
self.after_enc_step.append(step) |
|
def lstm_cell(): |
|
return tf.contrib.rnn.BasicLSTMCell(height * nmaps) |
|
cell = tf.contrib.rnn.MultiRNNCell( |
|
[lstm_cell() for _ in range(nconvs)]) |
|
with tf.variable_scope("encoder"): |
|
encoder_outputs, encoder_state = tf.nn.dynamic_rnn( |
|
cell, tf.reshape(step, [batch_size, length, height * nmaps]), |
|
dtype=tf.float32, time_major=False) |
|
|
|
|
|
attn = tf.layers.dense( |
|
encoder_outputs, height * nmaps, name="attn1") |
|
|
|
|
|
@function.Defun(noinline=True) |
|
def attention_query(query, attn_v): |
|
vecs = tf.tanh(attn + tf.expand_dims(query, 1)) |
|
mask = tf.reduce_sum(vecs * tf.reshape(attn_v, [1, 1, -1]), 2) |
|
mask = tf.nn.softmax(mask) |
|
return tf.reduce_sum(encoder_outputs * tf.expand_dims(mask, 2), 1) |
|
|
|
with tf.variable_scope("decoder"): |
|
def decoder_loop_fn(state__prev_cell_out__unused, cell_inp__cur_tgt): |
|
"""Decoder loop function.""" |
|
state, prev_cell_out, _ = state__prev_cell_out__unused |
|
cell_inp, cur_tgt = cell_inp__cur_tgt |
|
attn_q = tf.layers.dense(prev_cell_out, height * nmaps, |
|
name="attn_query") |
|
attn_res = attention_query(attn_q, tf.get_variable( |
|
"attn_v", [height * nmaps], |
|
initializer=tf.random_uniform_initializer(-0.1, 0.1))) |
|
concatenated = tf.reshape(tf.concat(axis=1, values=[cell_inp, attn_res]), |
|
[batch_size, 2 * height * nmaps]) |
|
cell_inp = tf.layers.dense( |
|
concatenated, height * nmaps, name="attn_merge") |
|
output, new_state = cell(cell_inp, state) |
|
|
|
mem_loss = 0.0 |
|
if mem_size > 0: |
|
res, mask, mem_loss = memory_call( |
|
output, cur_tgt, height * nmaps, mem_size, noclass, |
|
num_gpus, self.update_mem) |
|
res = tf.gather(target_emb_weights, res) |
|
res *= tf.expand_dims(mask[:, 0], 1) |
|
output = tf.layers.dense( |
|
tf.concat(axis=1, values=[output, res]), height * nmaps, name="rnnmem") |
|
|
|
return new_state, output, mem_loss |
|
|
|
gpu_targets = tf.squeeze(gpu_target[gpu], [1]) |
|
gpu_tgt_trans = tf.transpose(gpu_targets, [1, 0]) |
|
dec_zero = tf.zeros([batch_size, 1], dtype=tf.int32) |
|
dec_inp = tf.concat(axis=1, values=[dec_zero, gpu_targets]) |
|
dec_inp = dec_inp[:, :length] |
|
embedded_dec_inp = tf.gather(target_emb_weights, dec_inp) |
|
embedded_dec_inp_proj = tf.layers.dense( |
|
embedded_dec_inp, height * nmaps, name="dec_proj") |
|
embedded_dec_inp_proj = tf.transpose(embedded_dec_inp_proj, |
|
[1, 0, 2]) |
|
init_vals = (encoder_state, |
|
tf.zeros([batch_size, height * nmaps]), 0.0) |
|
_, dec_outputs, mem_losses = tf.scan( |
|
decoder_loop_fn, (embedded_dec_inp_proj, gpu_tgt_trans), |
|
initializer=init_vals) |
|
mem_loss = tf.reduce_mean(mem_losses) |
|
outputs = tf.layers.dense(dec_outputs, nmaps, name="out_proj") |
|
|
|
outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w) |
|
outputs = tf.reshape(outputs, [length, batch_size, noclass]) |
|
gpu_out_idx.append(tf.argmax(outputs, 2)) |
|
else: |
|
|
|
enc_length = length |
|
step = enc_step(step) |
|
|
|
i = tf.constant(1) |
|
c = lambda i, _s: tf.less(i, enc_length) |
|
def enc_step_lambda(i, step): |
|
with tf.variable_scope(tf.get_variable_scope(), reuse=True): |
|
new_step = enc_step(step) |
|
return (i + 1, new_step) |
|
_, step = tf.while_loop( |
|
c, enc_step_lambda, [i, step], |
|
parallel_iterations=1, swap_memory=True) |
|
|
|
|
|
self.after_enc_step.append(step) |
|
|
|
|
|
if beam_size > 0: |
|
output_ta = tf.TensorArray( |
|
dtype=tf.float32, size=length, dynamic_size=False, |
|
infer_shape=False, name="outputs") |
|
out_idx = tf.zeros([beam_size * batch_size, length, 1], |
|
dtype=tf.int32) |
|
decided_t = tf.zeros([beam_size * batch_size, length, |
|
height, vec_size]) |
|
|
|
|
|
tgts = tf.concat(axis=1, values=[embedded_targets_tn] * beam_size) |
|
beam_cost = tf.zeros([batch_size, beam_size]) |
|
step = tf.concat(axis=0, values=[step] * beam_size) |
|
|
|
step, decided_t, output_ta, mem_loss, nupd, oi, bc = dec_step( |
|
step, 0, 0, decided_t, output_ta, tgts, 0.0, 0, out_idx, |
|
beam_cost) |
|
tf.get_variable_scope().reuse_variables() |
|
|
|
def step_lambda(i, step, dec_t, out_ta, ml, nu, oi, bc): |
|
with tf.variable_scope(tf.get_variable_scope(), reuse=True): |
|
s, d, t, nml, nu, oi, bc = dec_step( |
|
step, i, 1, dec_t, out_ta, tgts, ml, nu, oi, bc) |
|
return (i + 1, s, d, t, nml, nu, oi, bc) |
|
i = tf.constant(1) |
|
c = lambda i, _s, _d, _o, _ml, _nu, _oi, _bc: tf.less(i, length) |
|
_, step, _, output_ta, mem_loss, nupd, out_idx, _ = tf.while_loop( |
|
c, step_lambda, |
|
[i, step, decided_t, output_ta, mem_loss, nupd, oi, bc], |
|
parallel_iterations=1, swap_memory=True) |
|
|
|
gpu_out_idx.append(tf.squeeze(out_idx, [2])) |
|
outputs = output_ta.stack() |
|
outputs = tf.squeeze(outputs, [2, 3]) |
|
else: |
|
|
|
mem_loss = 0.0 |
|
outputs = tf.transpose(step[:, :, 1, :], [1, 0, 2]) |
|
gpu_out_idx.append(tf.argmax(outputs, 2)) |
|
|
|
|
|
outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w) |
|
outputs = tf.reshape(outputs, [length, batch_size, noclass]) |
|
gpu_outputs[gpu] = tf.nn.softmax(outputs) |
|
|
|
|
|
targets_soft = make_dense(tf.squeeze(gpu_target[gpu], [1]), |
|
noclass, 0.1) |
|
targets_soft = tf.reshape(targets_soft, [-1, noclass]) |
|
targets_hard = make_dense(tf.squeeze(gpu_target[gpu], [1]), |
|
noclass, 0.0) |
|
targets_hard = tf.reshape(targets_hard, [-1, noclass]) |
|
output = tf.transpose(outputs, [1, 0, 2]) |
|
xent_soft = tf.reshape(tf.nn.softmax_cross_entropy_with_logits( |
|
logits=tf.reshape(output, [-1, noclass]), labels=targets_soft), |
|
[batch_size, length]) |
|
xent_hard = tf.reshape(tf.nn.softmax_cross_entropy_with_logits( |
|
logits=tf.reshape(output, [-1, noclass]), labels=targets_hard), |
|
[batch_size, length]) |
|
low, high = 0.1 / float(noclass - 1), 0.9 |
|
const = high * tf.log(high) + float(noclass - 1) * low * tf.log(low) |
|
weight_sum = tf.reduce_sum(weights) + 1e-20 |
|
true_perp = tf.reduce_sum(xent_hard * weights) / weight_sum |
|
soft_loss = tf.reduce_sum(xent_soft * weights) / weight_sum |
|
perp_loss = soft_loss + const |
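        # `const` equals sum_i p_i * log(p_i) for the smoothed target
        # distribution (its negative entropy), so perp_loss is effectively the
        # KL divergence between the smoothed targets and the model predictions.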
|
|
|
mem_loss = 0.5 * tf.reduce_mean(mem_loss) / length_float |
|
total_loss = perp_loss + mem_loss |
|
gpu_losses[gpu].append(true_perp) |
|
|
|
|
|
if backward: |
|
data.print_out("Creating backward pass for the model.") |
|
grads = tf.gradients( |
|
total_loss, tf.trainable_variables(), |
|
colocate_gradients_with_ops=True) |
|
for g_i, g in enumerate(grads): |
|
if isinstance(g, tf.IndexedSlices): |
|
grads[g_i] = tf.convert_to_tensor(g) |
|
grads, norm = tf.clip_by_global_norm(grads, max_grad_norm) |
|
gpu_grad_norms[gpu].append(norm) |
|
for g in grads: |
|
if grad_noise_scale > 0.001: |
|
g += tf.truncated_normal(tf.shape(g)) * self.noise_param |
|
grads_list.append(grads) |
|
else: |
|
gpu_grad_norms[gpu].append(0.0) |
|
data.print_out("Created model for gpu %d in %.2f s." |
|
% (gpu, time.time() - start_time)) |
|
|
|
self.updates = [] |
|
self.after_enc_step = tf.concat(axis=0, values=self.after_enc_step) |
|
if backward: |
|
tf.get_variable_scope()._reuse = False |
|
tf.get_variable_scope().set_caching_device(None) |
|
grads = [gpu_avg([grads_list[g][i] for g in xrange(num_gpus)]) |
|
for i in xrange(len(grads_list[0]))] |
|
update = adam_update(grads) |
|
self.updates.append(update) |
|
else: |
|
self.updates.append(tf.no_op()) |
|
|
|
self.losses = [gpu_avg([gpu_losses[g][i] for g in xrange(num_gpus)]) |
|
for i in xrange(len(gpu_losses[0]))] |
|
self.out_idx = tf.concat(axis=0, values=gpu_out_idx) |
|
self.grad_norms = [gpu_avg([gpu_grad_norms[g][i] for g in xrange(num_gpus)]) |
|
for i in xrange(len(gpu_grad_norms[0]))] |
|
    self.outputs = [tf.concat(
        axis=1, values=[gpu_outputs[g] for g in xrange(num_gpus)])]
|
self.quantize_op = quantize_weights_op(512, 8) |
|
if backward: |
|
self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10) |
|
|
|
def step(self, sess, inp, target, do_backward_in, noise_param=None, |
|
beam_size=2, eos_id=2, eos_cost=0.0, update_mem=None, state=None): |
|
"""Run a step of the network.""" |
|
batch_size, height, length = inp.shape[0], inp.shape[1], inp.shape[2] |
|
do_backward = do_backward_in |
|
train_mode = True |
|
if do_backward_in is None: |
|
do_backward = False |
|
train_mode = False |
|
if update_mem is None: |
|
update_mem = do_backward |
|
feed_in = {} |
|
|
|
if state is None: |
|
state = np.zeros([batch_size, length, height, self.nmaps]) |
|
feed_in[self.prev_step.name] = state |
|
feed_in[self.length_tensor.name] = length |
|
feed_in[self.noise_param.name] = noise_param if noise_param else 0.0 |
|
feed_in[self.do_training.name] = 1.0 if do_backward else 0.0 |
|
feed_in[self.update_mem.name] = 1 if update_mem else 0 |
|
if do_backward_in is False: |
|
feed_in[self.sampling.name] = 0.0 |
|
index = 0 |
|
feed_out = [] |
|
if do_backward: |
|
feed_out.append(self.updates[index]) |
|
feed_out.append(self.grad_norms[index]) |
|
if train_mode: |
|
feed_out.append(self.losses[index]) |
|
feed_in[self.input.name] = inp |
|
feed_in[self.target.name] = target |
|
feed_out.append(self.outputs[index]) |
|
if train_mode: |
|
|
|
res = sess.run([self.after_enc_step] + feed_out, feed_in) |
|
after_enc_state, res = res[0], res[1:] |
|
else: |
|
|
|
feed_in[self.sampling.name] = 1.1 |
|
res = sess.run([self.after_enc_step, self.out_idx] + feed_out, feed_in) |
|
after_enc_state, out_idx = res[0], res[1] |
|
res = [res[2][l] for l in xrange(length)] |
|
outputs = [out_idx[:, i] for i in xrange(length)] |
|
cost = [0.0 for _ in xrange(beam_size * batch_size)] |
|
seen_eos = [0 for _ in xrange(beam_size * batch_size)] |
|
for idx, logit in enumerate(res): |
|
best = outputs[idx] |
|
for b in xrange(batch_size): |
|
if seen_eos[b] > 1: |
|
cost[b] -= eos_cost |
|
else: |
|
cost[b] += np.log(logit[b][best[b]]) |
|
if best[b] in [eos_id]: |
|
seen_eos[b] += 1 |
|
res = [[-c for c in cost]] + outputs |
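      # res[0] now holds per-example costs: the negative accumulated
      # log-probability of the emitted symbols, plus eos_cost for each step
      # taken after a second EOS; the remaining entries are the per-position
      # output symbols.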
|
|
|
offset = 0 |
|
norm = None |
|
if do_backward: |
|
offset = 2 |
|
norm = res[1] |
|
if train_mode: |
|
outputs = res[offset + 1] |
|
outputs = [outputs[l] for l in xrange(length)] |
|
return res[offset], outputs, norm, after_enc_state |
|
|