# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for moe.py."""
import numpy as np
import tensorflow as tf, tf_keras
from official.nlp.modeling.layers import moe


def small_config():
  """Creates a small model config that can be used by all tests."""
  config = {}
  config['d_ff'] = 32
  config['output_dropout'] = 0.1
  config['num_experts'] = 2
  config['expert_d_ff'] = 33
  config['expert_dropout_rate'] = 0.1
  config['jitter_noise'] = 0.1
  config['train_capacity_factor'] = 1.0
  config['eval_capacity_factor'] = 1.0
  config['examples_per_group'] = 2.0
  config['backbone_d_ff'] = 13
  return config


def make_input_ones(batch_size: int = 4,
                    seq_length: int = 10,
                    hidden_dim: int = 7) -> tf.Tensor:
  return tf.ones((batch_size, seq_length, hidden_dim))


def make_experts_input_ones(num_groups: int = 1,
                            num_experts: int = 2,
                            expert_capacity: int = 5,
                            hidden_dim: int = 7) -> tf.Tensor:
  return tf.ones((num_groups, num_experts, expert_capacity, hidden_dim))


class MoeTest(tf.test.TestCase):

  def tearDown(self):
    super().tearDown()
    tf_keras.mixed_precision.set_global_policy('float32')

  def test_router_z_loss_dtype(self):
    x = tf.constant([[[10.0, 5.0]]], dtype=tf.float32)
    y = moe._router_z_loss(x)
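    # The z-loss squares the logsumexp of the router logits, as the expected
    # value below reflects: logsumexp([10, 5]) = 5 + log(e**5 + 1).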
    expected = (5 + np.log(np.exp(5) + 1))**2
    self.assertAllClose(expected, y, atol=1e-7)
    self.assertDTypeEqual(y, tf.float32)

  def test_router_z_loss_shape(self):
    x = make_input_ones(2, 5, 7)
    y = moe._router_z_loss(x)
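    # All-ones logits over the last dimension of size 7 give
    # logsumexp(ones(7)) = log(7 * e) = log(7) + 1 for every token.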
    expected = (np.log(7) + 1)**2
    self.assertAllClose(expected, y, atol=1e-7)

  def test_experts_choose_masked_router_dtype_shape(self):
    tf_keras.mixed_precision.set_global_policy('mixed_bfloat16')
    num_groups = 2
    tokens_per_group = 3
    hidden_dim = tokens_per_group
    num_experts = tokens_per_group
    expert_capacity = 2

    x = np.zeros([num_groups, tokens_per_group, hidden_dim])
    x[0, 0, 0] += 1
    x[0, :2, :2] += 1
    x[1, 1:, 1:] += 1
    x[1, -1, -1] += 1

    router = moe.ExpertsChooseMaskedRouter(
        num_experts=num_experts,
        jitter_noise=0.1,
        use_bias=True,
        kernel_initializer=tf_keras.initializers.get('identity'),
        bias_initializer=tf_keras.initializers.get('ones'))
    router_mask = router(x, expert_capacity=expert_capacity, training=False)
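    # With an identity kernel and all-ones bias the router logits are simply
    # x + 1, so each expert should select its top-2 tokens by softmax
    # probability over the experts axis.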
    self.assertDTypeEqual(router_mask.dispatch_mask, tf.bfloat16)
    self.assertDTypeEqual(router_mask.combine_array, tf.bfloat16)

    expect_shape = [num_groups, tokens_per_group, num_experts, expert_capacity]
    self.assertEqual(expect_shape, router_mask.dispatch_mask.shape)
    self.assertEqual(expect_shape, router_mask.combine_array.shape)

    # The top_k call may not return sorted results, so the output cannot be
    # compared directly. Check that the output contains only 0s and 1s.
    out_dm = router_mask.dispatch_mask.numpy()
    self.assertSetEqual({0, 1}, set(out_dm.flatten().astype(np.int32)))

    # Check that the right tokens were selected.
    out_dm_indices = np.dot(
        out_dm.transpose((0, 2, 3, 1)), np.arange(tokens_per_group))
    # Shape: [num_groups, num_experts, expert_capacity].
    self.assertSetEqual({0, 1}, set(out_dm_indices[0, 0, :].astype(np.int32)))
    self.assertSetEqual({1, 2}, set(out_dm_indices[0, 1, :].astype(np.int32)))
    self.assertSetEqual({1, 2}, set(out_dm_indices[0, 2, :].astype(np.int32)))
    self.assertSetEqual({0, 1}, set(out_dm_indices[1, 0, :].astype(np.int32)))
    self.assertSetEqual({0, 1}, set(out_dm_indices[1, 1, :].astype(np.int32)))
    self.assertSetEqual({1, 2}, set(out_dm_indices[1, 2, :].astype(np.int32)))
    out_ca = router_mask.combine_array.numpy()
    out_ca = np.dot(out_ca, np.ones((expert_capacity,)))
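    # Summing over the capacity axis leaves one combine weight per
    # (group, token, expert): the router's softmax probability for the tokens
    # each expert selected, and zero elsewhere (e.g. softmax([3, 2, 1])[0]
    # is roughly 0.66 for the first token of the first group).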
    expected_combine_array = np.array([[[0.66, 0.0, 0.0], [0.42, 0.42, 0.16],
                                        [0.0, 0.33, 0.33]],
                                       [[0.33, 0.33, 0.0], [0.16, 0.42, 0.42],
                                        [0.0, 0.0, 0.66]]])
    self.assertAllClose(expected_combine_array, out_ca, atol=1e-2)

  def test_feed_forward_shape_and_vars(self):
    config = small_config()
    layer = moe.FeedForward(
        d_ff=config['d_ff'], output_dropout=config['output_dropout'])
    inputs = make_input_ones()
    outputs = layer(inputs)
    self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))

    var_names = sorted([v.name for v in layer.trainable_variables])
    self.assertAllEqual([
        'feed_forward/intermediate/bias:0',
        'feed_forward/intermediate/kernel:0', 'feed_forward/output/bias:0',
        'feed_forward/output/kernel:0'
    ], var_names)

  def test_feed_forward_manual(self):
    config = small_config()
    layer = moe.FeedForward(
        d_ff=config['d_ff'],
        output_dropout=config['output_dropout'],
        activation=tf_keras.activations.relu,
        kernel_initializer=tf_keras.initializers.get('ones'),
        bias_initializer=tf_keras.initializers.get('ones'))
    inputs = make_input_ones(1, 2, 3)
    outputs = layer(inputs, training=False)
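    # With all-ones kernels and biases, each of the 32 intermediate units is
    # relu(1 * 3 + 1) = 4, and each output unit is 4 * 32 + 1 = 129.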
    manual_outputs = tf.constant([[[129.0, 129.0, 129.0],
                                   [129.0, 129.0, 129.0]]])
    self.assertAllClose(manual_outputs, outputs, atol=1e-7)

  def test_feed_forward_experts_shape_and_vars(self):
    config = small_config()
    layer = moe.FeedForwardExperts(
        num_experts=config['num_experts'],
        d_ff=config['expert_d_ff'],
        output_dropout=config['expert_dropout_rate'])
    inputs = make_experts_input_ones()
    outputs = layer(inputs)
    self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))

    var_names = sorted([v.name for v in layer.trainable_variables])
    self.assertAllEqual([
        'experts/intermediate/bias:0', 'experts/intermediate/kernel:0',
        'experts/output/bias:0', 'experts/output/kernel:0'
    ], var_names)

  def test_feed_forward_experts_manual(self):
    config = small_config()
    layer = moe.FeedForwardExperts(
        num_experts=1,
        d_ff=config['expert_d_ff'],
        output_dropout=config['expert_dropout_rate'],
        activation=tf_keras.activations.relu,
        kernel_initializer=tf_keras.initializers.get('ones'),
        bias_initializer=tf_keras.initializers.get('ones'))
    inputs = make_experts_input_ones(1, 1, 2, 3)
    outputs = layer(inputs, training=False)
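    # Same arithmetic as the dense case but with expert_d_ff = 33:
    # each intermediate unit is relu(1 * 3 + 1) = 4, and each output unit
    # is 4 * 33 + 1 = 133.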
    manual_outputs = tf.constant([[[[133.0, 133.0, 133.0],
                                    [133.0, 133.0, 133.0]]]])
    self.assertAllClose(manual_outputs, outputs, atol=1e-7)

  def test_moe_layer(self):
    config = small_config()
    experts = moe.FeedForwardExperts(
        num_experts=config['num_experts'],
        d_ff=config['expert_d_ff'],
        output_dropout=config['expert_dropout_rate'])
    router = moe.ExpertsChooseMaskedRouter(
        config['num_experts'], jitter_noise=config['jitter_noise'])
    moe_layer = moe.MoeLayer(
        experts,
        router,
        train_capacity_factor=config['train_capacity_factor'],
        eval_capacity_factor=config['eval_capacity_factor'],
        examples_per_group=config['examples_per_group'])
    inputs = make_input_ones()
    outputs = moe_layer(inputs, training=True)
    self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))

    var_names = sorted([v.name for v in moe_layer.trainable_variables])
    self.assertAllEqual([
        'moe/experts/intermediate/bias:0', 'moe/experts/intermediate/kernel:0',
        'moe/experts/output/bias:0', 'moe/experts/output/kernel:0',
        'moe/router/router_weights/bias:0', 'moe/router/router_weights/kernel:0'
    ], var_names)
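    # The layer is expected to register exactly one auxiliary loss; the
    # diagnostic metrics below track the router losses and expert usage.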
    self.assertLen(moe_layer.losses, 1)

    metrics = [metric.name for metric in moe_layer.metrics]
    self.assertSetEqual(
        {
            'router_z_loss', 'unscaled_router_z_loss', 'load_balancing_loss',
            'fraction_tokens_left_behind', 'router_confidence', 'expert_usage'
        }, set(metrics))

  def test_moe_layer_with_backbone(self):
    config = small_config()
    experts = moe.FeedForwardExperts(
        num_experts=config['num_experts'],
        d_ff=config['expert_d_ff'],
        output_dropout=config['expert_dropout_rate'])
    router = moe.ExpertsChooseMaskedRouter(
        config['num_experts'], jitter_noise=config['jitter_noise'])
    moe_layer = moe.MoeLayer(
        experts,
        router,
        train_capacity_factor=config['train_capacity_factor'],
        eval_capacity_factor=config['eval_capacity_factor'],
        examples_per_group=config['examples_per_group'])
    layer = moe.MoeLayerWithBackbone(moe_layer, config['backbone_d_ff'])
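    # MoeLayerWithBackbone appears to pair the sparse MoE layer with a dense
    # feed-forward "backbone" branch (backbone_d_ff = 13 here); this test only
    # checks that the output shape matches the input shape.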
    inputs = make_input_ones()
    outputs = layer(inputs)
    self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))


if __name__ == '__main__':
  tf.test.main()