# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for moe.py.""" | |
import numpy as np | |
import tensorflow as tf, tf_keras | |
from official.nlp.modeling.layers import moe | |


def small_config():
  """Creates a small model config that can be used by all tests."""
  config = {}
  config['d_ff'] = 32
  config['output_dropout'] = 0.1
  config['num_experts'] = 2
  config['expert_d_ff'] = 33
  config['expert_dropout_rate'] = 0.1
  config['jitter_noise'] = 0.1
  config['train_capacity_factor'] = 1.0
  config['eval_capacity_factor'] = 1.0
  config['examples_per_group'] = 2.0
  config['backbone_d_ff'] = 13
  return config


def make_input_ones(batch_size: int = 4,
                    seq_length: int = 10,
                    hidden_dim: int = 7) -> tf.Tensor:
  return tf.ones((batch_size, seq_length, hidden_dim))


def make_experts_input_ones(num_groups: int = 1,
                            num_experts: int = 2,
                            expert_capacity: int = 5,
                            hidden_dim: int = 7) -> tf.Tensor:
  return tf.ones((num_groups, num_experts, expert_capacity, hidden_dim))


class MoeTest(tf.test.TestCase):

  def tearDown(self):
    super().tearDown()
    tf_keras.mixed_precision.set_global_policy('float32')

  def test_router_z_loss_dtype(self):
    x = tf.constant([[[10.0, 5.0]]], dtype=tf.float32)
    y = moe._router_z_loss(x)
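    # The router z-loss averages logsumexp(logits)**2 over tokens. For the
    # single token here, logsumexp([10, 5]) = log(e**10 + e**5)
    # = 5 + log(e**5 + 1).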
    expected = (5 + np.log(np.exp(5) + 1))**2
    self.assertAllClose(expected, y, atol=1e-7)
    self.assertDTypeEqual(y, tf.float32)

  def test_router_z_loss_shape(self):
    x = make_input_ones(2, 5, 7)
    y = moe._router_z_loss(x)
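    # Every token carries seven unit logits, so logsumexp = 1 + log(7) and the
    # mean squared value is (1 + log(7))**2.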
    expected = (np.log(7) + 1)**2
    self.assertAllClose(expected, y, atol=1e-7)

  def test_experts_choose_masked_router_dtype_shape(self):
    tf_keras.mixed_precision.set_global_policy('mixed_bfloat16')
    num_groups = 2
    tokens_per_group = 3
    hidden_dim = tokens_per_group
    num_experts = tokens_per_group
    expert_capacity = 2
    x = np.zeros([num_groups, tokens_per_group, hidden_dim])
    x[0, 0, 0] += 1
    x[0, :2, :2] += 1
    x[1, 1:, 1:] += 1
    x[1, -1, -1] += 1
    router = moe.ExpertsChooseMaskedRouter(
        num_experts=num_experts,
        jitter_noise=0.1,
        use_bias=True,
        kernel_initializer=tf_keras.initializers.get('identity'),
        bias_initializer=tf_keras.initializers.get('ones'))
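    # In "experts choose" routing, each expert independently selects its
    # top-`expert_capacity` tokens by router probability, so a token may be
    # processed by several experts or by none.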
    router_mask = router(x, expert_capacity=expert_capacity, training=False)
    self.assertDTypeEqual(router_mask.dispatch_mask, tf.bfloat16)
    self.assertDTypeEqual(router_mask.combine_array, tf.bfloat16)
    expect_shape = [num_groups, tokens_per_group, num_experts, expert_capacity]
    self.assertEqual(expect_shape, router_mask.dispatch_mask.shape)
    self.assertEqual(expect_shape, router_mask.combine_array.shape)
    # The top_k output may not be sorted, so we can't compare the dispatch
    # mask directly. Check that it contains only 0s and 1s.
    out_dm = router_mask.dispatch_mask.numpy()
    self.assertSetEqual({0, 1}, set(out_dm.flatten().astype(np.int32)))
    # Check that the right tokens were selected.
    out_dm_indices = np.dot(
        out_dm.transpose((0, 2, 3, 1)), np.arange(tokens_per_group))
    # Shape [num_groups, num_experts, expert_capacity].
    self.assertSetEqual({0, 1}, set(out_dm_indices[0, 0, :].astype(np.int32)))
    self.assertSetEqual({1, 2}, set(out_dm_indices[0, 1, :].astype(np.int32)))
    self.assertSetEqual({1, 2}, set(out_dm_indices[0, 2, :].astype(np.int32)))
    self.assertSetEqual({0, 1}, set(out_dm_indices[1, 0, :].astype(np.int32)))
    self.assertSetEqual({0, 1}, set(out_dm_indices[1, 1, :].astype(np.int32)))
    self.assertSetEqual({1, 2}, set(out_dm_indices[1, 2, :].astype(np.int32)))
    out_ca = router_mask.combine_array.numpy()
    out_ca = np.dot(out_ca, np.ones((expert_capacity,)))
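    # With an identity kernel and unit bias, the router logits for a token are
    # x + 1; e.g. token (0, 0) = [2, 1, 0] gives softmax probabilities of
    # roughly [0.66, 0.24, 0.09]. A dispatched token's combine weight is its
    # router probability, and tokens an expert did not choose contribute 0.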
    expected_combine_array = np.array([[[0.66, 0.0, 0.0], [0.42, 0.42, 0.16],
                                        [0.0, 0.33, 0.33]],
                                       [[0.33, 0.33, 0.0], [0.16, 0.42, 0.42],
                                        [0.0, 0.0, 0.66]]])
    self.assertAllClose(expected_combine_array, out_ca, atol=1e-2)

  def test_feed_forward_shape_and_vars(self):
    config = small_config()
    layer = moe.FeedForward(
        d_ff=config['d_ff'], output_dropout=config['output_dropout'])
    inputs = make_input_ones()
    outputs = layer(inputs)
    self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))
    var_names = sorted([v.name for v in layer.trainable_variables])
    self.assertAllEqual([
        'feed_forward/intermediate/bias:0',
        'feed_forward/intermediate/kernel:0', 'feed_forward/output/bias:0',
        'feed_forward/output/kernel:0'
    ], var_names)

  def test_feed_forward_manual(self):
    config = small_config()
    layer = moe.FeedForward(
        d_ff=config['d_ff'],
        output_dropout=config['output_dropout'],
        activation=tf_keras.activations.relu,
        kernel_initializer=tf_keras.initializers.get('ones'),
        bias_initializer=tf_keras.initializers.get('ones'))
    inputs = make_input_ones(1, 2, 3)
    outputs = layer(inputs, training=False)
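    # With all-ones kernels and biases, each of the d_ff = 32 intermediate
    # units is relu(1 + 1 + 1 + 1) = 4, so every output unit is
    # 32 * 4 + 1 = 129.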
    manual_outputs = tf.constant([[[129.0, 129.0, 129.0],
                                   [129.0, 129.0, 129.0]]])
    self.assertAllClose(manual_outputs, outputs, atol=1e-7)

  def test_feed_forward_experts_shape_and_vars(self):
    config = small_config()
    layer = moe.FeedForwardExperts(
        num_experts=config['num_experts'],
        d_ff=config['expert_d_ff'],
        output_dropout=config['expert_dropout_rate'])
    inputs = make_experts_input_ones()
    outputs = layer(inputs)
    self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))
    var_names = sorted([v.name for v in layer.trainable_variables])
    self.assertAllEqual([
        'experts/intermediate/bias:0', 'experts/intermediate/kernel:0',
        'experts/output/bias:0', 'experts/output/kernel:0'
    ], var_names)

  def test_feed_forward_experts_manual(self):
    config = small_config()
    layer = moe.FeedForwardExperts(
        num_experts=1,
        d_ff=config['expert_d_ff'],
        output_dropout=config['expert_dropout_rate'],
        activation=tf_keras.activations.relu,
        kernel_initializer=tf_keras.initializers.get('ones'),
        bias_initializer=tf_keras.initializers.get('ones'))
    inputs = make_experts_input_ones(1, 1, 2, 3)
    outputs = layer(inputs, training=False)
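    # Same arithmetic as the dense case, but with expert_d_ff = 33
    # intermediate units: every output unit is 33 * relu(3 + 1) + 1 = 133.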
    manual_outputs = tf.constant([[[[133.0, 133.0, 133.0],
                                    [133.0, 133.0, 133.0]]]])
    self.assertAllClose(manual_outputs, outputs, atol=1e-7)

  def test_moe_layer(self):
    config = small_config()
    experts = moe.FeedForwardExperts(
        num_experts=config['num_experts'],
        d_ff=config['expert_d_ff'],
        output_dropout=config['expert_dropout_rate'])
    router = moe.ExpertsChooseMaskedRouter(
        config['num_experts'], jitter_noise=config['jitter_noise'])
    moe_layer = moe.MoeLayer(
        experts,
        router,
        train_capacity_factor=config['train_capacity_factor'],
        eval_capacity_factor=config['eval_capacity_factor'],
        examples_per_group=config['examples_per_group'])
    inputs = make_input_ones()
    outputs = moe_layer(inputs, training=True)
    self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))
    var_names = sorted([v.name for v in moe_layer.trainable_variables])
    self.assertAllEqual([
        'moe/experts/intermediate/bias:0', 'moe/experts/intermediate/kernel:0',
        'moe/experts/output/bias:0', 'moe/experts/output/kernel:0',
        'moe/router/router_weights/bias:0', 'moe/router/router_weights/kernel:0'
    ], var_names)
    self.assertLen(moe_layer.losses, 1)
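    # The layer surfaces its auxiliary (load balancing and router z) losses as
    # a single combined loss, while reporting the individual terms and routing
    # diagnostics as metrics.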
    metrics = [metric.name for metric in moe_layer.metrics]
    self.assertSetEqual(
        {
            'router_z_loss', 'unscaled_router_z_loss', 'load_balancing_loss',
            'fraction_tokens_left_behind', 'router_confidence', 'expert_usage'
        }, set(metrics))

  def test_moe_layer_with_backbone(self):
    config = small_config()
    experts = moe.FeedForwardExperts(
        num_experts=config['num_experts'],
        d_ff=config['expert_d_ff'],
        output_dropout=config['expert_dropout_rate'])
    router = moe.ExpertsChooseMaskedRouter(
        config['num_experts'], jitter_noise=config['jitter_noise'])
    moe_layer = moe.MoeLayer(
        experts,
        router,
        train_capacity_factor=config['train_capacity_factor'],
        eval_capacity_factor=config['eval_capacity_factor'],
        examples_per_group=config['examples_per_group'])
    layer = moe.MoeLayerWithBackbone(moe_layer, config['backbone_d_ff'])
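    # MoeLayerWithBackbone pairs the sparse MoE layer with a small dense
    # feed-forward "backbone" of inner dimension backbone_d_ff = 13.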
    inputs = make_input_ones()
    outputs = layer(inputs)
    self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))


if __name__ == '__main__':
  tf.test.main()