# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for nn_blocks."""

from typing import Any, Iterable, Tuple

# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf, tf_keras

from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling.layers import nn_blocks
from official.vision.modeling.layers import nn_layers


def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]:
  """Returns the combinations of end-to-end tests to run."""
  return combinations.combine(
      distribution=[
          strategy_combinations.default_strategy,
          strategy_combinations.cloud_tpu_strategy,
          strategy_combinations.one_device_strategy_gpu,
      ],
  )


class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      (nn_blocks.ResidualBlock, 1, False, 0.0, None),
      (nn_blocks.ResidualBlock, 2, True, 0.2, 0.25),
  )
  def test_residual_block_creation(self, block_fn, strides, use_projection,
                                   stochastic_depth_drop_rate, se_ratio):
    input_size = 128
    filter_size = 256
    inputs = tf_keras.Input(
        shape=(input_size, input_size, filter_size), batch_size=1)
    block = block_fn(
        filter_size,
        strides,
        use_projection=use_projection,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate,
    )

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, filter_size],
        features.shape.as_list())

  def test_layerscale_call(self):
    # Set up test inputs
    input_shape = (2, 3, 4)
    init_values = 1e-4
    inputs = tf.ones(input_shape, dtype=tf.float32)

    # Instantiate LayerScale object
    layer_scale = nn_blocks.LayerScale(init_values)

    # Call LayerScale object on test inputs
    output = layer_scale(inputs)

    # Check output shape
    expected_output_shape = input_shape
    self.assertAllEqual(output.shape, expected_output_shape)

    # Check that output values are correct
    expected_output_values = init_values * np.ones(input_shape)
    self.assertAllClose(
        output.numpy(), expected_output_values, rtol=1e-5, atol=1e-5)

  def test_layerscale_training(self):
    # Verify that the gamma values change from their initial values after one
    # training step.
    # Set up test inputs
    input_shape = (1, 3, 4)
    init_values = 1e-4
    inputs = tf.ones(input_shape, dtype=tf.float32)
    targets = tf.ones(input_shape, dtype=tf.float32)

    # Instantiate LayerScale object
    layer_scale = nn_blocks.LayerScale(init_values)

    # Define optimizer and loss function
    optimizer = tf_keras.optimizers.Adam()
    loss_fn = tf_keras.losses.MeanSquaredError()

    # Train the model for one step
    with tf.GradientTape() as tape:
      predictions = layer_scale(inputs)
      loss = loss_fn(targets, predictions)
    grads = tape.gradient(loss, layer_scale.trainable_variables)
    optimizer.apply_gradients(zip(grads, layer_scale.trainable_variables))

    # Check that gamma values have changed
    updated_gamma = layer_scale.gamma.numpy()[0, 0, 0]
    self.assertNotEqual(updated_gamma, init_values)

  @parameterized.parameters(
      (nn_blocks.BottleneckBlock, 1, False, 0.0, None),
      (nn_blocks.BottleneckBlock, 2, True, 0.2, 0.25),
  )
  def test_bottleneck_block_creation(self, block_fn, strides, use_projection,
                                     stochastic_depth_drop_rate, se_ratio):
    input_size = 128
    filter_size = 256
    inputs = tf_keras.Input(
        shape=(input_size, input_size, filter_size * 4), batch_size=1)
    block = block_fn(
        filter_size,
        strides,
        use_projection=use_projection,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate)

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, filter_size * 4],
        features.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, None),
      (nn_blocks.InvertedBottleneckBlock, 6, 1, None, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 2, None, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 1, 0.2, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, 0.2),
  )
  def test_invertedbottleneck_block_creation(
      self, block_fn, expand_ratio, strides, se_ratio,
      stochastic_depth_drop_rate):
    input_size = 128
    in_filters = 24
    out_filters = 40
    inputs = tf_keras.Input(
        shape=(input_size, input_size, in_filters), batch_size=1)
    block = block_fn(
        in_filters=in_filters,
        out_filters=out_filters,
        expand_ratio=expand_ratio,
        strides=strides,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate)

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, out_filters],
        features.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.TuckerConvBlock, 1, 0.25, 0.25),
      (nn_blocks.TuckerConvBlock, 2, 0.25, 0.25),
  )
  def test_tucker_conv_block(self, block_fn, strides, input_compression_ratio,
                             output_compression_ratio):
    input_size = 128
    in_filters = 24
    out_filters = 24
    inputs = tf_keras.Input(
        shape=(input_size, input_size, in_filters), batch_size=1)
    block = block_fn(
        in_filters=in_filters,
        out_filters=out_filters,
        input_compression_ratio=input_compression_ratio,
        output_compression_ratio=output_compression_ratio,
        strides=strides)

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, out_filters],
        features.shape.as_list())


class ResidualInnerTest(parameterized.TestCase, tf.test.TestCase):

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    bsz, h, w, c = 8, 32, 32, 32
    filters = 64
    strides = 2

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      test_layer = nn_blocks.ResidualInner(filters, strides)

    output = test_layer(input_tensor)
    expected_output_shape = [bsz, h // strides, w // strides, filters]
    self.assertEqual(expected_output_shape, output.shape.as_list())


class BottleneckResidualInnerTest(parameterized.TestCase,
                                  tf.test.TestCase):

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    bsz, h, w, c = 8, 32, 32, 32
    filters = 64
    strides = 2

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      test_layer = nn_blocks.BottleneckResidualInner(filters, strides)

    output = test_layer(input_tensor)
    expected_output_shape = [bsz, h // strides, w // strides, filters * 4]
    self.assertEqual(expected_output_shape, output.shape.as_list())


class DepthwiseSeparableConvBlockTest(parameterized.TestCase,
                                      tf.test.TestCase):

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    batch_size, height, width, num_channels = 8, 32, 32, 32
    num_filters = 64
    strides = 2

    input_tensor = tf.random.normal(
        shape=[batch_size, height, width, num_channels])
    with distribution.scope():
      block = nn_blocks.DepthwiseSeparableConvBlock(
          num_filters, strides=strides)
      config_dict = block.get_config()
      recreate_block = nn_blocks.DepthwiseSeparableConvBlock(**config_dict)

    output_tensor = block(input_tensor)
    expected_output_shape = [
        batch_size, height // strides, width // strides, num_filters
    ]
    self.assertEqual(output_tensor.shape.as_list(), expected_output_shape)

    output_tensor = recreate_block(input_tensor)
    self.assertEqual(output_tensor.shape.as_list(), expected_output_shape)


class ReversibleLayerTest(parameterized.TestCase, tf.test.TestCase):

  @combinations.generate(distribution_strategy_combinations())
  def test_downsampling_non_reversible_step(self, distribution):
    bsz, h, w, c = 8, 32, 32, 32
    filters = 64
    strides = 2

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      f = nn_blocks.ResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=True)
      g = nn_blocks.ResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=True)
      test_layer = nn_blocks.ReversibleLayer(f, g)
      test_layer.build(input_tensor.shape)
      optimizer = tf_keras.optimizers.SGD(learning_rate=0.01)

    @tf.function
    def step_fn():
      with tf.GradientTape() as tape:
        output = test_layer(input_tensor, training=True)
      grads = tape.gradient(output, test_layer.trainable_variables)
      # Test applying gradients with optimizer works
      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))

      return output

    replica_output = distribution.run(step_fn)
    outputs = distribution.experimental_local_results(replica_output)

    # Assert forward pass shape
    expected_output_shape = [bsz, h // strides, w // strides, filters]
    for output in outputs:
      self.assertEqual(expected_output_shape, output.shape.as_list())

  @combinations.generate(distribution_strategy_combinations())
  def test_reversible_step(self, distribution):
    # Reversible layers satisfy: (a) strides = 1 (b) in_filter = out_filter
    bsz, h, w, c = 8, 32, 32, 32
    filters = c
    strides = 1

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      f = nn_blocks.ResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=False)
      g = nn_blocks.ResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=False)
      test_layer = nn_blocks.ReversibleLayer(f, g)
      test_layer(input_tensor, training=False)  # init weights
      optimizer = tf_keras.optimizers.SGD(learning_rate=0.01)

    @tf.function
    def step_fn():
      with tf.GradientTape() as tape:
        output = test_layer(input_tensor, training=True)
      grads = tape.gradient(output, test_layer.trainable_variables)
      # Test applying gradients with optimizer works
      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))

      return output

    @tf.function
    def fwd():
      test_layer(input_tensor)

    distribution.run(fwd)  # Initialize variables
    prev_variables = tf.identity_n(test_layer.trainable_variables)
    replica_output = distribution.run(step_fn)
    outputs = distribution.experimental_local_results(replica_output)

    # Assert that the variable values have changed.
    for v0, v1 in zip(prev_variables, test_layer.trainable_variables):
      self.assertNotAllEqual(v0, v1)

    # Assert forward pass shape
    expected_output_shape = [bsz, h // strides, w // strides, filters]
    for output in outputs:
      self.assertEqual(expected_output_shape, output.shape.as_list())

  @combinations.generate(distribution_strategy_combinations())
  def test_manual_gradients_correctness(self, distribution):
    bsz, h, w, c = 8, 32, 32, 32
    filters = c
    strides = 1

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c * 4])  # bottleneck
    with distribution.scope():
      f_manual = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=False)
      g_manual = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=False)
      manual_grad_layer = nn_blocks.ReversibleLayer(f_manual, g_manual)
      manual_grad_layer(input_tensor, training=False)  # init weights

      f_auto = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=False)
      g_auto = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=False)
      auto_grad_layer = nn_blocks.ReversibleLayer(
          f_auto, g_auto, manual_grads=False)
      auto_grad_layer(input_tensor)  # init weights
      # Clone all weights (tf_keras.layers.Layer has no .clone())
      auto_grad_layer._f.set_weights(manual_grad_layer._f.get_weights())
      auto_grad_layer._g.set_weights(manual_grad_layer._g.get_weights())

    @tf.function
    def manual_fn():
      with tf.GradientTape() as tape:
        output = manual_grad_layer(input_tensor, training=True)
      grads = tape.gradient(output, manual_grad_layer.trainable_variables)
      return grads

    @tf.function
    def auto_fn():
      with tf.GradientTape() as tape:
        output = auto_grad_layer(input_tensor, training=True)
      grads = tape.gradient(output, auto_grad_layer.trainable_variables)
      return grads

    manual_grads = distribution.run(manual_fn)
    auto_grads = distribution.run(auto_fn)

    # Assert that the manually calculated gradients are close to those from
    # autograd.
    for manual_grad, auto_grad in zip(manual_grads, auto_grads):
      self.assertAllClose(
          distribution.experimental_local_results(manual_grad),
          distribution.experimental_local_results(auto_grad),
          atol=5e-3,
          rtol=5e-3)

    # Verify that the BN moving mean and variance are correct.
    for manual_var, auto_var in zip(manual_grad_layer.non_trainable_variables,
                                    auto_grad_layer.non_trainable_variables):
      self.assertAllClose(manual_var, auto_var)


# Test class that wraps a standard attention layer. If this layer is called
# at any point, the list passed to the config object will be filled with a
# boolean 'True'. We register this class as a Keras serializable so we can
# test serialization below.
@tf_keras.utils.register_keras_serializable(package='TestOnlyAttention')
class ValidatedAttentionLayer(nn_layers.MultiHeadAttention):

  def __init__(self, call_list, **kwargs):
    super(ValidatedAttentionLayer, self).__init__(**kwargs)
    self.list = call_list

  def call(
      self,
      query,
      value,
      attention_mask=None,
      return_attention_scores=False,
  ):
    self.list.append(True)
    return super(ValidatedAttentionLayer, self).call(
        query,
        value,
        attention_mask=attention_mask,
        return_attention_scores=return_attention_scores)

  def get_config(self):
    config = super(ValidatedAttentionLayer, self).get_config()
    config['call_list'] = self.list
    return config


# Test class implements a simple feedforward layer. If this layer is called
# at any point, the list passed to the config object will be filled with a
# boolean 'True'. We register this class as a Keras serializable so we can
# test serialization below.
@tf_keras.utils.register_keras_serializable(package='TestOnlyFeedforward')
class ValidatedFeedforwardLayer(tf_keras.layers.Layer):

  def __init__(self, call_list, activation, **kwargs):
    super(ValidatedFeedforwardLayer, self).__init__(**kwargs)
    self.list = call_list
    self.activation = activation

  def build(self, input_shape):
    hidden_size = input_shape[-1]
    self._feedforward_dense = tf_keras.layers.EinsumDense(
        '...x,xy->...y',
        output_shape=hidden_size,
        bias_axes='y',
        activation=self.activation,
        name='feedforward')

  def call(self, inputs):
    self.list.append(True)
    return self._feedforward_dense(inputs)

  def get_config(self):
    config = super(ValidatedFeedforwardLayer, self).get_config()
    config['call_list'] = []
    config['activation'] = self.activation
    return config


class TransformerLayerTest(tf.test.TestCase, parameterized.TestCase):

  def tearDown(self):
    super(TransformerLayerTest, self).tearDown()
    tf_keras.mixed_precision.set_global_policy('float32')

  @parameterized.parameters(None, 2)
  def test_layer_creation(self, max_attention_inference_parallelism):
    sequence_length = 21
    width = 80

    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': []
    }
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        max_attention_inference_parallelism=max_attention_inference_parallelism,
    )

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)

    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())

    call_list = test_layer._attention_layer.get_config()['call_list']
    # If call_list[0] exists and is True, the passed layer class was
    # instantiated from the given config properly.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")

  def test_layer_creation_with_feedforward_cls(self):
    sequence_length = 21
    width = 80

    call_list = []
    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': call_list,
    }
    feedforward_call_list = []
    feedforward_layer_cfg = {
        'activation': 'relu',
        'call_list': feedforward_call_list,
    }
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        feedforward_cls=ValidatedFeedforwardLayer,
        feedforward_cfg=feedforward_layer_cfg,
        num_attention_heads=10,
        inner_dim=None,
        inner_activation=None)

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)

    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())

    # If call_list[0] exists and is True, the passed layer class was
    # instantiated from the given config properly.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")
    self.assertNotEmpty(feedforward_call_list)
    self.assertTrue(feedforward_call_list[0],
                    "The passed layer class wasn't instantiated.")

  def test_layer_creation_with_mask(self):
    sequence_length = 21
    width = 80

    call_list = []
    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': call_list,
    }
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu')

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf_keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())

    # If call_list[0] exists and is True, the passed layer class was
    # instantiated from the given config properly.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")

  @parameterized.parameters(None, 2)
  def test_layer_invocation(self, max_attention_inference_parallelism):
    sequence_length = 21
    width = 80

    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': [],
    }
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        max_attention_inference_parallelism=max_attention_inference_parallelism)

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)

    # Create a model from the test layer.
    model = tf_keras.Model(data_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    _ = model.predict(input_data)

    call_list = test_layer._attention_layer.get_config()['call_list']
    # If call_list[0] exists and is True, the passed layer class was
    # instantiated from the given config properly.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")

  def test_layer_invocation_with_feedforward_cls(self):
    sequence_length = 21
    width = 80

    call_list = []
    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': call_list,
    }
    feedforward_call_list = []
    feedforward_layer_cfg = {
        'activation': 'relu',
        'call_list': feedforward_call_list,
    }
    feedforward_layer = ValidatedFeedforwardLayer(**feedforward_layer_cfg)
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        feedforward_cls=feedforward_layer,
        num_attention_heads=10,
        inner_dim=None,
        inner_activation=None)

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf_keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf_keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])

    # If call_list[0] exists and is True, the passed layer class was
    # instantiated from the given config properly.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")
    self.assertNotEmpty(feedforward_call_list)
    self.assertTrue(feedforward_call_list[0],
                    "The passed layer class wasn't instantiated.")

  def test_layer_invocation_with_mask(self):
    sequence_length = 21
    width = 80

    call_list = []
    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': call_list,
    }
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu')

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf_keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf_keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])

    # If call_list[0] exists and is True, the passed layer class was
    # instantiated from the given config properly.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")

  def test_layer_invocation_with_float16_dtype(self):
    tf_keras.mixed_precision.set_global_policy('mixed_float16')
    sequence_length = 21
    width = 80

    call_list = []
    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': call_list,
    }
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu')

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf_keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf_keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = (10 * np.random.random_sample(
        (batch_size, sequence_length, width)))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])

    # If call_list[0] exists and is True, the passed layer class was
    # instantiated from the given config properly.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")

  def test_transform_with_initializer(self):
    sequence_length = 21
    width = 80

    call_list = []
    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': call_list,
    }
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=0.02))

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    output = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(), output.shape.as_list())

    # If call_list[0] exists and is True, the passed layer class was
    # instantiated from the given config properly.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0])

  def test_layer_restoration_from_config(self):
    sequence_length = 21
    width = 80

    call_list = []
    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': call_list,
        'name': 'test_layer',
    }
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu')

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf_keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf_keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    pre_serialization_output = model.predict([input_data, mask_data])

    # Serialize the model config so that an identical model, including the
    # custom layers, can be rebuilt from it below.
    serialized_data = model.get_config()

    # Create a new model from the old config, and copy the weights. These
    # models should have identical outputs.
    new_model = tf_keras.Model.from_config(serialized_data)
    new_model.set_weights(model.get_weights())
    output = new_model.predict([input_data, mask_data])

    self.assertAllClose(pre_serialization_output, output)

    # If the layer was configured correctly, it should have a list attribute
    # (since it should have the custom class and config passed to it).
    new_model.summary()
    new_call_list = new_model.get_layer(
        name='transformer_scaffold')._attention_layer.list
    self.assertNotEmpty(new_call_list)
    self.assertTrue(new_call_list[0],
                    "The passed layer class wasn't instantiated.")

  def test_layer_with_feedforward_cls_restoration_from_config(self):
    sequence_length = 21
    width = 80

    call_list = []
    attention_layer_cfg = {
        'num_heads': 10,
        'key_dim': 8,
        'call_list': call_list,
        'name': 'test_layer',
    }
    feedforward_call_list = []
    feedforward_layer_cfg = {
        'activation': 'relu',
        'call_list': feedforward_call_list,
    }
    test_layer = nn_blocks.TransformerScaffold(
        attention_cls=ValidatedAttentionLayer,
        attention_cfg=attention_layer_cfg,
        feedforward_cls=ValidatedFeedforwardLayer,
        feedforward_cfg=feedforward_layer_cfg,
        num_attention_heads=10,
        inner_dim=None,
        inner_activation=None)

    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf_keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf_keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf_keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    pre_serialization_output = model.predict([input_data, mask_data])

    serialized_data = model.get_config()
    # Create a new model from the old config, and copy the weights. These
    # models should have identical outputs.
    new_model = tf_keras.Model.from_config(serialized_data)
    new_model.set_weights(model.get_weights())
    output = new_model.predict([input_data, mask_data])

    self.assertAllClose(pre_serialization_output, output)

    # If the layer was configured correctly, it should have a list attribute
    # (since it should have the custom class and config passed to it).
    new_model.summary()
    new_call_list = new_model.get_layer(
        name='transformer_scaffold')._attention_layer.list
    self.assertNotEmpty(new_call_list)
    self.assertTrue(new_call_list[0],
                    "The passed layer class wasn't instantiated.")
    new_feedforward_call_list = new_model.get_layer(
        name='transformer_scaffold')._feedforward_block.list
    self.assertNotEmpty(new_feedforward_call_list)
    self.assertTrue(new_feedforward_call_list[0],
                    "The passed layer class wasn't instantiated.")


if __name__ == '__main__':
  tf.test.main()