ASL-MoViNet-T5-translator

Sleeping

App Files Files Community

ASL-MoViNet-T5-translator / official /nlp /modeling /networks /encoder_scaffold.py

deanna-emery

updates

93528c6 over 1 year ago

raw

history blame contribute delete

17.2 kB

	# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Transformer-based text encoder network."""
	# pylint: disable=g-classes-have-attributes
	import copy
	import inspect

	from absl import logging
	import gin
	import tensorflow as tf, tf_keras

	from official.modeling import tf_utils
	from official.nlp.modeling import layers


	@tf_keras.utils.register_keras_serializable(package='Text')
	@gin.configurable
	class EncoderScaffold(tf_keras.Model):
	  """Bi-directional Transformer-based encoder network scaffold.

	  This network allows users to flexibly implement an encoder similar to the one
	  described in "BERT: Pre-training of Deep Bidirectional Transformers for
	  Language Understanding" (https://arxiv.org/abs/1810.04805).

	  In this network, users can choose to provide a custom embedding subnetwork
	  (which will replace the standard embedding logic) and/or a custom hidden layer
	  class (which will replace the Transformer instantiation in the encoder). For
	  each of these custom injection points, users can pass either a class or a
	  class instance. If a class is passed, that class will be instantiated using
	  the `embedding_cfg` or `hidden_cfg` argument, respectively; if an instance
	  is passed, that instance will be invoked. (In the case of hidden_cls, the
	  instance will be invoked 'num_hidden_instances' times.)

	  If the hidden_cls is not overridden, a default transformer layer will be
	  instantiated.

	  Note that the network is constructed by
	  [Keras Functional API](https://keras.io/guides/functional_api/).

	  Args:
	    pooled_output_dim: The dimension of pooled output.
	    pooler_layer_initializer: The initializer for the classification layer.
	    embedding_cls: The class or instance to use to embed the input data. This
	      class or instance defines the inputs to this encoder and outputs (1)
	      embeddings tensor with shape `(batch_size, seq_length, hidden_size)` and
	      (2) attention masking with tensor `(batch_size, seq_length, seq_length)`.
	      If `embedding_cls` is not set, a default embedding network (from the
	      original BERT paper) will be created.
	    embedding_cfg: A dict of kwargs to pass to the embedding_cls, if it needs to
	      be instantiated. If `embedding_cls` is not set, a config dict must be
	      passed to `embedding_cfg` with the following values:
	      `vocab_size`: The size of the token vocabulary.
	      `type_vocab_size`: The size of the type vocabulary.
	      `hidden_size`: The hidden size for this encoder.
	      `max_seq_length`: The maximum sequence length for this encoder.
	      `seq_length`: The sequence length for this encoder.
	      `initializer`: The initializer for the embedding portion of this encoder.
	      `dropout_rate`: The dropout rate to apply before the encoding layers.
	    embedding_data: A reference to the embedding weights that will be used to
	      train the masked language model, if necessary. This is optional, and only
	      needed if (1) you are overriding `embedding_cls` and (2) are doing
	      standard pretraining. It is not part of `get_config()`.
	    num_hidden_instances: The number of times to instantiate and/or invoke the
	      hidden_cls.
	    hidden_cls: Three types of input are supported: (1) class (2) instance
	      (3) list of classes or instances, to encode the input data. If
	      `hidden_cls` is not set, a KerasBERT transformer layer will be used as the
	      encoder class. If `hidden_cls` is a list of classes or instances, these
	      classes (instances) are sequentially instantiated (invoked) on top of
	      embedding layer. Mixing classes and instances in the list is allowed.
	    hidden_cfg: A dict of kwargs to pass to the hidden_cls, if it needs to be
	      instantiated. The dict passed in is never modified; per-layer values
	      (attention seed, layer index) are written to a per-layer copy. If
	      hidden_cls is not set, a config dict must be passed to `hidden_cfg` with
	      the following values:
	      `num_attention_heads`: The number of attention heads. The hidden size
	        must be divisible by `num_attention_heads`.
	      `intermediate_size`: The intermediate size of the transformer.
	      `intermediate_activation`: The activation to apply in the transformer.
	      `dropout_rate`: The overall dropout rate for the transformer layers.
	      `attention_dropout_rate`: The dropout rate for the attention layers.
	      `kernel_initializer`: The initializer for the transformer layers.
	    mask_cls: The class to generate masks passed into hidden_cls() from inputs
	      and 2D mask indicating positions we can attend to. It is the caller's job
	      to make sure the output of the mask_layer can be used by hidden_layer.
	      A mask_cls is usually mapped to a hidden_cls. Only used when
	      `embedding_cls` is not set.
	    mask_cfg: A dict of kwargs pass to mask_cls.
	    layer_norm_before_pooling: Whether to add a layer norm before the pooling
	      layer. You probably want to turn this on if you set `norm_first=True` in
	      transformer layers.
	    return_all_layer_outputs: Whether to output sequence embedding outputs of
	      all encoder transformer layers.
	    dict_outputs: Whether to use a dictionary as the model outputs.
	    layer_idx_as_attention_seed: Whether to include layer_idx in
	      attention_cfg in hidden_cfg.
	    feed_layer_idx: whether the scaffold should feed layer index to hidden_cls.
	    recursive: whether to pass the second return of the hidden layer as the last
	      element among the inputs. None will be passed as the initial state.
	    **kwargs: Forwarded to the `tf_keras.Model` constructor and echoed back by
	      `get_config()`.

	  Raises:
	    ValueError: If neither `embedding_cls` nor `embedding_cfg` is provided.
	    RuntimeError: If `hidden_cls` is a list whose length differs from
	      `num_hidden_instances`.
	  """

	  def __init__(self,
	               pooled_output_dim,
	               pooler_layer_initializer=tf_keras.initializers.TruncatedNormal(
	                   stddev=0.02),
	               embedding_cls=None,
	               embedding_cfg=None,
	               embedding_data=None,
	               num_hidden_instances=1,
	               hidden_cls=layers.Transformer,
	               hidden_cfg=None,
	               mask_cls=layers.SelfAttentionMask,
	               mask_cfg=None,
	               layer_norm_before_pooling=False,
	               return_all_layer_outputs=False,
	               dict_outputs=False,
	               layer_idx_as_attention_seed=False,
	               feed_layer_idx=False,
	               recursive=False,
	               **kwargs):

	    if embedding_cls:
	      # Custom embedding path: the embedding network supplies both the model
	      # inputs and the attention mask, so `mask_cls` is not consulted here.
	      if inspect.isclass(embedding_cls):
	        embedding_network = embedding_cls(
	            **embedding_cfg) if embedding_cfg else embedding_cls()
	      else:
	        embedding_network = embedding_cls
	      inputs = embedding_network.inputs
	      embeddings, attention_mask = embedding_network(inputs)
	      embedding_layer = None
	      position_embedding_layer = None
	      type_embedding_layer = None
	      embedding_norm_layer = None
	    else:
	      # Default embedding path: word + position + type embeddings, followed by
	      # layer norm and dropout.
	      if embedding_cfg is None:
	        raise ValueError(
	            'embedding_cfg must be provided when embedding_cls is not set.')
	      embedding_network = None
	      seq_length = embedding_cfg.get('seq_length', None)
	      word_ids = tf_keras.layers.Input(
	          shape=(seq_length,), dtype=tf.int32, name='input_word_ids')
	      mask = tf_keras.layers.Input(
	          shape=(seq_length,), dtype=tf.int32, name='input_mask')
	      type_ids = tf_keras.layers.Input(
	          shape=(seq_length,), dtype=tf.int32, name='input_type_ids')
	      inputs = [word_ids, mask, type_ids]

	      embedding_layer = layers.OnDeviceEmbedding(
	          vocab_size=embedding_cfg['vocab_size'],
	          embedding_width=embedding_cfg['hidden_size'],
	          initializer=tf_utils.clone_initializer(embedding_cfg['initializer']),
	          name='word_embeddings')

	      word_embeddings = embedding_layer(word_ids)

	      # Always uses dynamic slicing for simplicity.
	      position_embedding_layer = layers.PositionEmbedding(
	          initializer=tf_utils.clone_initializer(embedding_cfg['initializer']),
	          max_length=embedding_cfg['max_seq_length'],
	          name='position_embedding')
	      position_embeddings = position_embedding_layer(word_embeddings)

	      type_embedding_layer = layers.OnDeviceEmbedding(
	          vocab_size=embedding_cfg['type_vocab_size'],
	          embedding_width=embedding_cfg['hidden_size'],
	          initializer=tf_utils.clone_initializer(embedding_cfg['initializer']),
	          use_one_hot=True,
	          name='type_embeddings')
	      type_embeddings = type_embedding_layer(type_ids)

	      embeddings = tf_keras.layers.Add()(
	          [word_embeddings, position_embeddings, type_embeddings])

	      embedding_norm_layer = tf_keras.layers.LayerNormalization(
	          name='embeddings/layer_norm',
	          axis=-1,
	          epsilon=1e-12,
	          dtype=tf.float32)
	      embeddings = embedding_norm_layer(embeddings)

	      embeddings = (
	          tf_keras.layers.Dropout(
	              rate=embedding_cfg['dropout_rate'])(embeddings))

	      mask_cfg = {} if mask_cfg is None else mask_cfg
	      if inspect.isclass(mask_cls):
	        mask_layer = mask_cls(**mask_cfg)
	      else:
	        mask_layer = mask_cls
	      attention_mask = mask_layer(embeddings, mask)

	    data = embeddings

	    layer_output_data = []
	    hidden_layers = []
	    hidden_cfg = hidden_cfg if hidden_cfg else {}

	    if isinstance(hidden_cls, list) and len(hidden_cls) != num_hidden_instances:
	      # `self.name` is not available before `super().__init__()` has run, so
	      # the model name (if any) is taken from the constructor kwargs. The
	      # format arguments must be a single tuple for `%` to consume all three.
	      raise RuntimeError(
	          ('When input hidden_cls to EncoderScaffold %s is a list, it must '
	           'contain classes or instances with size specified by '
	           'num_hidden_instances, got %d vs %d.') %
	          (kwargs.get('name'), len(hidden_cls), num_hidden_instances))
	    # Consider supporting customized init states.
	    recursive_states = None
	    for i in range(num_hidden_instances):
	      if isinstance(hidden_cls, list):
	        cur_hidden_cls = hidden_cls[i]
	      else:
	        cur_hidden_cls = hidden_cls
	      if inspect.isclass(cur_hidden_cls):
	        # Per-layer values go into a copy so that neither the caller's dict
	        # nor the config reported by `get_config()` is altered.
	        layer_cfg = hidden_cfg
	        if layer_idx_as_attention_seed and 'attention_cfg' in hidden_cfg:
	          layer_cfg = copy.deepcopy(hidden_cfg)
	          layer_cfg['attention_cfg']['seed'] = i
	        if feed_layer_idx:
	          if layer_cfg is hidden_cfg:
	            layer_cfg = dict(hidden_cfg)
	          layer_cfg['layer_idx'] = i
	        layer = cur_hidden_cls(**layer_cfg)
	      else:
	        layer = cur_hidden_cls
	      if recursive:
	        data, recursive_states = layer([data, attention_mask, recursive_states])
	      else:
	        data = layer([data, attention_mask])
	      layer_output_data.append(data)
	      hidden_layers.append(layer)

	    if layer_norm_before_pooling:
	      # Normalize the final output.
	      output_layer_norm = tf_keras.layers.LayerNormalization(
	          name='final_layer_norm',
	          axis=-1,
	          epsilon=1e-12)
	      layer_output_data[-1] = output_layer_norm(layer_output_data[-1])

	    last_layer_output = layer_output_data[-1]
	    # Applying a tf.slice op (through subscript notation) to a Keras tensor
	    # like this will create a SliceOpLambda layer. This is better than a Lambda
	    # layer with Python code, because that is fundamentally less portable.
	    first_token_tensor = last_layer_output[:, 0, :]
	    pooler_layer_initializer = tf_keras.initializers.get(
	        pooler_layer_initializer)
	    pooler_layer = tf_keras.layers.Dense(
	        units=pooled_output_dim,
	        activation='tanh',
	        kernel_initializer=pooler_layer_initializer,
	        name='cls_transform')
	    cls_output = pooler_layer(first_token_tensor)

	    if dict_outputs:
	      outputs = dict(
	          sequence_output=layer_output_data[-1],
	          pooled_output=cls_output,
	          encoder_outputs=layer_output_data,
	      )
	    elif return_all_layer_outputs:
	      outputs = [layer_output_data, cls_output]
	    else:
	      outputs = [layer_output_data[-1], cls_output]

	    # b/164516224
	    # Once we've created the network using the Functional API, we call
	    # super().__init__ as though we were invoking the Functional API Model
	    # constructor, resulting in this object having all the properties of a model
	    # created using the Functional API. Once super().__init__ is called, we
	    # can assign attributes to `self` - note that all `self` assignments are
	    # below this line.
	    super().__init__(
	        inputs=inputs, outputs=outputs, **kwargs)

	    self._hidden_cls = hidden_cls
	    self._hidden_cfg = hidden_cfg
	    self._mask_cls = mask_cls
	    self._mask_cfg = mask_cfg
	    self._num_hidden_instances = num_hidden_instances
	    self._pooled_output_dim = pooled_output_dim
	    self._pooler_layer_initializer = pooler_layer_initializer
	    self._embedding_cls = embedding_cls
	    self._embedding_cfg = embedding_cfg
	    self._embedding_data = embedding_data
	    self._layer_norm_before_pooling = layer_norm_before_pooling
	    self._return_all_layer_outputs = return_all_layer_outputs
	    self._dict_outputs = dict_outputs
	    self._kwargs = kwargs

	    self._embedding_layer = embedding_layer
	    self._embedding_network = embedding_network
	    self._position_embedding_layer = position_embedding_layer
	    self._type_embedding_layer = type_embedding_layer
	    self._embedding_norm_layer = embedding_norm_layer
	    self._hidden_layers = hidden_layers
	    if self._layer_norm_before_pooling:
	      self._output_layer_norm = output_layer_norm
	    self._pooler_layer = pooler_layer
	    self._layer_idx_as_attention_seed = layer_idx_as_attention_seed
	    # Kept so that `get_config()` can reproduce the layer wiring.
	    self._feed_layer_idx = feed_layer_idx
	    self._recursive = recursive

	    logging.info('EncoderScaffold configs: %s', self.get_config())

	  def get_config(self):
	    """Returns the constructor arguments needed to rebuild this scaffold.

	    Classes found in `hidden_cfg`/`mask_cfg` values are replaced by their
	    registered Keras names. A `hidden_cls`/`mask_cls` that is a class is stored
	    under `hidden_cls_string`/`mask_cls_string` as its registered name; any
	    other value is stored as-is under the original key. `embedding_data` is not
	    included.

	    Returns:
	      A dict that `from_config` accepts.
	    """
	    config_dict = {
	        'num_hidden_instances': self._num_hidden_instances,
	        'pooled_output_dim': self._pooled_output_dim,
	        'pooler_layer_initializer': tf_keras.initializers.serialize(
	            self._pooler_layer_initializer),
	        'embedding_cls': self._embedding_network,
	        'embedding_cfg': self._embedding_cfg,
	        'layer_norm_before_pooling': self._layer_norm_before_pooling,
	        'return_all_layer_outputs': self._return_all_layer_outputs,
	        'dict_outputs': self._dict_outputs,
	        'layer_idx_as_attention_seed': self._layer_idx_as_attention_seed,
	        'feed_layer_idx': self._feed_layer_idx,
	        'recursive': self._recursive,
	    }

	    sub_configs = (
	        ('hidden_cfg', self._hidden_cfg),
	        ('mask_cfg', self._mask_cfg),
	    )
	    for cfg_name, cfg in sub_configs:
	      if not cfg:
	        continue
	      # `self._hidden_cfg` may contain `class`, e.g., when `hidden_cfg` is
	      # `TransformerScaffold`, `attention_cls` argument can be a `class`.
	      config_dict[cfg_name] = {
	          k: tf_keras.utils.get_registered_name(v) if inspect.isclass(v) else v
	          for k, v in cfg.items()
	      }

	    injected = (
	        ('hidden_cls', self._hidden_cls),
	        ('mask_cls', self._mask_cls),
	    )
	    for cls_name, cls_value in injected:
	      if inspect.isclass(cls_value):
	        key = '{}_string'.format(cls_name)
	        config_dict[key] = tf_keras.utils.get_registered_name(cls_value)
	      else:
	        config_dict[cls_name] = cls_value

	    config_dict.update(self._kwargs)
	    return config_dict

	  @classmethod
	  def from_config(cls, config, custom_objects=None):
	    """Builds a scaffold from a dict produced by `get_config`.

	    Args:
	      config: The configuration dict. It is not modified.
	      custom_objects: Optional mapping forwarded to
	        `tf_keras.utils.get_registered_object` when resolving
	        `hidden_cls_string` / `mask_cls_string`.

	    Returns:
	      A new instance of `cls`.
	    """
	    # Work on a shallow copy so the caller's dict keeps its `*_string` keys.
	    config = dict(config)
	    for cls_name in ('hidden_cls', 'mask_cls'):
	      cls_string = '{}_string'.format(cls_name)
	      if cls_string in config:
	        config[cls_name] = tf_keras.utils.get_registered_object(
	            config.pop(cls_string), custom_objects=custom_objects)
	    return cls(**config)

	  def get_embedding_table(self):
	    """Returns the word embedding weights, or the supplied `embedding_data`.

	    Raises:
	      RuntimeError: If a custom embedding network is in use and no
	        `embedding_data` was passed to the constructor.
	    """
	    if self._embedding_network is None:
	      # In this case, we don't have a custom embedding network and can return
	      # the standard embedding data.
	      return self._embedding_layer.embeddings

	    if self._embedding_data is None:
	      raise RuntimeError(('The EncoderScaffold %s does not have a reference '
	                          'to the embedding data. This is required when you '
	                          'pass a custom embedding network to the scaffold. '
	                          'It is also possible that you are trying to get '
	                          'embedding data from an embedding scaffold with a '
	                          'custom embedding network where the scaffold has '
	                          'been serialized and deserialized. Unfortunately, '
	                          'accessing custom embedding references after '
	                          'serialization is not yet supported.') % self.name)
	    return self._embedding_data

	  @property
	  def embedding_network(self):
	    """The custom embedding network; raises RuntimeError if there is none."""
	    if self._embedding_network is None:
	      raise RuntimeError(
	          ('The EncoderScaffold %s does not have a reference '
	           'to the embedding network. This is required when you '
	           'pass a custom embedding network to the scaffold.') % self.name)
	    return self._embedding_network

	  @property
	  def hidden_layers(self):
	    """List of hidden layers in the encoder."""
	    return self._hidden_layers

	  @property
	  def pooler_layer(self):
	    """The pooler dense layer after the transformer layers."""
	    return self._pooler_layer