Spaces:

deanna-emery
/

ASL-MoViNet-T5-translator

Runtime error

App Files Files Community

ASL-MoViNet-T5-translator / official /legacy /transformer /model_params.py

deanna-emery

updates

93528c6 about 1 year ago

raw

history blame

2.94 kB

	# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Defines Transformer model parameters."""

	import collections


	# Baseline Transformer hyperparameters. Reading a key that was never set
	# yields None rather than raising KeyError (defaultdict with a None factory).
	BASE_PARAMS = collections.defaultdict(
	    lambda: None,
	    {
	        # ---- Input ----
	        # Maximum number of tokens per batch of examples.
	        "default_batch_size": 2048,
	        "default_batch_size_tpu": 32768,
	        # Maximum number of tokens per example.
	        "max_length": 256,

	        # ---- Model ----
	        # Used in trainable variable initialization.
	        "initializer_gain": 1.0,
	        # Number of tokens defined in the vocabulary file.
	        "vocab_size": 33708,
	        # Model dimension in the hidden layers.
	        "hidden_size": 512,
	        # Number of layers in the encoder and decoder stacks.
	        "num_hidden_layers": 6,
	        # Number of heads to use in multi-headed attention.
	        "num_heads": 8,
	        # Inner layer dimension in the feedforward network.
	        "filter_size": 2048,

	        # ---- Dropout (only used when training) ----
	        "layer_postprocess_dropout": 0.1,
	        "attention_dropout": 0.1,
	        "relu_dropout": 0.1,

	        # ---- Training ----
	        "label_smoothing": 0.1,
	        "learning_rate": 2.0,
	        "learning_rate_decay_rate": 1.0,
	        "learning_rate_warmup_steps": 16000,

	        # ---- Optimizer ----
	        "optimizer_adam_beta1": 0.9,
	        "optimizer_adam_beta2": 0.997,
	        "optimizer_adam_epsilon": 1e-09,

	        # ---- Default prediction settings ----
	        "extra_decode_length": 50,
	        "beam_size": 4,
	        # Used to calculate length normalization in beam search.
	        "alpha": 0.6,

	        # ---- TPU specific ----
	        "use_tpu": False,
	        "static_batch": False,
	        "allow_ffn_pad": True,
	    },
	)

	# "Big" configuration: every BASE_PARAMS entry (and its default factory) is
	# carried over, then the entries below replace the inherited values.
	BIG_PARAMS = collections.defaultdict(
	    BASE_PARAMS.default_factory,
	    BASE_PARAMS,
	    default_batch_size=4096,
	    # The TPU batch size is smaller than in BASE_PARAMS due to memory limits.
	    default_batch_size_tpu=16384,
	    hidden_size=1024,
	    filter_size=4096,
	    num_heads=16,
	)

	# Multi-GPU settings. These should leave the shape-defining parameters
	# (such as hidden_size or num_heads) untouched; relative to BASE_PARAMS only
	# the learning-rate warmup length is overridden here.
	BASE_MULTI_GPU_PARAMS = collections.defaultdict(
	    BASE_PARAMS.default_factory,
	    BASE_PARAMS,
	    learning_rate_warmup_steps=8000,
	)

	# Multi-GPU settings for the big model: BIG_PARAMS with the post-layer dropout
	# and the learning-rate warmup length overridden.
	BIG_MULTI_GPU_PARAMS = collections.defaultdict(
	    BIG_PARAMS.default_factory,
	    BIG_PARAMS,
	    layer_postprocess_dropout=0.3,
	    learning_rate_warmup_steps=8000,
	)

	# Small configuration for testing the model: BASE_PARAMS with reduced batch
	# sizes and reduced hidden/attention/feed-forward dimensions.
	TINY_PARAMS = collections.defaultdict(
	    BASE_PARAMS.default_factory,
	    BASE_PARAMS,
	    default_batch_size=1024,
	    default_batch_size_tpu=1024,
	    hidden_size=32,
	    num_heads=4,
	    filter_size=256,
	)