# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimizer configs."""
from typing import List, Optional
import dataclasses
from official.modeling.hyperparams import base_config


@dataclasses.dataclass
class BaseOptimizerConfig(base_config.Config):
  """Base optimizer config.

  Attributes:
    clipnorm: float >= 0 or None. If not None, gradients will be clipped when
      their L2 norm exceeds this value.
    clipvalue: float >= 0 or None. If not None, gradients will be clipped when
      their absolute value exceeds this value.
    global_clipnorm: float >= 0 or None. If not None, gradients of all weights
      are clipped so that their global norm is no higher than this value.
  """
  clipnorm: Optional[float] = None
  clipvalue: Optional[float] = None
  global_clipnorm: Optional[float] = None
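

# A minimal usage sketch: the clipping fields are inherited by every concrete
# config below, and normally at most one of the three is set at a time.
#
#   config = BaseOptimizerConfig(global_clipnorm=1.0)
#   assert config.clipnorm is None and config.clipvalue is None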


@dataclasses.dataclass
class SGDConfig(BaseOptimizerConfig):
  """Configuration for SGD optimizer.

  The attributes for this class match the arguments of tf_keras.optimizers.SGD.

  Attributes:
    name: name of the optimizer.
    decay: decay rate for SGD optimizer.
    nesterov: whether to apply Nesterov momentum.
    momentum: momentum for SGD optimizer.
  """
  name: str = "SGD"
  decay: float = 0.0
  nesterov: bool = False
  momentum: float = 0.0
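

# Example (a minimal sketch; the hyperparameter values are illustrative, not
# recommendations):
#
#   sgd = SGDConfig(momentum=0.9, nesterov=True, clipnorm=1.0)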


# TODO(b/216129465): Merge this config with SGDConfig after the experimental
# optimizer graduates.
@dataclasses.dataclass
class SGDExperimentalConfig(BaseOptimizerConfig):
  """Configuration for SGD optimizer.

  The attributes for this class match the arguments of
  `tf_keras.optimizers.experimental.SGD`.

  Attributes:
    name: name of the optimizer.
    nesterov: whether to apply Nesterov momentum.
    momentum: momentum for SGD optimizer.
    jit_compile: if True, jit compile will be used.
  """
  name: str = "SGD"
  nesterov: bool = False
  momentum: float = 0.0
  jit_compile: bool = False


@dataclasses.dataclass
class RMSPropConfig(BaseOptimizerConfig):
  """Configuration for RMSprop optimizer.

  The attributes for this class match the arguments of
  tf_keras.optimizers.RMSprop.

  Attributes:
    name: name of the optimizer.
    rho: discounting factor for RMSprop optimizer.
    momentum: momentum for RMSprop optimizer.
    epsilon: epsilon value for RMSprop optimizer, helps with numerical
      stability.
    centered: whether to normalize gradients by the estimated variance of the
      gradient (centered RMSprop) or not.
  """
  name: str = "RMSprop"
  rho: float = 0.9
  momentum: float = 0.0
  epsilon: float = 1e-7
  centered: bool = False


@dataclasses.dataclass
class AdagradConfig(BaseOptimizerConfig):
  """Configuration for Adagrad optimizer.

  The attributes of this class match the arguments of
  tf_keras.optimizers.Adagrad.

  Attributes:
    name: name of the optimizer.
    initial_accumulator_value: A floating point value. Starting value for the
      accumulators; must be non-negative.
    epsilon: A small floating point value to avoid zero denominator.
  """
  name: str = "Adagrad"
  initial_accumulator_value: float = 0.1
  epsilon: float = 1e-07


@dataclasses.dataclass
class AdamConfig(BaseOptimizerConfig):
  """Configuration for Adam optimizer.

  The attributes for this class match the arguments of
  tf_keras.optimizers.Adam.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in Adam optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
  """
  name: str = "Adam"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
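

# Example (a sketch): constructing an Adam config and reading it back as a
# plain dict; `as_dict()` is assumed here from the base_config.Config
# interface this class inherits.
#
#   adam = AdamConfig(beta_1=0.9, beta_2=0.98, epsilon=1e-6)
#   assert adam.as_dict()["name"] == "Adam"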


@dataclasses.dataclass
class AdamExperimentalConfig(BaseOptimizerConfig):
  """Configuration for experimental Adam optimizer.

  The attributes for this class match the arguments of
  `tf_keras.optimizers.experimental.Adam`.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in Adam optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
    jit_compile: if True, jit compile will be used.
  """
  name: str = "Adam"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
  jit_compile: bool = False


@dataclasses.dataclass
class AdamWeightDecayConfig(BaseOptimizerConfig):
  """Configuration for Adam optimizer with weight decay.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in the optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
    weight_decay_rate: float. Weight decay rate. Defaults to 0.
    include_in_weight_decay: list[str], or None. List of weight names to
      include in weight decay.
    exclude_from_weight_decay: list[str], or None. List of weight names to
      exclude from weight decay.
    gradient_clip_norm: A positive float. Clips the gradients to this maximum
      L2-norm. Defaults to 1.0.
  """
  name: str = "AdamWeightDecay"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
  weight_decay_rate: float = 0.0
  include_in_weight_decay: Optional[List[str]] = None
  exclude_from_weight_decay: Optional[List[str]] = None
  gradient_clip_norm: float = 1.0
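

# Example (a sketch; the pattern strings are illustrative): a typical
# BERT-style setup decays all weights except normalization parameters and
# biases.
#
#   adamw = AdamWeightDecayConfig(
#       weight_decay_rate=0.01,
#       exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
#   )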


@dataclasses.dataclass
class AdamWeightDecayExperimentalConfig(BaseOptimizerConfig):
  """Configuration for experimental Adam optimizer with weight decay.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in the optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
    weight_decay: float. Weight decay rate. Defaults to 0.
    global_clipnorm: A positive float. Clips the gradients to this maximum
      L2-norm. Defaults to 1.0.
    jit_compile: if True, jit compile will be used.
  """
  name: str = "AdamWeightDecayExperimental"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
  weight_decay: float = 0.0
  global_clipnorm: float = 1.0
  jit_compile: bool = False


@dataclasses.dataclass
class LAMBConfig(BaseOptimizerConfig):
  """Configuration for LAMB optimizer.

  The attributes for this class match the arguments of the LAMB optimizer.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in LAMB optimizer.
    weight_decay_rate: float. Weight decay rate. Defaults to 0.
    exclude_from_weight_decay: List of regex patterns of variables excluded
      from weight decay. Variables whose names contain a substring matching
      the pattern will be excluded.
    exclude_from_layer_adaptation: List of regex patterns of variables excluded
      from layer adaptation. Variables whose names contain a substring matching
      the pattern will be excluded.
  """
  name: str = "LAMB"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6
  weight_decay_rate: float = 0.0
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None


@dataclasses.dataclass
class EMAConfig(BaseOptimizerConfig):
  """Exponential moving average optimizer config.

  Attributes:
    name: 'str', name of the optimizer.
    trainable_weights_only: 'bool', if True, only model trainable weights will
      be updated. Otherwise, all model weights will be updated. This mainly
      affects batch normalization parameters.
    average_decay: 'float', average decay value.
    start_step: 'int', start step to apply moving average.
    dynamic_decay: 'bool', whether to apply dynamic decay or not.
  """
  name: str = "ExponentialMovingAverage"
  trainable_weights_only: bool = True
  average_decay: float = 0.99
  start_step: int = 0
  dynamic_decay: bool = True
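

# Example (a sketch; the step count is illustrative): EMA wraps a base
# optimizer, so a typical config keeps a high decay and starts averaging only
# after warmup.
#
#   ema = EMAConfig(average_decay=0.9999, start_step=10000)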


@dataclasses.dataclass
class LARSConfig(BaseOptimizerConfig):
  """Layer-wise adaptive rate scaling config.

  Attributes:
    name: 'str', name of the optimizer.
    momentum: `float` hyperparameter >= 0 that accelerates gradient descent in
      the relevant direction and dampens oscillations. Defaults to 0.9.
    eeta: `float` LARS coefficient as used in the paper. Defaults to the LARS
      coefficient from the paper. (eeta / weight_decay) determines the highest
      scaling factor in LARS.
    weight_decay_rate: `float` for weight decay.
    nesterov: 'boolean' for whether to use Nesterov momentum.
    classic_momentum: `boolean` for whether to use classic (or popular)
      momentum. The learning rate is applied during the momentum update in
      classic momentum, but after the momentum update in popular momentum.
    exclude_from_weight_decay: A list of `string` for variable screening: if
      any of the strings appears in a variable's name, the variable will be
      excluded from weight decay. For example, one could specify the list
      ['batch_normalization', 'bias'] to exclude BN and bias from weight decay.
    exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but
      for layer adaptation. If None, it defaults to the same value as
      exclude_from_weight_decay.
  """
  name: str = "LARS"
  momentum: float = 0.9
  eeta: float = 0.001
  weight_decay_rate: float = 0.0
  nesterov: bool = False
  classic_momentum: bool = True
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None


@dataclasses.dataclass
class SLIDEConfig(BaseOptimizerConfig):
  """Configuration for SLIDE optimizer.

  Details coming soon.
  """
  name: str = "SLIDE"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6
  weight_decay_rate: float = 0.0
  weight_decay_type: str = "inner"
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
  include_in_sparse_layer_adaptation: Optional[List[str]] = None
  sparse_layer_learning_rate: float = 0.1
  do_gradient_rescaling: bool = True
  norm_type: str = "layer"
  ratio_clip_norm: float = 1e5


@dataclasses.dataclass
class AdafactorConfig(BaseOptimizerConfig):
  """Configuration for Adafactor optimizer.

  The attributes for this class match the arguments of the Adafactor
  implementation.
  """
  name: str = "Adafactor"
  factored: bool = True
  multiply_by_parameter_scale: bool = True
  beta1: Optional[float] = None
  decay_rate: float = 0.8
  step_offset: int = 0
  clipping_threshold: float = 1.0
  min_dim_size_to_factor: int = 128
  epsilon1: float = 1e-30
  epsilon2: float = 1e-3
  weight_decay: Optional[float] = None
  include_in_weight_decay: Optional[str] = None


@dataclasses.dataclass
class AdafactorKerasConfig(BaseOptimizerConfig):
  """Configuration for AdafactorKeras optimizer.

  The attributes for this class match the arguments of the Adafactor
  implementation provided by Keras.

  Attributes:
    learning_rate: Initial value for the learning rate: either a floating
      point value, or a `tf_keras.optimizers.schedules.LearningRateSchedule`
      instance. Defaults to 0.001.
    beta_2_decay: float, defaults to -0.8. The decay rate of `beta_2`.
    epsilon_1: float, defaults to 1e-30. A small offset to keep the
      denominator away from 0.
    epsilon_2: float, defaults to 1e-3. A small offset to avoid the learning
      rate becoming too small over time.
    clip_threshold: float, defaults to 1.0. Clipping threshold. This is a part
      of the Adafactor algorithm, independent of `clipnorm`, `clipvalue`, and
      `global_clipnorm`.
    relative_step: bool, defaults to True. If `learning_rate` is a constant
      and `relative_step=True`, the learning rate will be adjusted based on
      the current iteration. This is the default learning rate decay in
      Adafactor.
  """
  name: str = "Adafactor"
  learning_rate: float = 0.001
  beta_2_decay: float = -0.8
  epsilon_1: float = 1e-30
  epsilon_2: float = 1e-3
  clip_threshold: float = 1.0
  relative_step: bool = True
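

# A small smoke test (a sketch for illustration; `override()` and `as_dict()`
# are assumed from the base_config.Config interface). Running the module
# directly prints two configs with overridden fields.
if __name__ == "__main__":
  adam = AdamConfig(beta_1=0.9, beta_2=0.98)
  adam.override({"epsilon": 1e-6})
  print(adam.as_dict())

  sgd = SGDConfig(momentum=0.9, nesterov=True, clipnorm=1.0)
  print(sgd.as_dict())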