# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimizer configs.""" | |
from typing import List, Optional | |
import dataclasses | |
from official.modeling.hyperparams import base_config | |


@dataclasses.dataclass
class BaseOptimizerConfig(base_config.Config):
  """Base optimizer config.

  Attributes:
    clipnorm: float >= 0 or None. If not None, gradients will be clipped when
      their L2 norm exceeds this value.
    clipvalue: float >= 0 or None. If not None, gradients will be clipped when
      their absolute value exceeds this value.
    global_clipnorm: float >= 0 or None. If not None, gradients of all weights
      are clipped so that their global norm is no higher than this value.
  """
  clipnorm: Optional[float] = None
  clipvalue: Optional[float] = None
  global_clipnorm: Optional[float] = None
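
# A minimal usage sketch, not part of the original module: every optimizer
# config below inherits these clipping fields from this base class, so
# global-norm clipping can be attached to any concrete config the same way.
# The keyword-argument construction relies on the dataclass-style init that
# `base_config.Config` subclasses provide.
#
#   base = BaseOptimizerConfig(global_clipnorm=1.0)
#   assert base.clipnorm is None and base.clipvalue is None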


@dataclasses.dataclass
class SGDConfig(BaseOptimizerConfig):
  """Configuration for SGD optimizer.

  The attributes for this class match the arguments of tf_keras.optimizers.SGD.

  Attributes:
    name: name of the optimizer.
    decay: decay rate for SGD optimizer.
    nesterov: nesterov for SGD optimizer.
    momentum: momentum for SGD optimizer.
  """
  name: str = "SGD"
  decay: float = 0.0
  nesterov: bool = False
  momentum: float = 0.0


# TODO(b/216129465): Merge this config with SGDConfig after the experimental
# optimizer graduates.
@dataclasses.dataclass
class SGDExperimentalConfig(BaseOptimizerConfig):
  """Configuration for SGD optimizer.

  The attributes for this class match the arguments of
  `tf_keras.optimizers.experimental.SGD`.

  Attributes:
    name: name of the optimizer.
    nesterov: nesterov for SGD optimizer.
    momentum: momentum for SGD optimizer.
    jit_compile: if True, jit compile will be used.
  """
  name: str = "SGD"
  nesterov: bool = False
  momentum: float = 0.0
  jit_compile: bool = False


@dataclasses.dataclass
class RMSPropConfig(BaseOptimizerConfig):
  """Configuration for RMSProp optimizer.

  The attributes for this class match the arguments of
  tf_keras.optimizers.RMSprop.

  Attributes:
    name: name of the optimizer.
    rho: discounting factor for RMSprop optimizer.
    momentum: momentum for RMSprop optimizer.
    epsilon: epsilon value for RMSprop optimizer, helps with numerical
      stability.
    centered: whether to normalize gradients or not.
  """
  name: str = "RMSprop"
  rho: float = 0.9
  momentum: float = 0.0
  epsilon: float = 1e-7
  centered: bool = False


@dataclasses.dataclass
class AdagradConfig(BaseOptimizerConfig):
  """Configuration for Adagrad optimizer.

  The attributes of this class match the arguments of
  tf_keras.optimizers.Adagrad.

  Attributes:
    name: name of the optimizer.
    initial_accumulator_value: A floating point value. Starting value for the
      accumulators; must be non-negative.
    epsilon: A small floating point value to avoid a zero denominator.
  """
  name: str = "Adagrad"
  initial_accumulator_value: float = 0.1
  epsilon: float = 1e-07


@dataclasses.dataclass
class AdamConfig(BaseOptimizerConfig):
  """Configuration for Adam optimizer.

  The attributes for this class match the arguments of
  tf_keras.optimizers.Adam.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in Adam optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and beyond".
  """
  name: str = "Adam"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False


@dataclasses.dataclass
class AdamExperimentalConfig(BaseOptimizerConfig):
  """Configuration for experimental Adam optimizer.

  The attributes for this class match the arguments of
  `tf_keras.optimizers.experimental.Adam`.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in Adam optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and beyond".
    jit_compile: if True, jit compile will be used.
  """
  name: str = "Adam"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
  jit_compile: bool = False


@dataclasses.dataclass
class AdamWeightDecayConfig(BaseOptimizerConfig):
  """Configuration for Adam optimizer with weight decay.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in the optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and beyond".
    weight_decay_rate: float. Weight decay rate. Defaults to 0.
    include_in_weight_decay: list[str], or None. List of weight names to
      include in weight decay.
    exclude_from_weight_decay: list[str], or None. List of weight names to not
      include in weight decay.
    gradient_clip_norm: A positive float. Clips the gradients to this maximum
      L2-norm. Defaults to 1.0.
  """
  name: str = "AdamWeightDecay"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
  weight_decay_rate: float = 0.0
  include_in_weight_decay: Optional[List[str]] = None
  exclude_from_weight_decay: Optional[List[str]] = None
  gradient_clip_norm: float = 1.0
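
# A hedged usage sketch, not taken from this file: a typical BERT-style
# fine-tuning setup decays all weights except LayerNorm and bias variables.
# The substrings below are illustrative choices, not defaults of this module.
#
#   adamw = AdamWeightDecayConfig(
#       weight_decay_rate=0.01,
#       exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
#   )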


@dataclasses.dataclass
class AdamWeightDecayExperimentalConfig(BaseOptimizerConfig):
  """Configuration for Adam optimizer with weight decay.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in the optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and beyond".
    weight_decay: float. Weight decay rate. Defaults to 0.
    global_clipnorm: A positive float. Clips the gradients to this maximum
      L2-norm. Defaults to 1.0.
    jit_compile: if True, jit compile will be used.
  """
  name: str = "AdamWeightDecayExperimental"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
  weight_decay: float = 0.0
  global_clipnorm: float = 1.0
  jit_compile: bool = False


@dataclasses.dataclass
class LAMBConfig(BaseOptimizerConfig):
  """Configuration for LAMB optimizer.

  The attributes for this class match the arguments of the LAMB optimizer.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in LAMB optimizer.
    weight_decay_rate: float. Weight decay rate. Defaults to 0.
    exclude_from_weight_decay: List of regex patterns of variables excluded
      from weight decay. Variables whose names contain a substring matching
      the pattern will be excluded.
    exclude_from_layer_adaptation: List of regex patterns of variables excluded
      from layer adaptation. Variables whose names contain a substring matching
      the pattern will be excluded.
  """
  name: str = "LAMB"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6
  weight_decay_rate: float = 0.0
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
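
# Illustrative only; the values are assumptions rather than defaults of this
# module. The layer-adaptation screen can differ from the weight-decay screen,
# e.g. to keep layer adaptation off for bias terms only.
#
#   lamb = LAMBConfig(
#       weight_decay_rate=0.01,
#       exclude_from_weight_decay=["bias", "LayerNorm"],
#       exclude_from_layer_adaptation=["bias"],
#   )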


@dataclasses.dataclass
class EMAConfig(BaseOptimizerConfig):
  """Exponential moving average optimizer config.

  Attributes:
    name: 'str', name of the optimizer.
    trainable_weights_only: 'bool', if True, only model trainable weights will
      be updated. Otherwise, all model weights will be updated. This mainly
      affects batch normalization parameters.
    average_decay: 'float', average decay value.
    start_step: 'int', start step to apply moving average.
    dynamic_decay: 'bool', whether to apply dynamic decay or not.
  """
  name: str = "ExponentialMovingAverage"
  trainable_weights_only: bool = True
  average_decay: float = 0.99
  start_step: int = 0
  dynamic_decay: bool = True
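
# A rough usage sketch (an assumption about typical use, not taken from this
# file): the EMA config only describes the averaging schedule; it is combined
# with a base optimizer config elsewhere in this package.
#
#   ema = EMAConfig(average_decay=0.9999, start_step=1000)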


@dataclasses.dataclass
class LARSConfig(BaseOptimizerConfig):
  """Layer-wise adaptive rate scaling config.

  Attributes:
    name: 'str', name of the optimizer.
    momentum: `float` hyperparameter >= 0 that accelerates gradient descent in
      the relevant direction and dampens oscillations. Defaults to 0.9.
    eeta: `float` LARS coefficient as used in the paper. Defaults to the LARS
      coefficient from the paper. (eeta / weight_decay) determines the highest
      scaling factor in LARS.
    weight_decay_rate: `float` for weight decay.
    nesterov: 'boolean' for whether to use nesterov momentum.
    classic_momentum: `boolean` for whether to use classic (or popular)
      momentum. The learning rate is applied during momentum update in classic
      momentum, but after momentum for popular momentum.
    exclude_from_weight_decay: A list of `string` for variable screening; if
      any of the strings appears in a variable's name, the variable will be
      excluded from weight decay. For example, one could specify the list as
      ['batch_normalization', 'bias'] to exclude BN and bias from weight decay.
    exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but
      for layer adaptation. If it is None, it defaults to the same value as
      exclude_from_weight_decay.
  """
  name: str = "LARS"
  momentum: float = 0.9
  eeta: float = 0.001
  weight_decay_rate: float = 0.0
  nesterov: bool = False
  classic_momentum: bool = True
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
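
# An illustrative configuration mirroring the docstring above (the values are
# examples, not defaults): screen batch-norm and bias variables out of weight
# decay, and let layer adaptation follow the same screen by leaving it None.
#
#   lars = LARSConfig(
#       momentum=0.9,
#       weight_decay_rate=1e-4,
#       exclude_from_weight_decay=["batch_normalization", "bias"],
#   )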


@dataclasses.dataclass
class SLIDEConfig(BaseOptimizerConfig):
  """Configuration for SLIDE optimizer.

  Details coming soon.
  """
  name: str = "SLIDE"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6
  weight_decay_rate: float = 0.0
  weight_decay_type: str = "inner"
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
  include_in_sparse_layer_adaptation: Optional[List[str]] = None
  sparse_layer_learning_rate: float = 0.1
  do_gradient_rescaling: bool = True
  norm_type: str = "layer"
  ratio_clip_norm: float = 1e5


@dataclasses.dataclass
class AdafactorConfig(BaseOptimizerConfig):
  """Configuration for Adafactor optimizer.

  The attributes for this class match the arguments of the Adafactor
  implementation.
  """
  name: str = "Adafactor"
  factored: bool = True
  multiply_by_parameter_scale: bool = True
  beta1: Optional[float] = None
  decay_rate: float = 0.8
  step_offset: int = 0
  clipping_threshold: float = 1.0
  min_dim_size_to_factor: int = 128
  epsilon1: float = 1e-30
  epsilon2: float = 1e-3
  weight_decay: Optional[float] = None
  include_in_weight_decay: Optional[str] = None


@dataclasses.dataclass
class AdafactorKerasConfig(BaseOptimizerConfig):
  """Configuration for the Keras Adafactor optimizer.

  The attributes for this class match the arguments of the Adafactor
  implementation provided by Keras.

  Attributes:
    learning_rate: Initial value for the learning rate: either a floating
      point value, or a `tf_keras.optimizers.schedules.LearningRateSchedule`
      instance. Defaults to 0.001.
    beta_2_decay: float, defaults to -0.8. The decay rate of `beta_2`.
    epsilon_1: float, defaults to 1e-30. A small offset to keep the denominator
      away from 0.
    epsilon_2: float, defaults to 1e-3. A small offset to avoid the learning
      rate becoming too small over time.
    clip_threshold: float, defaults to 1.0. Clipping threshold. This is a part
      of the Adafactor algorithm, independent from `clipnorm`, `clipvalue` and
      `global_clipnorm`.
    relative_step: bool, defaults to True. If `learning_rate` is a constant
      and `relative_step=True`, the learning rate will be adjusted based on
      current iterations. This is a default learning rate decay in Adafactor.
  """
  name: str = "Adafactor"
  learning_rate: float = 0.001
  beta_2_decay: float = -0.8
  epsilon_1: float = 1e-30
  epsilon_2: float = 1e-3
  clip_threshold: float = 1.0
  relative_step: bool = True
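
# A hedged sketch of the relative-step behavior described above (the values
# are assumptions): with `relative_step=True` and a constant `learning_rate`,
# the Keras Adafactor implementation decays the effective rate itself, so a
# config often only pins the stability constants.
#
#   adafactor = AdafactorKerasConfig(clip_threshold=1.0, relative_step=True)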