# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Optimizer factory class."""
from typing import Callable, List, Optional, Tuple, Union

import gin
import tensorflow as tf, tf_keras

from official.modeling.optimization import slide_optimizer
from official.modeling.optimization import adafactor_optimizer
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lamb
from official.modeling.optimization import lars
from official.modeling.optimization import legacy_adamw
from official.modeling.optimization import lr_schedule
from official.modeling.optimization.configs import optimization_config as opt_cfg

# Optimizer classes to be used in both the legacy and the new path.
SHARED_OPTIMIZERS = {
    'sgd_experimental': tf_keras.optimizers.experimental.SGD,
    'adam_experimental': tf_keras.optimizers.experimental.Adam,
    'adamw': legacy_adamw.AdamWeightDecay,
    'adamw_experimental': tf_keras.optimizers.experimental.AdamW,
    'lamb': lamb.LAMB,
    'lars': lars.LARS,
    'slide': slide_optimizer.SLIDE,
    'adafactor': adafactor_optimizer.Adafactor,
    'adafactor_keras': tf_keras.optimizers.Adafactor,
}

LEGACY_OPTIMIZERS_CLS = {
    'sgd': tf_keras.optimizers.legacy.SGD,
    'adam': tf_keras.optimizers.legacy.Adam,
    'rmsprop': tf_keras.optimizers.legacy.RMSprop,
    'adagrad': tf_keras.optimizers.legacy.Adagrad,
}
LEGACY_OPTIMIZERS_CLS.update(SHARED_OPTIMIZERS)

NEW_OPTIMIZERS_CLS = {
    'sgd': tf_keras.optimizers.experimental.SGD,
    'adam': tf_keras.optimizers.experimental.Adam,
    'rmsprop': tf_keras.optimizers.experimental.RMSprop,
    'adagrad': tf_keras.optimizers.experimental.Adagrad,
}
NEW_OPTIMIZERS_CLS.update(SHARED_OPTIMIZERS)

LR_CLS = {
    'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
    'polynomial': lr_schedule.PolynomialDecayWithOffset,
    'exponential': lr_schedule.ExponentialDecayWithOffset,
    'cosine': lr_schedule.CosineDecayWithOffset,
    'power': lr_schedule.DirectPowerDecay,
    'power_linear': lr_schedule.PowerAndLinearDecay,
    'power_with_offset': lr_schedule.PowerDecayWithOffset,
    'step_cosine_with_offset': lr_schedule.StepCosineDecayWithOffset,
}

WARMUP_CLS = {
    'linear': lr_schedule.LinearWarmup,
    'polynomial': lr_schedule.PolynomialWarmUp
}


def register_optimizer_cls(key: str,
                           optimizer_config_cls: Union[
                               tf_keras.optimizers.Optimizer,
                               tf_keras.optimizers.legacy.Optimizer,
                               tf_keras.optimizers.experimental.Optimizer
                           ],
                           use_legacy_optimizer: bool = True):
  """Registers a custom optimizer class.

  The user will still need to subclass the data classes in
  configs.optimization_config for the new optimizer to be usable with
  OptimizerFactory.

  Args:
    key: A string key that the optimizer_config_cls is registered with.
    optimizer_config_cls: A class that inherits tf_keras.optimizers.Optimizer.
    use_legacy_optimizer: A boolean that indicates whether to register into the
      legacy optimizer registry.
  """
  if use_legacy_optimizer:
    if key in LEGACY_OPTIMIZERS_CLS:
      raise ValueError('%s already registered in LEGACY_OPTIMIZERS_CLS.'
                       % key)
    LEGACY_OPTIMIZERS_CLS[key] = optimizer_config_cls
  else:
    if key in NEW_OPTIMIZERS_CLS:
      raise ValueError('%s already registered in NEW_OPTIMIZERS_CLS.' % key)
    NEW_OPTIMIZERS_CLS[key] = optimizer_config_cls
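

# Illustrative sketch only (not part of the library API): registering a custom
# optimizer class under a new key. `MyMomentumOptimizer` is a hypothetical
# subclass used purely for illustration; a matching config data class in
# configs.optimization_config is still required before OptimizerFactory can
# build it from an OptimizationConfig.
#
#   class MyMomentumOptimizer(tf_keras.optimizers.legacy.SGD):
#     """Hypothetical custom optimizer."""
#
#   register_optimizer_cls('my_momentum', MyMomentumOptimizer)
#   # 'my_momentum' now resolves to MyMomentumOptimizer via
#   # LEGACY_OPTIMIZERS_CLS when use_legacy_optimizer=True.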


class OptimizerFactory:
  """Optimizer factory class.

  This class builds a learning rate and an optimizer based on an optimization
  config. To use this class, you need to do the following:
  (1) Define an optimization config; this includes the optimizer and the
      learning rate schedule.
  (2) Initialize the class using the optimization config.
  (3) Build the learning rate.
  (4) Build the optimizer.

  This is a typical example of using this class:

  ```
  params = {
      'optimizer': {
          'type': 'sgd',
          'sgd': {'momentum': 0.9}
      },
      'learning_rate': {
          'type': 'stepwise',
          'stepwise': {'boundaries': [10000, 20000],
                       'values': [0.1, 0.01, 0.001]}
      },
      'warmup': {
          'type': 'linear',
          'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01}
      }
  }
  opt_config = OptimizationConfig(params)
  opt_factory = OptimizerFactory(opt_config)
  lr = opt_factory.build_learning_rate()
  optimizer = opt_factory.build_optimizer(lr)
  ```
  """

  def __init__(self, config: opt_cfg.OptimizationConfig):
    """Initializes OptimizerFactory.

    Args:
      config: OptimizationConfig instance containing the optimization config.
    """
    self._config = config
    self._optimizer_config = config.optimizer.get()
    self._optimizer_type = config.optimizer.type

    self._use_ema = config.ema is not None
    self._ema_config = config.ema

    if self._optimizer_config is None:
      raise ValueError('Optimizer type must be specified')

    self._lr_config = config.learning_rate.get()
    self._lr_type = config.learning_rate.type

    if self._lr_type is None:
      raise ValueError('Learning rate type must be specified')

    self._warmup_config = config.warmup.get()
    self._warmup_type = config.warmup.type

  def build_learning_rate(self):
    """Builds the learning rate.

    The learning rate schedule is built according to the learning rate config.
    If the learning rate type is constant, lr_config.learning_rate is returned.

    Returns:
      A tf_keras.optimizers.schedules.LearningRateSchedule instance. If the
      learning rate type is constant, lr_config.learning_rate is returned.
    """
    if self._lr_type == 'constant':
      lr = self._lr_config.learning_rate
    else:
      lr = LR_CLS[self._lr_type](**self._lr_config.as_dict())

    if self._warmup_config:
      lr = WARMUP_CLS[self._warmup_type](lr, **self._warmup_config.as_dict())

    return lr
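
  # Illustrative sketch only: for the stepwise + linear-warmup config shown in
  # the class docstring, build_learning_rate() is roughly equivalent to
  # composing the registered schedule classes directly:
  #
  #   lr = lr_schedule.PiecewiseConstantDecayWithOffset(
  #       boundaries=[10000, 20000], values=[0.1, 0.01, 0.001])
  #   lr = lr_schedule.LinearWarmup(
  #       lr, warmup_steps=500, warmup_learning_rate=0.01)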

  @gin.configurable
  def build_optimizer(
      self,
      lr: Union[tf_keras.optimizers.schedules.LearningRateSchedule, float],
      gradient_aggregator: Optional[Callable[
          [List[Tuple[tf.Tensor, tf.Tensor]]],
          List[Tuple[tf.Tensor, tf.Tensor]]]] = None,
      gradient_transformers: Optional[List[Callable[
          [List[Tuple[tf.Tensor, tf.Tensor]]],
          List[Tuple[tf.Tensor, tf.Tensor]]]]] = None,
      postprocessor: Optional[Callable[[tf_keras.optimizers.Optimizer],
                                       tf_keras.optimizers.Optimizer]] = None,
      use_legacy_optimizer: bool = True):
    """Builds the optimizer.

    Builds the optimizer according to the optimizer config, using the given
    learning rate. Typically, the learning rate built using
    self.build_learning_rate() is passed as an argument to this method.

    Args:
      lr: A floating point value, or a
        tf_keras.optimizers.schedules.LearningRateSchedule instance.
      gradient_aggregator: Optional function to override gradient aggregation.
      gradient_transformers: Optional list of functions used to transform
        gradients before applying updates to Variables. The functions are
        applied after gradient_aggregator. The functions should accept and
        return a list of (gradient, variable) tuples. clipvalue, clipnorm, and
        global_clipnorm should not be set when gradient_transformers is passed.
      postprocessor: An optional function for postprocessing the optimizer. It
        takes an optimizer and returns an optimizer.
      use_legacy_optimizer: A boolean that indicates whether to use the legacy
        optimizer classes.

    Returns:
      A `tf_keras.optimizers.legacy.Optimizer` or
      `tf_keras.optimizers.experimental.Optimizer` instance.
    """
    optimizer_dict = self._optimizer_config.as_dict()
    # Delete clipnorm, clipvalue and global_clipnorm if they are None.
    if optimizer_dict['clipnorm'] is None:
      del optimizer_dict['clipnorm']
    if optimizer_dict['clipvalue'] is None:
      del optimizer_dict['clipvalue']
    if optimizer_dict['global_clipnorm'] is None:
      del optimizer_dict['global_clipnorm']

    optimizer_dict['learning_rate'] = lr
    if gradient_aggregator is not None:
      optimizer_dict['gradient_aggregator'] = gradient_aggregator
    if gradient_transformers is not None:
      optimizer_dict['gradient_transformers'] = gradient_transformers

    if use_legacy_optimizer:
      optimizer = LEGACY_OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)
    else:
      if 'decay' in optimizer_dict:
        raise ValueError(
            '`decay` is deprecated in the new Keras optimizer; please reflect '
            'the decay logic in `lr` or set `use_legacy_optimizer=True` to '
            'use the legacy optimizer.')
      optimizer = NEW_OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)

    if self._use_ema:
      if not use_legacy_optimizer:
        raise ValueError(
            'EMA only works with the legacy optimizer; please set '
            '`use_legacy_optimizer=True`.')
      optimizer = ema_optimizer.ExponentialMovingAverage(
          optimizer, **self._ema_config.as_dict())
    if postprocessor:
      optimizer = postprocessor(optimizer)
    if isinstance(optimizer, tf_keras.optimizers.Optimizer):
      return optimizer
    # The following checks make sure this function won't break in older TF
    # versions where the experimental/legacy packages are missing.
    if hasattr(tf_keras.optimizers, 'experimental'):
      if isinstance(optimizer, tf_keras.optimizers.experimental.Optimizer):
        return optimizer
    if hasattr(tf_keras.optimizers, 'legacy'):
      if isinstance(optimizer, tf_keras.optimizers.legacy.Optimizer):
        return optimizer
    raise TypeError('OptimizerFactory.build_optimizer returned a '
                    'non-optimizer object: {}'.format(optimizer))
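

# Illustrative end-to-end sketch only (field names are assumed to follow the
# data classes in configs.optimization_config; 'adamw', 'polynomial', and
# 'linear' are assumed to be valid type keys, and clipnorm is optional):
#
#   params = {
#       'optimizer': {
#           'type': 'adamw',
#           'adamw': {'weight_decay_rate': 0.01, 'clipnorm': 1.0},
#       },
#       'learning_rate': {
#           'type': 'polynomial',
#           'polynomial': {'initial_learning_rate': 1e-3,
#                          'decay_steps': 10000},
#       },
#       'warmup': {
#           'type': 'linear',
#           'linear': {'warmup_steps': 1000},
#       },
#   }
#   opt_config = opt_cfg.OptimizationConfig(params)
#   opt_factory = OptimizerFactory(opt_config)
#   lr = opt_factory.build_learning_rate()
#   optimizer = opt_factory.build_optimizer(lr, use_legacy_optimizer=True)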