# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Learning rate schedule classes.""" | |
import math | |
from typing import Mapping, Any, Union, Optional | |
import tensorflow as tf, tf_keras | |


def _make_offset_wrapper(new_class_name: str, base_lr_class):
  """Generates an offset wrapper of a learning rate schedule.

  It returns a subclass of `base_lr_class` that takes an `offset` argument in
  the constructor. When the new class instance is called, the behavior is:
    new_class_object(step) = base_lr_class_object(step - offset)

  Example:
    CosineDecayWithOffset = _make_offset_wrapper(
        'CosineDecayWithOffset',
        tf_keras.optimizers.schedules.CosineDecay)
    # Use the lr:
    lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
                               decay_steps=1000)
    lr(101)  # equivalent to
             # tf_keras.optimizers.schedules.CosineDecay(...)(101 - 100)

  Args:
    new_class_name: the name of the new class.
    base_lr_class: the base learning rate schedule class. Should be a subclass
      of tf_keras.optimizers.schedules.LearningRateSchedule.

  Returns:
    A new class (subclass of the base_lr_class) that can take an offset.
  """
  assert issubclass(base_lr_class,
                    tf_keras.optimizers.schedules.LearningRateSchedule), (
                        "base_lr_class should be a subclass of keras "
                        f"LearningRateSchedule, got {base_lr_class}")

  # pylint: disable=protected-access,pointless-statement
  def offset_learning_rate_init(self, offset=0, **kwargs):
    """Construct learning rate schedule object.

    When this object is called, its behavior is
      self.__call__(step) == base_lr_class.__call__(step - offset)

    Args:
      self: this object.
      offset: The offset when computing the learning rate schedule.
      **kwargs: Pass through to base learning rate class constructor.
    """
    base_lr_class.__init__(self, **kwargs)
    self._offset = offset

  def offset_learning_rate_call(self, step):
    step = tf.cast(step - self._offset, tf.float32)
    return base_lr_class.__call__(self, step)

  # pylint: enable=protected-access,pointless-statement

  return type(
      new_class_name, (base_lr_class,), {
          "base_lr_class": base_lr_class,
          "__init__": offset_learning_rate_init,
          "__call__": offset_learning_rate_call
      })


PiecewiseConstantDecayWithOffset = _make_offset_wrapper(
    "PiecewiseConstantDecayWithOffset",
    tf_keras.optimizers.schedules.PiecewiseConstantDecay)
PolynomialDecayWithOffset = _make_offset_wrapper(
    "PolynomialDecayWithOffset", tf_keras.optimizers.schedules.PolynomialDecay)
ExponentialDecayWithOffset = _make_offset_wrapper(
    "ExponentialDecayWithOffset",
    tf_keras.optimizers.schedules.ExponentialDecay)
CosineDecayWithOffset = _make_offset_wrapper(
    "CosineDecayWithOffset",
    tf_keras.optimizers.schedules.CosineDecay,
)
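
# Illustrative usage sketch (an assumption, not part of the original module):
# each generated class accepts its base schedule's constructor arguments plus
# an `offset`; the step passed at call time is shifted by that offset before
# being forwarded to the base schedule. The numbers below are arbitrary.
#
#   lr = PiecewiseConstantDecayWithOffset(
#       offset=1000, boundaries=[5000, 10000], values=[0.1, 0.01, 0.001])
#   lr(1000)  # behaves like PiecewiseConstantDecay(...)(0)    -> 0.1
#   lr(7000)  # behaves like PiecewiseConstantDecay(...)(6000) -> 0.01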


class LinearWarmup(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Linear warmup schedule."""

  def __init__(self,
               after_warmup_lr_sched: Union[
                   tf_keras.optimizers.schedules.LearningRateSchedule, float],
               warmup_steps: int,
               warmup_learning_rate: float,
               name: Optional[str] = None):
    """Add a linear warmup schedule to a learning rate schedule.

    `warmup_learning_rate` is the initial learning rate of the warmup period,
    and the final learning rate of the warmup period is the value of
    `after_warmup_lr_sched` at `warmup_steps`. The learning rate at each step
    is linearly increased according to the following formula:

      learning_rate = warmup_lr + step / warmup_steps
                        * (final_warmup_lr - warmup_lr)

    During the first `warmup_steps` steps, this warmup value overrides the
    wrapped learning rate schedule.

    Args:
      after_warmup_lr_sched: tf_keras.optimizers.schedules.LearningRateSchedule
        or a constant.
      warmup_steps: Number of the warmup steps.
      warmup_learning_rate: Initial learning rate for the warmup.
      name: Optional, name of warmup schedule.
    """
    super().__init__()
    self._name = name
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._warmup_steps = warmup_steps
    self._init_warmup_lr = warmup_learning_rate
    if isinstance(after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      self._final_warmup_lr = after_warmup_lr_sched(warmup_steps)
    else:
      self._final_warmup_lr = tf.cast(after_warmup_lr_sched, dtype=tf.float32)

  def __call__(self, step: int):
    global_step = tf.cast(step, dtype=tf.float32)

    linear_warmup_lr = (
        self._init_warmup_lr + global_step / self._warmup_steps *
        (self._final_warmup_lr - self._init_warmup_lr))

    if isinstance(self._after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      after_warmup_lr = self._after_warmup_lr_sched(step)
    else:
      after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

    lr = tf.cond(global_step < self._warmup_steps,
                 lambda: linear_warmup_lr,
                 lambda: after_warmup_lr)
    return lr

  def get_config(self) -> Mapping[str, Any]:
    if isinstance(self._after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()}  # pytype: disable=attribute-error
    else:
      config = {"after_warmup_lr_sched": self._after_warmup_lr_sched}  # pytype: disable=attribute-error

    config.update({
        "warmup_steps": self._warmup_steps,
        "warmup_learning_rate": self._init_warmup_lr,
        "name": self._name
    })
    return config
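
# Illustrative sketch (an assumption, not part of the original module):
# wrapping a cosine schedule with a 1000-step linear warmup. The warmup ramps
# from 0.0 up to the wrapped schedule's value at step 1000, after which the
# wrapped schedule takes over. All numbers are arbitrary.
#
#   warmed_up_lr = LinearWarmup(
#       after_warmup_lr_sched=tf_keras.optimizers.schedules.CosineDecay(
#           initial_learning_rate=0.1, decay_steps=10000),
#       warmup_steps=1000,
#       warmup_learning_rate=0.0)
#   warmed_up_lr(500)   # halfway through the linear ramp
#   warmed_up_lr(5000)  # value of the cosine schedule at step 5000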


class PolynomialWarmUp(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Applies polynomial warmup schedule on a given learning rate decay schedule."""

  def __init__(self,
               after_warmup_lr_sched: Union[
                   tf_keras.optimizers.schedules.LearningRateSchedule, float],
               warmup_steps: int,
               power: float = 1.0,
               name: str = "PolynomialWarmup"):
    super().__init__()
    if isinstance(after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      self._initial_learning_rate = after_warmup_lr_sched(warmup_steps)
    else:
      self._initial_learning_rate = tf.cast(
          after_warmup_lr_sched, dtype=tf.float32)

    self._warmup_steps = warmup_steps
    self._power = power
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PolynomialWarmUp") as name:
      # Implements polynomial warmup, i.e., if global_step < warmup_steps, the
      # learning rate will be
      # `(global_step / warmup_steps)**power * init_lr`.
      global_step_float = tf.cast(step, tf.float32)
      warmup_steps_float = tf.cast(self._warmup_steps, tf.float32)

      if self._warmup_steps <= 0:
        warmup_percent_done = 1.0
      else:
        # A zero `step` may cause Inf. So make `step` positive.
        step_non_zero = tf.math.maximum(global_step_float, 1.0)
        warmup_percent_done = step_non_zero / warmup_steps_float

      warmup_learning_rate = (
          self._initial_learning_rate *
          tf.math.pow(warmup_percent_done, self._power))

      if isinstance(self._after_warmup_lr_sched,
                    tf_keras.optimizers.schedules.LearningRateSchedule):
        after_warmup_lr = self._after_warmup_lr_sched(step)
      else:
        after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

      return tf.cond(
          global_step_float < warmup_steps_float,
          lambda: warmup_learning_rate,
          lambda: after_warmup_lr,
          name=name)

  def get_config(self) -> Mapping[str, Any]:
    if isinstance(self._after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()}  # pytype: disable=attribute-error
    else:
      config = {"after_warmup_lr_sched": self._after_warmup_lr_sched}  # pytype: disable=attribute-error

    config.update({
        "warmup_steps": self._warmup_steps,
        "power": self._power,
        "name": self._name
    })
    return config
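
# Illustrative sketch (an assumption, not part of the original module): with
# `power=1.0` this reduces to a linear ramp up to the wrapped schedule's value
# at `warmup_steps`; other powers bend the ramp. Numbers are arbitrary.
#
#   poly_warmup_lr = PolynomialWarmUp(
#       after_warmup_lr_sched=0.1,  # a constant post-warmup learning rate
#       warmup_steps=1000,
#       power=2.0)
#   poly_warmup_lr(500)   # 0.1 * (500 / 1000)**2 = 0.025
#   poly_warmup_lr(2000)  # 0.1, the constant after-warmup value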


class DirectPowerDecay(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule follows lr * (step)^power."""

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               name: str = "DirectPowerDecay"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "DirectPowerDecay"):
      step = tf.cast(step, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      return learning_rate

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
        "name": self._name,
    }
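
# Illustrative sketch (an assumption, not part of the original module): with a
# negative power this gives an inverse-square-root style decay. Numbers are
# arbitrary.
#
#   direct_power_lr = DirectPowerDecay(initial_learning_rate=0.01, power=-0.5)
#   direct_power_lr(100)    # 0.01 * 100**-0.5   = 0.001
#   direct_power_lr(10000)  # 0.01 * 10000**-0.5 = 0.0001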


class PowerAndLinearDecay(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule with power decay, times a linear decay at the end.

  The schedule has the following behavior.
  Let offset_step = step - offset.
  1) offset_step < 0, the actual learning rate equals initial_learning_rate.
  2) 0 <= offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
     actual learning rate equals lr * offset_step^power.
  3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
     total_decay_steps, the actual learning rate equals lr * offset_step^power *
     (total_decay_steps - offset_step) / (total_decay_steps *
     linear_decay_fraction).
  4) offset_step >= total_decay_steps, the actual learning rate equals zero.
  """

  def __init__(self,
               initial_learning_rate: float,
               total_decay_steps: int,
               power: float = 1.0,
               linear_decay_fraction: float = 0.1,
               offset: int = 0,
               name: str = "PowerAndLinearDecay"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      total_decay_steps: The total number of steps for power + linear decay.
      power: The order of the polynomial.
      linear_decay_fraction: In the last `linear_decay_fraction` fraction of
        the total steps, the learning rate will be multiplied by a linear
        decay.
      offset: The offset applied to steps.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._total_decay_steps = total_decay_steps
    self._power = power
    self._linear_decay_fraction = linear_decay_fraction
    self._offset = offset
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PowerAndLinearDecay"):
      step = tf.cast(step - self._offset, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      if self._total_decay_steps * self._linear_decay_fraction > 0:
        learning_rate *= tf.minimum(
            1.0, (self._total_decay_steps - step) /
            (self._total_decay_steps * self._linear_decay_fraction))
        learning_rate = tf.maximum(0.0, learning_rate)
      return learning_rate

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "total_decay_steps": self._total_decay_steps,
        "power": self._power,
        "linear_decay_fraction": self._linear_decay_fraction,
        "offset": self._offset,
        "name": self._name,
    }
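
# Illustrative sketch (an assumption, not part of the original module): a
# power decay over 10000 steps whose last 10% of steps is additionally scaled
# by a linear ramp down to zero. Numbers are arbitrary.
#
#   power_linear_lr = PowerAndLinearDecay(
#       initial_learning_rate=0.01,
#       total_decay_steps=10000,
#       power=-0.5,
#       linear_decay_fraction=0.1)
#   power_linear_lr(100)    # 0.01 * 100**-0.5, linear factor still 1.0
#   power_linear_lr(9500)   # power term times (10000 - 9500) / 1000 = 0.5
#   power_linear_lr(10000)  # 0.0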


class PowerDecayWithOffset(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Power learning rate decay with offset.

  Learning rate equals `pre_offset_learning_rate` if `step` < `offset`.
  Otherwise, learning rate equals lr * (step - offset)^power.
  """

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               offset: int = 0,
               pre_offset_learning_rate: float = 1.0e6,
               name: str = "PowerDecayWithOffset"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      offset: The offset when computing the power decay.
      pre_offset_learning_rate: The maximum learning rate we'll use.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._offset = offset
    self._pre_offset_lr = pre_offset_learning_rate
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PowerDecayWithOffset"):
      step = tf.cast(step, tf.float32)
      lr_after_offset = tf.math.pow(
          tf.math.maximum(step - self._offset, 1.0), self._power) * (
              self._initial_learning_rate)

      sign = tf.cast(step > self._offset, tf.float32)
      lr_combined = (1.0 - sign) * self._pre_offset_lr + sign * lr_after_offset
      # Power may give infinitely large LR. So cap it with pre_offset_lr.
      return tf.math.minimum(lr_combined, self._pre_offset_lr)

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
        "offset": self._offset,
        "pre_offset_learning_rate": self._pre_offset_lr,
        "name": self._name,
    }
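
# Illustrative sketch (an assumption, not part of the original module): the
# learning rate is held at `pre_offset_learning_rate` for the first `offset`
# steps, then follows the capped power decay. Numbers are arbitrary.
#
#   offset_power_lr = PowerDecayWithOffset(
#       initial_learning_rate=1.0,
#       power=-0.5,
#       offset=1000,
#       pre_offset_learning_rate=0.1)
#   offset_power_lr(500)    # 0.1, still before the offset
#   offset_power_lr(1400)   # 1.0 * 400**-0.5   = 0.05
#   offset_power_lr(11000)  # 1.0 * 10000**-0.5 = 0.01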


class StepCosineDecayWithOffset(
    tf_keras.optimizers.schedules.LearningRateSchedule):
  """Stepwise cosine learning rate decay with offset.

  Learning rate is equivalent to one or more cosine decay(s) starting and
  ending at each interval.

  Example:

  ```python
  boundaries: [100000, 110000]
  values: [1.0, 0.5]
  lr_decayed_fn = (
      lr_schedule.StepCosineDecayWithOffset(
          boundaries,
          values))
  ```

  From step 0 to step 100000, it will cosine decay from 1.0 to 0.5; from step
  100000 to step 110000, it will cosine decay from 0.5 to 0.0.
  """

  def __init__(self,
               boundaries,
               values,
               offset: int = 0,
               name: str = "StepCosineDecayWithOffset"):
    """Initialize configuration of the learning rate schedule.

    Args:
      boundaries: A list of `Tensor`s or `int`s with strictly increasing
        entries, and with all elements having the same type as the optimizer
        step.
      values: A list of `Tensor`s or `float`s that specifies the starting
        learning rate for the intervals defined by `boundaries`. It should have
        the same number of elements as `boundaries`, and all elements should
        have the same type.
      offset: The offset when computing the learning rate schedule.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self.values = values
    self.boundaries = boundaries
    self.offset = offset
    self.name = name

    if len(self.values) < 1:
      raise ValueError(f"Expect non-empty values, got {self.values}")
    if len(self.boundaries) != len(self.values):
      raise ValueError(
          "Boundaries length must equal learning rate levels length: "
          f"{len(self.boundaries)} != {len(self.values)}")

    self.total_steps = (
        [boundaries[i + 1] - boundaries[i] for i in range(len(boundaries) - 1)]
        + [0])

  def __call__(self, global_step):
    with tf.name_scope(self.name or "StepCosineDecayWithOffset"):
      global_step = tf.cast(global_step - self.offset, tf.float32)
      lr_levels = self.values
      lr_steps = self.boundaries
      level_total_steps = self.total_steps
      num_levels = len(lr_levels)

      init_lr = lr_levels[0]
      next_init_lr = lr_levels[1] if num_levels > 1 else 0.
      init_total_steps = level_total_steps[0]

      cosine_learning_rate = ((init_lr - next_init_lr) * (tf.cos(
          tf.constant(math.pi) * (global_step) /
          (init_total_steps)) + 1.0) / 2.0 + next_init_lr)
      learning_rate = cosine_learning_rate

      for i in range(1, num_levels):
        next_init_lr = lr_levels[i]
        next_start_step = lr_steps[i]
        next_total_steps = level_total_steps[i]
        next_next_init_lr = lr_levels[i + 1] if num_levels > i + 1 else 0.

        next_cosine_learning_rate = ((next_init_lr - next_next_init_lr) *
                                     (tf.cos(
                                         tf.constant(math.pi) *
                                         (global_step - next_start_step) /
                                         (next_total_steps)) + 1.0) / 2.0 +
                                     next_next_init_lr)
        learning_rate = tf.where(global_step >= next_start_step,
                                 next_cosine_learning_rate, learning_rate)

      return learning_rate

  def get_config(self):
    return {
        "boundaries": self.boundaries,
        "values": self.values,
        "offset": self.offset,
        "name": self.name
    }
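
# Illustrative sketch (an assumption, not part of the original module): like
# any tf_keras LearningRateSchedule, the schedules above can be passed directly
# to a Keras optimizer. The numbers below are arbitrary.
#
#   step_cosine_lr = StepCosineDecayWithOffset(
#       boundaries=[100000, 110000], values=[1.0, 0.5])
#   optimizer = tf_keras.optimizers.SGD(learning_rate=step_cosine_lr)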