# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from enum import Enum
from typing import Any, Callable, Dict, Iterable, List, Set, Type, Union

import torch

from detectron2.config import CfgNode

from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR

_GradientClipperInput = Union[torch.Tensor, Iterable[torch.Tensor]]
_GradientClipper = Callable[[_GradientClipperInput], None]


class GradientClipType(Enum):
    VALUE = "value"
    NORM = "norm"


def _create_gradient_clipper(cfg: CfgNode) -> _GradientClipper:
    """
    Creates gradient clipping closure to clip by value or by norm,
    according to the provided config.
    """
    cfg = cfg.clone()

    def clip_grad_norm(p: _GradientClipperInput):
        torch.nn.utils.clip_grad_norm_(p, cfg.CLIP_VALUE, cfg.NORM_TYPE)

    def clip_grad_value(p: _GradientClipperInput):
        torch.nn.utils.clip_grad_value_(p, cfg.CLIP_VALUE)

    _GRADIENT_CLIP_TYPE_TO_CLIPPER = {
        GradientClipType.VALUE: clip_grad_value,
        GradientClipType.NORM: clip_grad_norm,
    }
    return _GRADIENT_CLIP_TYPE_TO_CLIPPER[GradientClipType(cfg.CLIP_TYPE)]
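# Illustrative sketch (not part of the original module), assuming a CfgNode that
# carries CLIP_TYPE, CLIP_VALUE and NORM_TYPE as cfg.SOLVER.CLIP_GRADIENTS does:
# clipping by "value" caps each gradient element at +/- CLIP_VALUE, while clipping
# by "norm" rescales a gradient so its NORM_TYPE-norm does not exceed CLIP_VALUE.
#
#   clipper = _create_gradient_clipper(cfg.SOLVER.CLIP_GRADIENTS)
#   for p in model.parameters():          # `model` is any torch.nn.Module
#       if p.grad is not None:
#           clipper(p)                    # modifies p.grad in place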
def _generate_optimizer_class_with_gradient_clipping(
    optimizer_type: Type[torch.optim.Optimizer], gradient_clipper: _GradientClipper
) -> Type[torch.optim.Optimizer]:
    """
    Dynamically creates a new optimizer class that inherits from the given
    optimizer type and overrides the `step` method to add gradient clipping.
    """

    def optimizer_wgc_step(self, closure=None):
        # Clip the gradient of every parameter before the regular update.
        for group in self.param_groups:
            for p in group["params"]:
                gradient_clipper(p)
        super(type(self), self).step(closure)

    OptimizerWithGradientClip = type(
        optimizer_type.__name__ + "WithGradientClip",
        (optimizer_type,),
        {"step": optimizer_wgc_step},
    )
    return OptimizerWithGradientClip
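# Illustrative sketch (assumption, not from the original file): the generated
# subclass behaves exactly like the base optimizer except that `step` clips every
# parameter's gradient first. `clipper` is any _GradientClipper, e.g. the closure
# returned by _create_gradient_clipper above.
#
#   SGDWithGradientClip = _generate_optimizer_class_with_gradient_clipping(
#       torch.optim.SGD, clipper
#   )
#   assert SGDWithGradientClip.__name__ == "SGDWithGradientClip"
#   assert issubclass(SGDWithGradientClip, torch.optim.SGD)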
def maybe_add_gradient_clipping(
    cfg: CfgNode, optimizer: torch.optim.Optimizer
) -> torch.optim.Optimizer:
    """
    If gradient clipping is enabled through config options, wraps the existing
    optimizer instance of some type OptimizerType to become an instance
    of the new dynamically created class OptimizerTypeWithGradientClip
    that inherits OptimizerType and overrides the `step` method to
    include gradient clipping.

    Args:
        cfg: CfgNode
            configuration options
        optimizer: torch.optim.Optimizer
            existing optimizer instance

    Return:
        optimizer: torch.optim.Optimizer
            either the unmodified optimizer instance (if gradient clipping is
            disabled), or the same instance with adjusted __class__ to override
            the `step` method and include gradient clipping
    """
    if not cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
        return optimizer
    grad_clipper = _create_gradient_clipper(cfg.SOLVER.CLIP_GRADIENTS)
    OptimizerWithGradientClip = _generate_optimizer_class_with_gradient_clipping(
        type(optimizer), grad_clipper
    )
    optimizer.__class__ = OptimizerWithGradientClip
    return optimizer
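# Illustrative sketch (assumption): because the wrapper only swaps __class__, the
# optimizer keeps its state, param_groups and hyperparameters; only `step` gains
# the clipping behaviour, and nothing changes when
# cfg.SOLVER.CLIP_GRADIENTS.ENABLED is False.
#
#   opt = torch.optim.SGD(model.parameters(), lr=0.01)
#   opt = maybe_add_gradient_clipping(cfg, opt)
#   loss = model(inputs).sum()      # hypothetical forward pass
#   loss.backward()
#   opt.step()                      # gradients are clipped here if enabled
#   opt.zero_grad()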
def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
    """
    Build an optimizer from config.
    """
    norm_module_types = (
        torch.nn.BatchNorm1d,
        torch.nn.BatchNorm2d,
        torch.nn.BatchNorm3d,
        torch.nn.SyncBatchNorm,
        # NaiveSyncBatchNorm inherits from BatchNorm2d
        torch.nn.GroupNorm,
        torch.nn.InstanceNorm1d,
        torch.nn.InstanceNorm2d,
        torch.nn.InstanceNorm3d,
        torch.nn.LayerNorm,
        torch.nn.LocalResponseNorm,
    )
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for module in model.modules():
        for key, value in module.named_parameters(recurse=False):
            if not value.requires_grad:
                continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)
            lr = cfg.SOLVER.BASE_LR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY
            if isinstance(module, norm_module_types):
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
            elif key == "bias":
                # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
                # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
                # hyperparameters are by default exactly the same as for regular
                # weights.
                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

    optimizer = torch.optim.SGD(
        params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV
    )
    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
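# Illustrative sketch (assumption): typical usage with a default detectron2
# config. Every parameter gets its own group, so norm-layer weights and biases
# can receive different weight decay / learning rate via WEIGHT_DECAY_NORM,
# BIAS_LR_FACTOR and WEIGHT_DECAY_BIAS.
#
#   from detectron2.config import get_cfg
#   cfg = get_cfg()
#   optimizer = build_optimizer(cfg, model)   # `model` is any torch.nn.Module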
def build_lr_scheduler(
    cfg: CfgNode, optimizer: torch.optim.Optimizer
) -> torch.optim.lr_scheduler._LRScheduler:
    """
    Build an LR scheduler from config.
    """
    name = cfg.SOLVER.LR_SCHEDULER_NAME
    if name == "WarmupMultiStepLR":
        return WarmupMultiStepLR(
            optimizer,
            cfg.SOLVER.STEPS,
            cfg.SOLVER.GAMMA,
            warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
            warmup_iters=cfg.SOLVER.WARMUP_ITERS,
            warmup_method=cfg.SOLVER.WARMUP_METHOD,
        )
    elif name == "WarmupCosineLR":
        return WarmupCosineLR(
            optimizer,
            cfg.SOLVER.MAX_ITER,
            warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
            warmup_iters=cfg.SOLVER.WARMUP_ITERS,
            warmup_method=cfg.SOLVER.WARMUP_METHOD,
        )
    else:
        raise ValueError("Unknown LR scheduler: {}".format(name))
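# Illustrative sketch (assumption): a minimal iteration-based loop wiring the
# scheduler to the optimizer built above; detectron2's training is
# iteration-based, so the scheduler is stepped once per iteration rather than
# once per epoch.
#
#   scheduler = build_lr_scheduler(cfg, optimizer)
#   for iteration in range(cfg.SOLVER.MAX_ITER):
#       ...                      # forward / backward
#       optimizer.step()
#       scheduler.step()
#       optimizer.zero_grad()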