"""
reference:
https://github.com/ultralytics/yolov5/blob/master/utils/torch_utils.py#L404
by lyuwenyu
"""
import math
from copy import deepcopy

import torch
import torch.nn as nn

from src.core import register
import src.misc.dist as dist

__all__ = ['ModelEMA']
@register
class ModelEMA(object):
    """Model Exponential Moving Average, from https://github.com/rwightman/pytorch-image-models

    Keeps a moving average of everything in the model state_dict (parameters and buffers).
    This is intended to allow functionality like
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
    A smoothed version of the weights is necessary for some training schemes to perform well.
    This class is sensitive to where it is initialized in the sequence of model init,
    GPU assignment and distributed training wrappers.
    """
    def __init__(self, model: nn.Module, decay: float = 0.9999, warmups: int = 2000):
        super().__init__()

        # Create EMA
        self.module = deepcopy(dist.de_parallel(model)).eval()  # FP32 EMA
        # if next(model.parameters()).device.type != 'cpu':
        #     self.module.half()  # FP16 EMA

        self.decay = decay
        self.warmups = warmups
        self.updates = 0  # number of EMA updates
        # self.filter_no_grad = filter_no_grad
        self.decay_fn = lambda x: decay * (1 - math.exp(-x / warmups))  # decay exponential ramp (to help early epochs)
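        # For reference (values computed from the formula above, assuming the default
        # decay=0.9999 and warmups=2000): the effective decay is roughly 0.632 after
        # 2,000 updates and roughly 0.982 after 8,000, ramping towards 0.9999 over training.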
        for p in self.module.parameters():
            p.requires_grad_(False)
    def update(self, model: nn.Module):
        # Update EMA parameters in place: ema = d * ema + (1 - d) * online
        with torch.no_grad():
            self.updates += 1
            d = self.decay_fn(self.updates)

            msd = dist.de_parallel(model).state_dict()
            for k, v in self.module.state_dict().items():
                if v.dtype.is_floating_point:
                    v *= d
                    v += (1 - d) * msd[k].detach()
    def to(self, *args, **kwargs):
        self.module = self.module.to(*args, **kwargs)
        return self
    def update_attr(self, model, include=(), exclude=('process_group', 'reducer')):
        # Update EMA attributes
        self.copy_attr(self.module, model, include, exclude)

    @staticmethod
    def copy_attr(a, b, include=(), exclude=()):
        # Copy attributes from b to a, with options to only include [...] and to exclude [...]
        for k, v in b.__dict__.items():
            if (len(include) and k not in include) or k.startswith('_') or k in exclude:
                continue
            else:
                setattr(a, k, v)
    def state_dict(self):
        return dict(module=self.module.state_dict(), updates=self.updates, warmups=self.warmups)

    def load_state_dict(self, state):
        self.module.load_state_dict(state['module'])
        if 'updates' in state:
            self.updates = state['updates']
    def forward(self):
        raise RuntimeError('ModelEMA is not meant to be called directly; use `.module` for inference.')

    def extra_repr(self) -> str:
        return f'decay={self.decay}, warmups={self.warmups}'
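# Hedged usage sketch (not part of the original file): how ModelEMA is typically driven
# from a training loop. `criterion`, `optimizer` and `data_loader` are hypothetical
# placeholders for the caller's own objects; only ModelEMA.update / .module from above is used.
def _example_ema_training_loop(model: nn.Module, criterion, optimizer, data_loader):
    ema = ModelEMA(model, decay=0.9999, warmups=2000)
    model.train()
    for inputs, targets in data_loader:
        loss = criterion(model(inputs), targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        ema.update(model)  # refresh the averaged copy after every optimizer step
    # The smoothed weights live in `ema.module`; these are usually the ones evaluated and saved.
    return ema.module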
class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel):
    """Maintains moving averages of model parameters using an exponential decay.

    ``ema_avg = decay * avg_model_param + (1 - decay) * model_param``

    `torch.optim.swa_utils.AveragedModel <https://pytorch.org/docs/stable/optim.html#custom-averaging-strategies>`_
    is used to compute the EMA.
    """
    def __init__(self, model, decay, device="cpu", use_buffers=True):
        self.decay_fn = lambda x: decay * (1 - math.exp(-x / 2000))  # note: warmup length is hard-coded to 2000 updates here

        def ema_avg(avg_model_param, model_param, num_averaged):
            decay = self.decay_fn(num_averaged)
            return decay * avg_model_param + (1 - decay) * model_param

        super().__init__(model, device, ema_avg, use_buffers=use_buffers)
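# Hedged usage sketch (not part of the original file): the AveragedModel-based variant is
# refreshed with `update_parameters`, which is part of the torch.optim.swa_utils API.
# `criterion`, `optimizer` and `data_loader` are hypothetical placeholders.
def _example_swa_ema_loop(model: nn.Module, criterion, optimizer, data_loader, decay: float = 0.9999):
    ema_model = ExponentialMovingAverage(model, decay=decay, device='cpu')
    for inputs, targets in data_loader:
        loss = criterion(model(inputs), targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        ema_model.update_parameters(model)  # applies ema_avg to every averaged parameter
    return ema_model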