from collections import defaultdict
from enum import Enum
from typing import cast, Iterable, Tuple, Union

import torch
from captum._utils.common import _format_tensor_into_tuples, _register_backward_hook
from torch import Tensor
from torch.nn import Module


def _reset_sample_grads(module: Module):
    module.weight.sample_grad = 0  # type: ignore
    if module.bias is not None:
        module.bias.sample_grad = 0  # type: ignore


def linear_param_grads(
    module: Module, activation: Tensor, gradient_out: Tensor, reset: bool = False
) -> None:
    r"""
    Computes parameter gradients per sample for nn.Linear module, given module
    input activations and output gradients.

    Gradients are accumulated in the sample_grad attribute of each parameter
    (weight and bias). If reset = True, any current sample_grad values are reset,
    otherwise computed gradients are accumulated and added to the existing
    stored gradients.

    Inputs with more than 2 dimensions are only supported with torch 1.8 or later
    """
    if reset:
        _reset_sample_grads(module)

    module.weight.sample_grad += torch.einsum(  # type: ignore
        "n...i,n...j->nij", gradient_out, activation
    )
    if module.bias is not None:
        module.bias.sample_grad += torch.einsum(  # type: ignore
            "n...i->ni", gradient_out
        )


def conv2d_param_grads(
    module: Module, activation: Tensor, gradient_out: Tensor, reset: bool = False
) -> None:
    r"""
    Computes parameter gradients per sample for nn.Conv2d module, given module
    input activations and output gradients.

    nn.Conv2d modules with padding set to a string option ('same' or 'valid') are
    currently unsupported.

    Gradients are accumulated in the sample_grad attribute of each parameter
    (weight and bias). If reset = True, any current sample_grad values are reset,
    otherwise computed gradients are accumulated and added to the existing
    stored gradients.
    """
    if reset:
        _reset_sample_grads(module)

    batch_size = cast(int, activation.shape[0])
    unfolded_act = torch.nn.functional.unfold(
        activation,
        cast(Union[int, Tuple[int, ...]], module.kernel_size),
        dilation=cast(Union[int, Tuple[int, ...]], module.dilation),
        padding=cast(Union[int, Tuple[int, ...]], module.padding),
        stride=cast(Union[int, Tuple[int, ...]], module.stride),
    )
    reshaped_grad = gradient_out.reshape(batch_size, -1, unfolded_act.shape[-1])
    grad1 = torch.einsum("ijk,ilk->ijl", reshaped_grad, unfolded_act)
    shape = [batch_size] + list(cast(Iterable[int], module.weight.shape))

    module.weight.sample_grad += grad1.reshape(shape)  # type: ignore
    if module.bias is not None:
        module.bias.sample_grad += torch.sum(reshaped_grad, dim=2)  # type: ignore


SUPPORTED_MODULES = {
    torch.nn.Conv2d: conv2d_param_grads,
    torch.nn.Linear: linear_param_grads,
}


class LossMode(Enum):
    SUM = 0
    MEAN = 1


class SampleGradientWrapper:
    r"""
    Wrapper which allows computing sample-wise gradients in a single backward pass.

    This is accomplished by adding hooks to capture activations and output
    gradients for supported modules, and using these activations and gradients
    to compute the parameter gradients per-sample.

    Currently, only nn.Linear and nn.Conv2d modules are supported.
    Similar reference implementations of sample-based gradients include:
    - https://github.com/cybertronai/autograd-hacks
    - https://github.com/pytorch/opacus/tree/main/opacus/grad_sample
    """

    def __init__(self, model):
        self.model = model
        self.hooks_added = False
        self.activation_dict = defaultdict(list)
        self.gradient_dict = defaultdict(list)
        self.forward_hooks = []
        self.backward_hooks = []

    def add_hooks(self):
        self.hooks_added = True
        self.model.apply(self._register_module_hooks)

    def _register_module_hooks(self, module: torch.nn.Module):
        if isinstance(module, tuple(SUPPORTED_MODULES.keys())):
            self.forward_hooks.append(
                module.register_forward_hook(self._forward_hook_fn)
            )
            self.backward_hooks.append(
                _register_backward_hook(module, self._backward_hook_fn, None)
            )

    def _forward_hook_fn(
        self,
        module: Module,
        module_input: Union[Tensor, Tuple[Tensor, ...]],
        module_output: Union[Tensor, Tuple[Tensor, ...]],
    ):
        inp_tuple = _format_tensor_into_tuples(module_input)
        self.activation_dict[module].append(inp_tuple[0].clone().detach())

    def _backward_hook_fn(
        self,
        module: Module,
        grad_input: Union[Tensor, Tuple[Tensor, ...]],
        grad_output: Union[Tensor, Tuple[Tensor, ...]],
    ):
        grad_output_tuple = _format_tensor_into_tuples(grad_output)
        self.gradient_dict[module].append(grad_output_tuple[0].clone().detach())

    def remove_hooks(self):
        self.hooks_added = False

        for hook in self.forward_hooks:
            hook.remove()

        for hook in self.backward_hooks:
            hook.remove()

        self.forward_hooks = []
        self.backward_hooks = []

    def _reset(self):
        self.activation_dict = defaultdict(list)
        self.gradient_dict = defaultdict(list)

    def compute_param_sample_gradients(self, loss_blob, loss_mode="mean"):
        assert (
            loss_mode.upper() in LossMode.__members__
        ), f"Provided loss mode {loss_mode} is not valid"
        mode = LossMode[loss_mode.upper()]

        self.model.zero_grad()
        loss_blob.backward(gradient=torch.ones_like(loss_blob))

        for module in self.gradient_dict:
            sample_grad_fn = SUPPORTED_MODULES[type(module)]
            activations = self.activation_dict[module]
            gradients = self.gradient_dict[module]
            assert len(activations) == len(gradients), (
                "Number of saved activations does not match number of saved"
                " gradients. This may occur if multiple forward passes are run"
                " without calling reset or computing param gradients."
            )
            # Reverse the gradients: when a module is used multiple times in a
            # forward pass, backprop visits those uses in reverse order, so the
            # stored gradients are in reverse order relative to the stored
            # activations.
            for i, (act, grad) in enumerate(
                zip(activations, list(reversed(gradients)))
            ):
                mult = 1 if mode is LossMode.SUM else act.shape[0]
                sample_grad_fn(module, act, grad * mult, reset=(i == 0))

        self._reset()
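

# Minimal usage sketch (illustrative only, not part of the module API): it assumes a
# plain nn.Linear model and a loss reduced with reduction="sum", matched by
# loss_mode="sum". After compute_param_sample_gradients, each supported parameter
# carries a `sample_grad` attribute with one gradient per example in the batch.
if __name__ == "__main__":
    model = torch.nn.Linear(10, 2)
    wrapper = SampleGradientWrapper(model)
    wrapper.add_hooks()

    inputs = torch.randn(8, 10)
    targets = torch.randint(0, 2, (8,))
    loss = torch.nn.functional.cross_entropy(model(inputs), targets, reduction="sum")
    wrapper.compute_param_sample_gradients(loss, loss_mode="sum")

    # Each parameter's sample_grad has shape (batch_size, *param.shape).
    print(model.weight.sample_grad.shape)  # torch.Size([8, 2, 10])
    print(model.bias.sample_grad.shape)  # torch.Size([8, 2])

    wrapper.remove_hooks()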