not-found committed on
Commit db45d00
Parent: 97e0fd7

Add SVD-compressed model with rank 512

Files changed (6)
  1. config.json +79 -0
  2. configuration_bart.py +11 -0
  3. modeling_bart.py +45 -0
  4. modules.py +121 -0
  5. pytorch_model.bin +3 -0
  6. util.py +227 -0
config.json ADDED
@@ -0,0 +1,79 @@
+ {
+   "_name_or_path": "facebook/bart-base",
+   "activation_dropout": 0.1,
+   "activation_function": "gelu",
+   "add_bias_logits": false,
+   "add_final_layer_norm": false,
+   "architectures": [
+     "SVDCompressedBartForConditionGeneration"
+   ],
+   "attention_dropout": 0.1,
+   "auto_map": {
+     "AutoConfig": "configuration_bart.SVDCompressedBartConfig",
+     "AutoModelForSeq2SeqLM": "modeling_bart.SVDCompressedBartForConditionGeneration"
+   },
+   "bos_token_id": 0,
+   "classif_dropout": 0.1,
+   "classifier_dropout": 0.0,
+   "d_model": 768,
+   "decoder_attention_heads": 12,
+   "decoder_ffn_dim": 3072,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 2,
+   "dropout": 0.1,
+   "early_stopping": true,
+   "encoder_attention_heads": 12,
+   "encoder_ffn_dim": 3072,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "eos_token_id": 2,
+   "forced_eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "max_position_embeddings": 1024,
+   "model_type": "bart",
+   "no_repeat_ngram_size": 3,
+   "normalize_before": false,
+   "normalize_embedding": true,
+   "num_beams": 4,
+   "num_hidden_layers": 6,
+   "pad_token_id": 1,
+   "rank": 512,
+   "scale_embedding": false,
+   "task_specific_params": {
+     "summarization": {
+       "length_penalty": 1.0,
+       "max_length": 128,
+       "min_length": 12,
+       "num_beams": 4
+     },
+     "summarization_cnn": {
+       "length_penalty": 2.0,
+       "max_length": 142,
+       "min_length": 56,
+       "num_beams": 4
+     },
+     "summarization_xsum": {
+       "length_penalty": 1.0,
+       "max_length": 62,
+       "min_length": 11,
+       "num_beams": 6
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "use_cache": true,
+   "vocab_size": 50266
+ }
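The auto_map block above registers the custom config and model classes shipped in this repository, so the checkpoint loads through the standard Auto classes with trust_remote_code=True. A minimal loading sketch (the repository id below is a placeholder, not confirmed by this commit):

    from transformers import AutoConfig, AutoModelForSeq2SeqLM

    repo_id = 'not-found/bart-base-svd-512'  # hypothetical repo id
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, trust_remote_code=True)
    print(config.rank)  # 512, read back by SVDCompressedBartConfig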
configuration_bart.py ADDED
@@ -0,0 +1,11 @@
+ from transformers import BartConfig
+
+
+ class SVDCompressedBartConfig(BartConfig):
+
+     def __init__(self, *args, rank: int = 512, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.rank = rank
+
+
+ SVDCompressedBartConfig.register_for_auto_class()
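For reference, a small sketch of how the extra rank field behaves; it assumes only that configuration_bart.py is importable from the working directory:

    from configuration_bart import SVDCompressedBartConfig

    config = SVDCompressedBartConfig(rank=256)
    print(config.rank)               # 256
    print(config.to_dict()['rank'])  # serialized into config.json on save_pretrained()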
modeling_bart.py ADDED
@@ -0,0 +1,45 @@
+ """This module uses parts of rut5compressed. It shares the same module
+ structure as the model used in the neural network compression experiments
+ with rut5compressed.
+ """
+
+ from functools import partial
+ from typing import Optional
+
+ import torch as T
+ from transformers import BartForConditionalGeneration
+
+ from .configuration_bart import SVDCompressedBartConfig
+ from .modules import SVDCompressedLinear
+ from .util import compress_linear_svd, map_module
+
+
+ class SVDCompressedBartForConditionGeneration(BartForConditionalGeneration):
+     """Class SVDCompressedBartForConditionGeneration defines a BART-based model
+     whose linear layers are compressed with SVD.
+     """
+
+     LAYERS = r'/(de|en)coder/layers/\d+/fc[12]'
+
+     config_class = SVDCompressedBartConfig
+
+     def __init__(self, config: SVDCompressedBartConfig,
+                  rank: Optional[int] = None,
+                  compress: bool = False):
+         super().__init__(config)
+         self.rank = rank or config.rank
+
+         compress_fn = partial(compress_linear_svd, rank=self.rank)
+         if not compress:
+             compress_fn = self.convert
+         self.model = map_module(self.model, compress_fn, self.LAYERS)
+
+     def convert(self, module: T.nn.Module, path: str) -> T.nn.Module:
+         if not isinstance(module, T.nn.Linear):
+             return module
+         return SVDCompressedLinear.from_random(module.in_features,
+                                                module.out_features, self.rank)
+
+
+ SVDCompressedBartForConditionGeneration \
+     .register_for_auto_class('AutoModelForSeq2SeqLM')
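In the constructor above, compress=True factorizes the existing fc1/fc2 weights via SVD, while the default compress=False swaps in randomly initialized factors of the requested rank, which is the path taken when the factorized weights are then loaded from pytorch_model.bin. A hedged sketch of how this commit's pieces could be combined to produce such a checkpoint from the dense facebook/bart-base weights (the package name bart_svd and the output directory are placeholders; the repo files must be importable as a package because util.py uses relative imports):

    from functools import partial

    from transformers import BartForConditionalGeneration

    from bart_svd.util import compress_linear_svd, map_module  # hypothetical package name

    dense = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
    # Factorize every encoder/decoder feed-forward projection at rank 512.
    pattern = r'/(de|en)coder/layers/\d+/fc[12]'
    dense.model = map_module(dense.model, partial(compress_linear_svd, rank=512), pattern)
    dense.save_pretrained('bart-base-svd-512')  # hypothetical output directory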
modules.py ADDED
@@ -0,0 +1,121 @@
+ # Copied from
+ # rut5compressed/nn/functional.py
+ # rut5compressed/nn/modules.py
+ # modules of original repository.
+
+ from typing import Optional, Tuple
+
+ import torch as T
+
+
+ class SVDCompressedLinearFunc(T.autograd.Function):
+
+     @staticmethod
+     def forward(ctx, input: T.Tensor, lhs: T.Tensor,
+                 rhs: T.Tensor, bias: Optional[T.Tensor] = None) -> T.Tensor:
+         # See PEP-0465 on matmul operator associativity.
+         # https://peps.python.org/pep-0465/#precedence-and-associativity
+         output = (input @ lhs) @ rhs
+         if bias is not None:
+             output += bias[None, :]
+         ctx.bias = bias is not None
+         ctx.save_for_backward(input, lhs, rhs)
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_output: T.Tensor):
+         input, lhs, rhs = ctx.saved_tensors
+
+         # Flatten input and output gradients over the leading dimensions.
+         inp_size = lhs.shape[0]
+         out_size = rhs.shape[1]
+         input_shape = input.shape
+         input = input.reshape(-1, inp_size)
+         grad_output = grad_output.reshape(-1, out_size)
+
+         input_grad = None
+         if ctx.needs_input_grad[0]:
+             input_grad = (grad_output @ rhs.T) @ lhs.T
+
+         lhs_grad = None
+         if ctx.needs_input_grad[1]:
+             # In practice, for large models the embedding dimension is larger
+             # than the batch size.
+             lhs_grad = input.T @ (grad_output @ rhs.T)
+
+         rhs_grad = None
+         if ctx.needs_input_grad[2]:
+             # Again, the batch size is usually smaller than the embedding dimension.
+             rhs_grad = (input @ lhs).T @ grad_output
+
+         bias_grad = None
+         if ctx.needs_input_grad[3]:
+             bias_grad = grad_output.sum(axis=0)
+
+         # Restore the shape of the input gradient (if it was computed).
+         input_grad = input_grad.reshape(input_shape) if input_grad is not None else None
+         return input_grad, lhs_grad, rhs_grad, bias_grad
+
+
+ compressed_linear_svd = SVDCompressedLinearFunc.apply
+
+
+ class SVDCompressedLinear(T.nn.Module):
+     """Class SVDCompressedLinear is a layer which represents the weight matrix
+     of a linear layer in factorized form.
+
+     >>> linear_layer = T.nn.Linear(10, 20)
+     >>> svd_layer = SVDCompressedLinear.from_linear(linear_layer, rank=5)
+     """
+
+     def __init__(self, factors: Tuple[T.Tensor, T.Tensor, T.Tensor],
+                  bias: Optional[T.Tensor] = None):
+         super().__init__()
+
+         # We do not want to track singular values, so mix them into the left
+         # and right factors.
+         scale = T.sqrt(factors[1])
+
+         # Store factors of W^T (forward computes input @ W^T) built from the SVD of W.
+         self.lhs = T.nn.Parameter(factors[2].T * scale[None, :])
+         self.rhs = T.nn.Parameter(factors[0].T * scale[:, None])
+
+         self.bias = None
+         if bias is not None:
+             self.bias = T.nn.Parameter(bias)
+
+         self.in_features = self.lhs.shape[0]
+         self.out_features = self.rhs.shape[1]
+
+     @classmethod
+     def from_linear(cls, linear: T.nn.Linear, rank: Optional[int] = None,
+                     tol: float = 1e-6):
+         with T.no_grad():
+             data = linear.weight.data
+             lhs, vals, rhs = T.linalg.svd(data)
+             if rank is None:
+                 raise NotImplementedError
+             else:
+                 lhs = lhs[:, :rank]
+                 rhs = rhs[:rank, :]
+                 vals = vals[:rank]
+
+         bias = None
+         if linear.bias is not None:
+             bias = T.clone(linear.bias.data)
+
+         return SVDCompressedLinear((lhs, vals, rhs), bias)
+
+     @classmethod
+     def from_random(cls, in_features: int, out_features: int, rank: int,
+                     bias: bool = True):
+         lvecs = T.randn((out_features, rank))
+         svals = T.ones(rank)
+         rvecs = T.randn((rank, in_features))
+         bias_term = None
+         if bias:
+             bias_term = T.randn(out_features)
+         return SVDCompressedLinear((lvecs, svals, rvecs), bias_term)
+
+     def forward(self, input: T.Tensor) -> T.Tensor:
+         return compressed_linear_svd(input, self.lhs, self.rhs, self.bias)
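A quick sanity check of the factorized layer (a sketch, not part of the commit; modules.py has no relative imports, so it can be imported directly from the repo directory): at full rank the truncated SVD reconstructs the dense weight exactly up to floating-point error, while a lower rank only approximates it.

    import torch as T

    from modules import SVDCompressedLinear

    dense = T.nn.Linear(16, 32)
    svd_full = SVDCompressedLinear.from_linear(dense, rank=16)  # full rank
    svd_low = SVDCompressedLinear.from_linear(dense, rank=4)    # truncated

    x = T.randn(8, 16)
    print(T.allclose(dense(x), svd_full(x), atol=1e-5))  # True up to numerical error
    print((dense(x) - svd_low(x)).norm())                # nonzero approximation error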
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:327dc37b94d577ccfd06fec73f9e2fd6c4ab1ed5b37b1c3917962dc81d3d84bd
+ size 520233469
util.py ADDED
@@ -0,0 +1,227 @@
+ # Copied from rut5compressed/util.py of rut5compressed repository.
+
+ import logging
+ import re
+ from functools import wraps
+ from re import Pattern
+ from typing import Callable, Dict, Optional, Tuple
+
+ import numpy as np
+ import torch as T
+
+ from .modules import SVDCompressedLinear
+
+
+ def map_module(root: T.nn.Module,
+                func: Callable[[T.nn.Module, str], T.nn.Module],
+                patt: Optional[str] = None) -> T.nn.Module:
+     """Function ``map_module`` applies a function to each node of the module
+     tree whose path matches a specified pattern.
+
+     Parameters
+     ----------
+     root : torch.nn.Module
+         Module to modify.
+     func : callable
+         Function to be applied to every module in the tree (or only to those
+         matching the pattern).
+     patt : str, optional
+         Pattern to filter modules by their path in the module tree.
+
+     Returns
+     -------
+     torch.nn.Module
+         Module modified in place.
+     """
+     @wraps(func)
+     def func_safe(*args, **kwargs):
+         node = func(*args, **kwargs)
+         if not isinstance(node, T.nn.Module):
+             raise ValueError('Mapped result must be of torch.nn.Module type '
+                              f'but given {type(node)}.')
+         return node
+
+     return _map_module(root, func_safe, re.compile(patt or r'.*'), '')
+
+
+ def _map_module(root: T.nn.Module,
+                 func: Callable[[T.nn.Module, str], T.nn.Module], patt: Pattern,
+                 path: str) -> T.nn.Module:
+     for name, child in root.named_children():
+         node = _map_module(child, func, patt, f'{path}/{name}')
+         if node != child:
+             setattr(root, name, node)
+     if patt.match(path or '/'):
+         root = func(root, path or '/')
+     return root
+
+
+ def convert_linear(module: T.nn.Linear, ctor, **kwargs) -> T.nn.Module:
+     """Function convert_linear takes a module and returns a linear module with
+     an approximate matmul. Non-linear modules are returned intact.
+     """
+     if not isinstance(module, T.nn.Linear):
+         return module
+     raise NotImplementedError
+
+
+ def numel(module: T.nn.Module):
+     value = sum(x.numel() for x in module.parameters()) + \
+         sum(x.numel() for x in module.buffers())
+
+     def account_prunned(module: T.nn.Module, path: str):
+         nonlocal value
+         for name, attr in vars(module).items():
+             if not name.endswith('_mask') or not isinstance(attr, T.Tensor):
+                 continue
+
+             weight_name = name[:-5]
+             if not hasattr(module, weight_name):
+                 continue
+
+             weight = getattr(module, weight_name)
+             value -= weight.numel() - attr.sum()
+             value += attr.numel()
+         return module
+
+     def account_quantized(module: T.nn.Module, path: str):
+         nonlocal value
+         if isinstance(module, T.nn.quantized.Linear):
+             value += module.weight().numel()
+             if module.bias() is not None:
+                 value += module.bias().numel()
+         return module
+
+     def account_rest(module: T.nn.Module, path: str):
+         account_prunned(module, path)
+         account_quantized(module, path)
+         return module
+
+     map_module(module, account_rest)
+     return value
+
+
+ def sizeof(module: T.nn.Module):
+     value = sum(x.numel() * x.element_size() for x in module.parameters()) + \
+         sum(x.numel() * x.element_size() for x in module.buffers())
+
+     def account_prunned(module: T.nn.Module, path: str):
+         nonlocal value
+         for name, attr in vars(module).items():
+             if not name.endswith('_mask') or not isinstance(attr, T.Tensor):
+                 continue
+
+             weight_name = name[:-5]
+             if not hasattr(module, weight_name):
+                 continue
+
+             weight = getattr(module, weight_name)
+             value -= (weight.numel() - attr.sum()) * weight.element_size()
+             value += attr.numel() * attr.element_size()
+         return module
+
+     def account_quantized(module: T.nn.Module, path: str):
+         nonlocal value
+         if isinstance(module, T.nn.quantized.Linear):
+             value += module.weight().numel() * module.weight().element_size()
+             if (bias := module.bias()) is not None:
+                 value += bias.numel() * bias.element_size()
+         return module
+
+     def account_rest(module: T.nn.Module, path: str):
+         account_prunned(module, path)
+         account_quantized(module, path)
+         return module
+
+     map_module(module, account_rest)
+     return value
+
+
+ def flatten_module(module: T.nn.Module, regexp=None) -> Dict[str, T.nn.Module]:
+     modules = {}
+     map_module(module, lambda x, y: modules.update(**{y: x}) or x, regexp)
+     return modules
+
+
+ def print_flatten(module: T.nn.Module):
+     paths = []
+     path_len = 0
+     names = []
+     name_len = 0
+     indx_len = 0
+
+     def func(module, path):
+         nonlocal path_len, name_len, indx_len
+         paths.append(path)
+         path_len = max(path_len, len(path))
+         name = module.__class__.__name__
+         names.append(name)
+         name_len = max(name_len, len(name))
+         indx_len += 1
+         return module
+
+     map_module(module, func)
+
+     indx_len = int(np.ceil(np.log10(indx_len)))
+     fmt = f'{{indx:>{indx_len}s}} {{path:{path_len}s}} {{name:{name_len}s}}'
+     print(fmt.format(indx='#', path='Path', name='Layer'))
+     print('-' * (indx_len + path_len + name_len + 2))
+     for i, (path, name) in enumerate(zip(paths, names)):
+         print(fmt.format(indx=str(i), path=path, name=name))
+
+
+ def compress_linear_svd(module: T.nn.Module, path: str,
+                         rank: Optional[int] = None) -> T.nn.Module:
+     if not isinstance(module, T.nn.Linear):
+         return module
+
+     # Do not factorize if the rank equals the size of the
+     # smallest dimension.
+     norows, nocols = module.weight.shape
+     if rank == min(norows, nocols):
+         return module
+
+     # If no rank is given, choose the rank at which the number of
+     # elements in the original matrix is approximately equal to the number
+     # of elements in the SVD factors.
+     if rank is None:
+         ratio = norows * nocols / (norows + nocols)
+         rank = int(np.floor(ratio))
+
+     return SVDCompressedLinear.from_linear(module, rank)
+
+
+ def compress_linear_tt(module: T.nn.Module, path: str,
+                        shape: Tuple[Tuple[int], Tuple[int]],
+                        rank: int) -> T.nn.Module:
+     if not isinstance(module, T.nn.Linear):
+         return module
+
+     # TODO(@not-found): We need a proper compression config.
+     inp_size = np.prod(shape[0])
+     out_size = np.prod(shape[1])
+     if inp_size == module.in_features and out_size == module.out_features:
+         pass
+     elif inp_size == module.out_features and out_size == module.in_features:
+         shape = (shape[1], shape[0])
+     else:
+         raise ValueError(
+             'Input and output features do not match the compression shape: '
+             f'{shape[0]} vs {module.in_features} and {shape[1]} vs '
+             f'{module.out_features}.')
+
+     logging.info('apply tt compression to layer %s', path)
+     return TTCompressedLinear.from_linear(module, shape, rank)  # noqa: F821
+
+
+ def compress(module: T.nn.Module, rank: int) -> T.nn.Module:
+     """Function compress substitutes, in place, the linear layers of a T5
+     model with linear layers whose weight matrices are factorized with SVD.
+
+     :param module: Model to compress.
+     :param rank: Desired rank of compressed layer.
+     """
+     return map_module(
+         root=module,
+         func=lambda x, y: compress_linear_svd(x, y, rank),
+         patt=r'.*/DenseReluDense/w.*')  # TODO(@not-found): Remove?
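The helpers above are what modeling_bart.py relies on: _map_module builds '/'-separated paths from named_children, which is what the LAYERS pattern matches against, and numel/sizeof measure the effect of a compression pass. A small sketch of the path scheme (the toy module names are illustrative; the package name bart_svd is a placeholder, needed because this file uses a relative import):

    import torch as T

    from bart_svd.util import flatten_module, numel  # hypothetical package name

    class Block(T.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = T.nn.Linear(8, 16)
            self.fc2 = T.nn.Linear(16, 8)

    class Toy(T.nn.Module):
        def __init__(self):
            super().__init__()
            self.encoder = T.nn.Module()
            self.encoder.layers = T.nn.ModuleList([Block(), Block()])

    toy = Toy()
    # Paths mirror the module tree: /encoder/layers/0/fc1, /encoder/layers/1/fc2, ...
    print(sorted(flatten_module(toy, r'/encoder/layers/\d+/fc[12]')))
    print(numel(toy))  # parameter and buffer count before any compression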