not-found committed on
Commit db45d00
Parent: 97e0fd7

Add SVD-compressed model with rank 512

Files changed (6)
  1. config.json +79 -0
  2. configuration_bart.py +11 -0
  3. modeling_bart.py +45 -0
  4. modules.py +121 -0
  5. pytorch_model.bin +3 -0
  6. util.py +227 -0
config.json ADDED
@@ -0,0 +1,79 @@
+ {
+   "_name_or_path": "facebook/bart-base",
+   "activation_dropout": 0.1,
+   "activation_function": "gelu",
+   "add_bias_logits": false,
+   "add_final_layer_norm": false,
+   "architectures": [
+     "SVDCompressedBartForConditionGeneration"
+   ],
+   "attention_dropout": 0.1,
+   "auto_map": {
+     "AutoConfig": "configuration_bart.SVDCompressedBartConfig",
+     "AutoModelForSeq2SeqLM": "modeling_bart.SVDCompressedBartForConditionGeneration"
+   },
+   "bos_token_id": 0,
+   "classif_dropout": 0.1,
+   "classifier_dropout": 0.0,
+   "d_model": 768,
+   "decoder_attention_heads": 12,
+   "decoder_ffn_dim": 3072,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 2,
+   "dropout": 0.1,
+   "early_stopping": true,
+   "encoder_attention_heads": 12,
+   "encoder_ffn_dim": 3072,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "eos_token_id": 2,
+   "forced_eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "max_position_embeddings": 1024,
+   "model_type": "bart",
+   "no_repeat_ngram_size": 3,
+   "normalize_before": false,
+   "normalize_embedding": true,
+   "num_beams": 4,
+   "num_hidden_layers": 6,
+   "pad_token_id": 1,
+   "rank": 512,
+   "scale_embedding": false,
+   "task_specific_params": {
+     "summarization": {
+       "length_penalty": 1.0,
+       "max_length": 128,
+       "min_length": 12,
+       "num_beams": 4
+     },
+     "summarization_cnn": {
+       "length_penalty": 2.0,
+       "max_length": 142,
+       "min_length": 56,
+       "num_beams": 4
+     },
+     "summarization_xsum": {
+       "length_penalty": 1.0,
+       "max_length": 62,
+       "min_length": 11,
+       "num_beams": 6
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "use_cache": true,
+   "vocab_size": 50266
+ }
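The auto_map block above registers the custom config and model classes shipped in this repository, so the checkpoint loads through the standard Auto classes with trust_remote_code=True. A minimal loading sketch (the repository id below is a placeholder, not confirmed by this commit):

    from transformers import AutoConfig, AutoModelForSeq2SeqLM

    repo_id = 'not-found/bart-base-svd-512'  # hypothetical repo id
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, trust_remote_code=True)
    print(config.rank)  # 512, read back by SVDCompressedBartConfig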
configuration_bart.py ADDED
@@ -0,0 +1,11 @@
+ from transformers import BartConfig
+
+
+ class SVDCompressedBartConfig(BartConfig):
+
+     def __init__(self, *args, rank: int = 512, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.rank = rank
+
+
+ SVDCompressedBartConfig.register_for_auto_class()
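For reference, a small sketch of how the extra rank field behaves; it assumes only that configuration_bart.py is importable from the working directory:

    from configuration_bart import SVDCompressedBartConfig

    config = SVDCompressedBartConfig(rank=256)
    print(config.rank)               # 256
    print(config.to_dict()['rank'])  # serialized into config.json on save_pretrained()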
modeling_bart.py ADDED
@@ -0,0 +1,45 @@
+ """This module uses parts of rut5compressed. It shares the same module
+ structure as the model used in the neural network compression experiments
+ with rut5compressed.
+ """
+
+ from functools import partial
+ from typing import Optional
+
+ import torch as T
+ from transformers import BartForConditionalGeneration
+
+ from .configuration_bart import SVDCompressedBartConfig
+ from .modules import SVDCompressedLinear
+ from .util import compress_linear_svd, map_module
+
+
+ class SVDCompressedBartForConditionGeneration(BartForConditionalGeneration):
+     """Class SVDCompressedBartForConditionGeneration defines a BART-based model
+     whose linear layers are compressed with SVD.
+     """
+
+     LAYERS = r'/(de|en)coder/layers/\d+/fc[12]'
+
+     config_class = SVDCompressedBartConfig
+
+     def __init__(self, config: SVDCompressedBartConfig,
+                  rank: Optional[int] = None,
+                  compress: bool = False):
+         super().__init__(config)
+         self.rank = rank or config.rank
+
+         compress_fn = partial(compress_linear_svd, rank=self.rank)
+         if not compress:
+             compress_fn = self.convert
+         self.model = map_module(self.model, compress_fn, self.LAYERS)
+
+     def convert(self, module: T.nn.Module, path: str) -> T.nn.Module:
+         if not isinstance(module, T.nn.Linear):
+             return module
+         return SVDCompressedLinear.from_random(module.in_features,
+                                                module.out_features, self.rank)
+
+
+ SVDCompressedBartForConditionGeneration \
+     .register_for_auto_class('AutoModelForSeq2SeqLM')
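In the constructor above, compress=True factorizes the existing fc1/fc2 weights via SVD, while the default compress=False swaps in randomly initialized factors of the requested rank, which is the path taken when the factorized weights are then loaded from pytorch_model.bin. A hedged sketch of how this commit's pieces could be combined to produce such a checkpoint from the dense facebook/bart-base weights (the package name bart_svd and the output directory are placeholders; the repo files must be importable as a package because util.py uses relative imports):

    from functools import partial

    from transformers import BartForConditionalGeneration

    from bart_svd.util import compress_linear_svd, map_module  # hypothetical package name

    dense = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
    # Factorize every encoder/decoder feed-forward projection at rank 512.
    pattern = r'/(de|en)coder/layers/\d+/fc[12]'
    dense.model = map_module(dense.model, partial(compress_linear_svd, rank=512), pattern)
    dense.save_pretrained('bart-base-svd-512')  # hypothetical output directory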
modules.py ADDED
@@ -0,0 +1,121 @@
+ # Copied from
+ # rut5compressed/nn/functional.py
+ # rut5compressed/nn/modules.py
+ # modules of original repository.
+
+ from typing import Optional, Tuple
+
+ import torch as T
+
+
+ class SVDCompressedLinearFunc(T.autograd.Function):
+
+     @staticmethod
+     def forward(ctx, input: T.Tensor, lhs: T.Tensor,
+                 rhs: T.Tensor, bias: Optional[T.Tensor] = None) -> T.Tensor:
+         # See PEP-0465 on matmul operator associativity.
+         # https://peps.python.org/pep-0465/#precedence-and-associativity
+         output = (input @ lhs) @ rhs
+         if bias is not None:
+             output += bias[None, :]
+         ctx.bias = bias is not None
+         ctx.save_for_backward(input, lhs, rhs)
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_output: T.Tensor):
+         input, lhs, rhs = ctx.saved_tensors
+
+         # Flatten input and output gradients over the leading dimensions.
+         inp_size = lhs.shape[0]
+         out_size = rhs.shape[1]
+         input_shape = input.shape
+         input = input.reshape(-1, inp_size)
+         grad_output = grad_output.reshape(-1, out_size)
+
+         input_grad = None
+         if ctx.needs_input_grad[0]:
+             input_grad = (grad_output @ rhs.T) @ lhs.T
+
+         lhs_grad = None
+         if ctx.needs_input_grad[1]:
+             # In practice, for large models the embedding dimension is larger
+             # than the batch size.
+             lhs_grad = input.T @ (grad_output @ rhs.T)
+
+         rhs_grad = None
+         if ctx.needs_input_grad[2]:
+             # Again, the batch size is usually smaller than the embedding dimension.
+             rhs_grad = (input @ lhs).T @ grad_output
+
+         bias_grad = None
+         if ctx.needs_input_grad[3]:
+             bias_grad = grad_output.sum(axis=0)
+
+         # Restore the shape of the input gradient (if it was computed).
+         input_grad = input_grad.reshape(input_shape) if input_grad is not None else None
+         return input_grad, lhs_grad, rhs_grad, bias_grad
+
+
+ compressed_linear_svd = SVDCompressedLinearFunc.apply
+
+
+ class SVDCompressedLinear(T.nn.Module):
+     """Class SVDCompressedLinear is a layer which represents the weight matrix
+     of a linear layer in factorized form.
+
+     >>> linear_layer = T.nn.Linear(10, 20)
+     >>> svd_layer = SVDCompressedLinear.from_linear(linear_layer, rank=5)
+     """
+
+     def __init__(self, factors: Tuple[T.Tensor, T.Tensor, T.Tensor],
+                  bias: Optional[T.Tensor] = None):
+         super().__init__()
+
+         # We do not want to track singular values, so mix them into the left
+         # and right factors.
+         scale = T.sqrt(factors[1])
+
+         # Store factors of W^T (forward computes input @ W^T) built from the SVD of W.
+         self.lhs = T.nn.Parameter(factors[2].T * scale[None, :])
+         self.rhs = T.nn.Parameter(factors[0].T * scale[:, None])
+
+         self.bias = None
+         if bias is not None:
+             self.bias = T.nn.Parameter(bias)
+
+         self.in_features = self.lhs.shape[0]
+         self.out_features = self.rhs.shape[1]
+
+     @classmethod
+     def from_linear(cls, linear: T.nn.Linear, rank: Optional[int] = None,
+                     tol: float = 1e-6):
+         with T.no_grad():
+             data = linear.weight.data
+             lhs, vals, rhs = T.linalg.svd(data)
+             if rank is None:
+                 raise NotImplementedError
+             else:
+                 lhs = lhs[:, :rank]
+                 rhs = rhs[:rank, :]
+                 vals = vals[:rank]
+
+         bias = None
+         if linear.bias is not None:
+             bias = T.clone(linear.bias.data)
+
+         return SVDCompressedLinear((lhs, vals, rhs), bias)
+
+     @classmethod
+     def from_random(cls, in_features: int, out_features: int, rank: int,
+                     bias: bool = True):
+         lvecs = T.randn((out_features, rank))
+         svals = T.ones(rank)
+         rvecs = T.randn((rank, in_features))
+         bias_term = None
+         if bias:
+             bias_term = T.randn(out_features)
+         return SVDCompressedLinear((lvecs, svals, rvecs), bias_term)
+
+     def forward(self, input: T.Tensor) -> T.Tensor:
+         return compressed_linear_svd(input, self.lhs, self.rhs, self.bias)
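A quick sanity check of the factorized layer (a sketch, not part of the commit; modules.py has no relative imports, so it can be imported directly from the repo directory): at full rank the truncated SVD reconstructs the dense weight exactly up to floating-point error, while a lower rank only approximates it.

    import torch as T

    from modules import SVDCompressedLinear

    dense = T.nn.Linear(16, 32)
    svd_full = SVDCompressedLinear.from_linear(dense, rank=16)  # full rank
    svd_low = SVDCompressedLinear.from_linear(dense, rank=4)    # truncated

    x = T.randn(8, 16)
    print(T.allclose(dense(x), svd_full(x), atol=1e-5))  # True up to numerical error
    print((dense(x) - svd_low(x)).norm())                # nonzero approximation error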
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:327dc37b94d577ccfd06fec73f9e2fd6c4ab1ed5b37b1c3917962dc81d3d84bd
+ size 520233469
util.py ADDED
@@ -0,0 +1,227 @@
+ # Copied from rut5compressed/util.py of rut5compressed repository.
+
+ import logging
+ import re
+ from functools import wraps
+ from re import Pattern
+ from typing import Callable, Dict, Optional, Tuple
+
+ import numpy as np
+ import torch as T
+
+ from .modules import SVDCompressedLinear
+
+
+ def map_module(root: T.nn.Module,
+                func: Callable[[T.nn.Module, str], T.nn.Module],
+                patt: Optional[str] = None) -> T.nn.Module:
+     """Function ``map_module`` applies a function to each node of the module
+     tree whose path matches a specified pattern.
+
+     Parameters
+     ----------
+     root : torch.nn.Module
+         Module to modify.
+     func : callable
+         Function to be applied to every module in the tree (or only to those
+         matching the pattern).
+     patt : str, optional
+         Pattern to filter modules by their path in the module tree.
+
+     Returns
+     -------
+     torch.nn.Module
+         Module modified in place.
+     """
+     @wraps(func)
+     def func_safe(*args, **kwargs):
+         node = func(*args, **kwargs)
+         if not isinstance(node, T.nn.Module):
+             raise ValueError('Mapped result must be of torch.nn.Module type '
+                              f'but given {type(node)}.')
+         return node
+
+     return _map_module(root, func_safe, re.compile(patt or r'.*'), '')
+
+
+ def _map_module(root: T.nn.Module,
+                 func: Callable[[T.nn.Module, str], T.nn.Module], patt: Pattern,
+                 path: str) -> T.nn.Module:
+     for name, child in root.named_children():
+         node = _map_module(child, func, patt, f'{path}/{name}')
+         if node != child:
+             setattr(root, name, node)
+     if patt.match(path or '/'):
+         root = func(root, path or '/')
+     return root
+
+
+ def convert_linear(module: T.nn.Linear, ctor, **kwargs) -> T.nn.Module:
+     """Function convert_linear takes a module and returns a linear module with
+     an approximate matmul. Non-linear modules are returned intact.
+     """
+     if not isinstance(module, T.nn.Linear):
+         return module
+     raise NotImplementedError
+
+
+ def numel(module: T.nn.Module):
+     value = sum(x.numel() for x in module.parameters()) + \
+         sum(x.numel() for x in module.buffers())
+
+     def account_prunned(module: T.nn.Module, path: str):
+         nonlocal value
+         for name, attr in vars(module).items():
+             if not name.endswith('_mask') or not isinstance(attr, T.Tensor):
+                 continue
+
+             weight_name = name[:-5]
+             if not hasattr(module, weight_name):
+                 continue
+
+             weight = getattr(module, weight_name)
+             value -= weight.numel() - attr.sum()
+             value += attr.numel()
+         return module
+
+     def account_quantized(module: T.nn.Module, path: str):
+         nonlocal value
+         if isinstance(module, T.nn.quantized.Linear):
+             value += module.weight().numel()
+             if module.bias() is not None:
+                 value += module.bias().numel()
+         return module
+
+     def account_rest(module: T.nn.Module, path: str):
+         account_prunned(module, path)
+         account_quantized(module, path)
+         return module
+
+     map_module(module, account_rest)
+     return value
+
+
+ def sizeof(module: T.nn.Module):
+     value = sum(x.numel() * x.element_size() for x in module.parameters()) + \
+         sum(x.numel() * x.element_size() for x in module.buffers())
+
+     def account_prunned(module: T.nn.Module, path: str):
+         nonlocal value
+         for name, attr in vars(module).items():
+             if not name.endswith('_mask') or not isinstance(attr, T.Tensor):
+                 continue
+
+             weight_name = name[:-5]
+             if not hasattr(module, weight_name):
+                 continue
+
+             weight = getattr(module, weight_name)
+             value -= (weight.numel() - attr.sum()) * weight.element_size()
+             value += attr.numel() * attr.element_size()
+         return module
+
+     def account_quantized(module: T.nn.Module, path: str):
+         nonlocal value
+         if isinstance(module, T.nn.quantized.Linear):
+             value += module.weight().numel() * module.weight().element_size()
+             if (bias := module.bias()) is not None:
+                 value += bias.numel() * bias.element_size()
+         return module
+
+     def account_rest(module: T.nn.Module, path: str):
+         account_prunned(module, path)
+         account_quantized(module, path)
+         return module
+
+     map_module(module, account_rest)
+     return value
+
+
+ def flatten_module(module: T.nn.Module, regexp=None) -> Dict[str, T.nn.Module]:
+     modules = {}
+     map_module(module, lambda x, y: modules.update(**{y: x}) or x, regexp)
+     return modules
+
+
+ def print_flatten(module: T.nn.Module):
+     paths = []
+     path_len = 0
+     names = []
+     name_len = 0
+     indx_len = 0
+
+     def func(module, path):
+         nonlocal path_len, name_len, indx_len
+         paths.append(path)
+         path_len = max(path_len, len(path))
+         name = module.__class__.__name__
+         names.append(name)
+         name_len = max(name_len, len(name))
+         indx_len += 1
+         return module
+
+     map_module(module, func)
+
+     indx_len = int(np.ceil(np.log10(indx_len)))
+     fmt = f'{{indx:>{indx_len}s}} {{path:{path_len}s}} {{name:{name_len}s}}'
+     print(fmt.format(indx='#', path='Path', name='Layer'))
+     print('-' * (indx_len + path_len + name_len + 2))
+     for i, (path, name) in enumerate(zip(paths, names)):
+         print(fmt.format(indx=str(i), path=path, name=name))
+
+
+ def compress_linear_svd(module: T.nn.Module, path: str,
+                         rank: Optional[int] = None) -> T.nn.Module:
+     if not isinstance(module, T.nn.Linear):
+         return module
+
+     # Do not factorize if the rank equals the size of the
+     # smallest dimension.
+     norows, nocols = module.weight.shape
+     if rank == min(norows, nocols):
+         return module
+
+     # If no rank is given, choose the rank at which the number of
+     # elements in the original matrix is approximately equal to the number
+     # of elements in the SVD factors.
+     if rank is None:
+         ratio = norows * nocols / (norows + nocols)
+         rank = int(np.floor(ratio))
+
+     return SVDCompressedLinear.from_linear(module, rank)
+
+
+ def compress_linear_tt(module: T.nn.Module, path: str,
+                        shape: Tuple[Tuple[int], Tuple[int]],
+                        rank: int) -> T.nn.Module:
+     if not isinstance(module, T.nn.Linear):
+         return module
+
+     # TODO(@not-found): We need a proper compression config.
+     inp_size = np.prod(shape[0])
+     out_size = np.prod(shape[1])
+     if inp_size == module.in_features and out_size == module.out_features:
+         pass
+     elif inp_size == module.out_features and out_size == module.in_features:
+         shape = (shape[1], shape[0])
+     else:
+         raise ValueError(
+             'Input and output features do not match the compression shape: '
+             f'{shape[0]} vs {module.in_features} and {shape[1]} vs '
+             f'{module.out_features}.')
+
+     logging.info('apply tt compression to layer %s', path)
+     return TTCompressedLinear.from_linear(module, shape, rank)  # noqa: F821
+
+
+ def compress(module: T.nn.Module, rank: int) -> T.nn.Module:
+     """Function compress substitutes, in place, the linear layers of a T5
+     model with linear layers whose weight matrices are factorized with SVD.
+
+     :param module: Model to compress.
+     :param rank: Desired rank of compressed layer.
+     """
+     return map_module(
+         root=module,
+         func=lambda x, y: compress_linear_svd(x, y, rank),
+         patt=r'.*/DenseReluDense/w.*')  # TODO(@not-found): Remove?
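The helpers above are what modeling_bart.py relies on: _map_module builds '/'-separated paths from named_children, which is what the LAYERS pattern matches against, and numel/sizeof measure the effect of a compression pass. A small sketch of the path scheme (the toy module names are illustrative; the package name bart_svd is a placeholder, needed because this file uses a relative import):

    import torch as T

    from bart_svd.util import flatten_module, numel  # hypothetical package name

    class Block(T.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = T.nn.Linear(8, 16)
            self.fc2 = T.nn.Linear(16, 8)

    class Toy(T.nn.Module):
        def __init__(self):
            super().__init__()
            self.encoder = T.nn.Module()
            self.encoder.layers = T.nn.ModuleList([Block(), Block()])

    toy = Toy()
    # Paths mirror the module tree: /encoder/layers/0/fc1, /encoder/layers/1/fc2, ...
    print(sorted(flatten_module(toy, r'/encoder/layers/\d+/fc[12]')))
    print(numel(toy))  # parameter and buffer count before any compression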