JessyTsu1 committed on
Commit
084d33a
1 Parent(s): 81d8ae7

Upload quantizer.py with huggingface_hub

Files changed (1)
  1. quantizer.py +210 -0
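For reference, an upload like this is typically produced with a call along the lines of the sketch below (not part of the commit; the repo id is a placeholder):

from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="quantizer.py",       # local file to push
    path_in_repo="quantizer.py",          # destination path inside the repo
    repo_id="<namespace>/<model-repo>",   # placeholder: the target model repo
)
# upload_file's default commit message is "Upload <path_in_repo> with huggingface_hub",
# which matches the commit message shown above.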
quantizer.py ADDED
@@ -0,0 +1,210 @@
+ import bitsandbytes as bnb
+ from bitsandbytes.nn.modules import Params4bit, Int8Params
+ import torch
+ 
+ def Params4bitCuda(self, device):
+     # Move the packed 4-bit data and every tensor held inside quant_state to the target GPU.
+     self.data = self.data.cuda(device)
+     self.quant_state[0] = self.quant_state[0].cuda(device)
+     self.quant_state[4][0] = self.quant_state[4][0].cuda(device)
+     self.quant_state[4][1][0] = self.quant_state[4][1][0].cuda(device)
+     self.quant_state[4][1][1] = self.quant_state[4][1][1].cuda(device)
+ 
+     self.quant_state[6] = self.quant_state[6].cuda(device)
+     return self
+ 
+ class Linear4bitOnline(torch.nn.Module):
+     def __init__(self, weight, bias, quant_type):
+         super().__init__()
+         self.weight = Params4bit(
+             weight.data, requires_grad=False, compress_statistics=True, quant_type=quant_type
+         )
+         self.compute_dtype = None
+         #self.weight.cuda(weight.device)
+         self.bias = bias
+ 
+     def forward(self, x: torch.Tensor):
+         # weights are cast automatically as Params4bit, but the bias has to be cast manually
+         if self.bias is not None and self.bias.dtype != x.dtype:
+             self.bias.data = self.bias.data.to(x.dtype)
+ 
+         if getattr(self.weight, "quant_state", None) is None:
+             print(
+                 "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
+             )
+         inp_dtype = x.dtype
+         if self.compute_dtype is not None:
+             x = x.to(self.compute_dtype)
+ 
+         bias = None if self.bias is None else self.bias.to(self.compute_dtype)
+         out = bnb.matmul_4bit(
+             x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
+         )
+ 
+         out = out.to(inp_dtype)
+ 
+         return out
+ 
+ class Linear8bitLtOnline(torch.nn.Module):
+     def __init__(
+         self,
+         weight,
+         bias,
+         has_fp16_weights=True,
+         memory_efficient_backward=False,
+         threshold=0.0,
+         index=None,
+     ):
+         super().__init__()
+         assert (
+             not memory_efficient_backward
+         ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
+         self.state = bnb.MatmulLtState()
+         self.index = index
+ 
+         # Necessary for stacked layers
+         self.state.threshold = threshold
+         self.state.has_fp16_weights = has_fp16_weights
+         self.state.memory_efficient_backward = memory_efficient_backward
+         if threshold > 0.0 and not has_fp16_weights:
+             self.state.use_pool = True
+ 
+         self.weight = Int8Params(
+             weight.data,
+             has_fp16_weights=has_fp16_weights,
+             requires_grad=has_fp16_weights,
+         )
+         self.bias = bias
+ 
+     def init_8bit_state(self):
+         self.state.CB = self.weight.CB
+         self.state.SCB = self.weight.SCB
+         self.weight.CB = None
+         self.weight.SCB = None
+ 
+     def forward(self, x: torch.Tensor):
+         self.state.is_training = self.training
+         if self.weight.CB is not None:
+             self.init_8bit_state()
+ 
+         # weights are cast automatically as Int8Params, but the bias has to be cast manually
+         if self.bias is not None and self.bias.dtype != x.dtype:
+             self.bias.data = self.bias.data.to(x.dtype)
+ 
+         out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
+ 
+         if not self.state.has_fp16_weights:
+             if self.state.CB is not None and self.state.CxB is not None:
+                 # we converted 8-bit row major to turing/ampere format in the first inference pass
+                 # we no longer need the row-major weight
+                 del self.state.CB
+                 self.weight.data = self.state.CxB
+         return out
+ 
+ def quantize_offline(model, bits: int):
+     # Swap each linear projection in every transformer layer for a bnb.nn.Linear4bit
+     # module (nf4, fp16 compute dtype, no bias).
+     assert (bits == 4), f'bits: {bits} is not supported'
+ 
+     for i, layer in enumerate(model.model.layers):
+         layer.self_attn.W_pack = bnb.nn.Linear4bit(
+             layer.self_attn.W_pack.weight.shape[1],
+             layer.self_attn.W_pack.weight.shape[0],
+             False,
+             torch.float16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+         layer.self_attn.o_proj = bnb.nn.Linear4bit(
+             layer.self_attn.o_proj.weight.shape[1],
+             layer.self_attn.o_proj.weight.shape[0],
+             False,
+             torch.float16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+ 
+         layer.mlp.gate_proj = bnb.nn.Linear4bit(
+             layer.mlp.gate_proj.weight.shape[1],
+             layer.mlp.gate_proj.weight.shape[0],
+             False,
+             torch.float16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+         layer.mlp.down_proj = bnb.nn.Linear4bit(
+             layer.mlp.down_proj.weight.shape[1],
+             layer.mlp.down_proj.weight.shape[0],
+             False,
+             torch.float16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+         layer.mlp.up_proj = bnb.nn.Linear4bit(
+             layer.mlp.up_proj.weight.shape[1],
+             layer.mlp.up_proj.weight.shape[0],
+             False,
+             torch.float16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+     return model
+ 
+ def quantize_online(model, bits: int):
+     # Quantize an already-loaded fp16 model in place, wrapping each linear projection's
+     # weight in a Linear8bitLtOnline (bits=8) or Linear4bitOnline (bits=4) module.
+     def quant(weight, bias=None):
+         if bits == 8:
+             linear = Linear8bitLtOnline(
+                 weight,
+                 bias,
+                 has_fp16_weights=False,
+                 threshold=6.0,
+             )
+             if bias is not None:
+                 linear.bias = torch.nn.Parameter(bias)
+         elif bits == 4:
+             linear = Linear4bitOnline(
+                 weight,
+                 bias,
+                 quant_type="nf4", #fp4/nf4
+             )
+         else:
+             raise ValueError("quantize only supports 4/8 bit")
+         return linear
+ 
+     for i, layer in enumerate(model.model.layers):
+         layer.self_attn.W_pack = quant(layer.self_attn.W_pack.weight)
+         layer.self_attn.o_proj = quant(layer.self_attn.o_proj.weight)
+         layer.mlp.gate_proj = quant(layer.mlp.gate_proj.weight)
+         layer.mlp.down_proj = quant(layer.mlp.down_proj.weight)
+         layer.mlp.up_proj = quant(layer.mlp.up_proj.weight)
+     return model
+ 
+ def init_model_weight_int4(config, model, state_dict):
+     # Load pre-quantized 4-bit weights (Params4bit data plus quant_state) from state_dict
+     # into the model. Replace Params4bit.cuda with Params4bitCuda so that the quant_state
+     # tensors are moved to the GPU along with the packed data.
+     Params4bit.cuda = Params4bitCuda
+ 
+     for i in range(config.num_hidden_layers):
+         weight_data = state_dict[f'model.layers.{i}.self_attn.W_pack.weight.data']
+         weight_quant_state = state_dict[f'model.layers.{i}.self_attn.W_pack.weight.quant_state']
+         model.model.layers[i].self_attn.W_pack.weight = Params4bit(weight_data, requires_grad=False, quant_state=weight_quant_state)
+ 
+         weight_data = state_dict[f'model.layers.{i}.self_attn.o_proj.weight.data']
+         weight_quant_state = state_dict[f'model.layers.{i}.self_attn.o_proj.weight.quant_state']
+         model.model.layers[i].self_attn.o_proj.weight = Params4bit(weight_data, requires_grad=False, quant_state=weight_quant_state)
+ 
+         weight_data = state_dict[f'model.layers.{i}.mlp.gate_proj.weight.data']
+         weight_quant_state = state_dict[f'model.layers.{i}.mlp.gate_proj.weight.quant_state']
+         model.model.layers[i].mlp.gate_proj.weight = Params4bit(weight_data, requires_grad=False, quant_state=weight_quant_state)
+ 
+         weight_data = state_dict[f'model.layers.{i}.mlp.up_proj.weight.data']
+         weight_quant_state = state_dict[f'model.layers.{i}.mlp.up_proj.weight.quant_state']
+         model.model.layers[i].mlp.up_proj.weight = Params4bit(weight_data, requires_grad=False, quant_state=weight_quant_state)
+ 
+         weight_data = state_dict[f'model.layers.{i}.mlp.down_proj.weight.data']
+         weight_quant_state = state_dict[f'model.layers.{i}.mlp.down_proj.weight.quant_state']
+         model.model.layers[i].mlp.down_proj.weight = Params4bit(weight_data, requires_grad=False, quant_state=weight_quant_state)
+ 
+         model.model.layers[i].input_layernorm.weight = state_dict[f'model.layers.{i}.input_layernorm.weight']
+         model.model.layers[i].post_attention_layernorm.weight = state_dict[f'model.layers.{i}.post_attention_layernorm.weight']
+ 
+     model.model.embed_tokens.weight = state_dict['model.embed_tokens.weight']
+     model.model.norm.weight = state_dict['model.norm.weight']
+     model.lm_head.weight = state_dict['lm_head.weight']
+     return model
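For context, a minimal usage sketch (not part of this commit) of how these helpers are typically wired into model loading; the checkpoint paths and loading flags below are placeholders for illustration:

import torch
from transformers import AutoConfig, AutoModelForCausalLM
from quantizer import quantize_online, quantize_offline, init_model_weight_int4

# Online path: quantize an fp16 checkpoint that is already loaded in memory.
# Moving the model to the GPU afterwards triggers Params4bit/Int8Params quantization.
model = AutoModelForCausalLM.from_pretrained(
    "path/to/fp16-checkpoint",        # placeholder path
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = quantize_online(model, bits=4).cuda()

# Offline path: rebuild the layers as bnb.nn.Linear4bit, then fill them from a
# state_dict that already contains the 4-bit data and quant_state entries.
# config = AutoConfig.from_pretrained("path/to/int4-checkpoint", trust_remote_code=True)
# model = quantize_offline(model, bits=4)
# model = init_model_weight_int4(config, model, state_dict)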