TryingHard committed
Commit 0fba527
Parent(s): 35f5c81

Update README.md

Files changed (1): README.md (+178, -0)
README.md CHANGED

</details>

## Quantize Your Own Ovis Model with AutoGPTQ
We provide a demonstration code snippet for quantizing your own fine-tuned Ovis model. Before running the code, **follow the installation steps above** to set up an environment for quantization.
```python
from typing import Dict, Sequence, Union, List
import copy
import logging

from auto_gptq import BaseQuantizeConfig
from auto_gptq.modeling import OvisGPTQForCausalLM
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image


# Specify paths and hyperparameters for quantization
model_path = "path/to/finetuned/model"
quantize_save_path = "path/to/save/quantized/model"
IGNORE_ID = -100
device_idx = 2  # customize to your setup
torch.cuda.set_device(device_idx)
quantize_config = BaseQuantizeConfig(
    bits=4,  # 4 or 8
    group_size=128,
    damp_percent=0.1,
    desc_act=False,  # False significantly speeds up inference but may slightly worsen perplexity
    static_groups=False,
    sym=True,
    true_sequential=True,
)


# Load model
model = OvisGPTQForCausalLM.from_pretrained(
    model_path,
    quantize_config,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    trust_remote_code=True
)
print("Model Loaded!")


# Prepare calibration samples
class CalibrationDataset(Dataset):
    """
    Dataset class for calibration. Initialize with the loaded Ovis model and a sample list in the following format:
    data_list = [
        {
            "image": "path/to/image/of/this/sample",
            "conversations": [
                {
                    "from": "human",
                    "value": "<image>\n[Your sample prompt]"
                },
                {
                    "from": "gpt",
                    "value": "[Anything]"
                }
            ]
        },
        ...
    ]
    """
    def __init__(self, model, text_max_length, data_list: List[Dict]):
        self.data = data_list
        self.model = model
        self.visual_tokenizer = model.get_visual_tokenizer()
        self.text_max_length = text_max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i: int) -> Dict[str, torch.Tensor]:
        sample = self.data[i]
        conversations = copy.deepcopy(sample["conversations"])
        images = [Image.open(sample['image'])]
        max_partition = 9

        prompt, input_ids, pixel_values, labels = self.model.preprocess_inputs(
            conversations,
            images,
            max_partition=max_partition,
            generation_preface=None,
            return_labels=True,
            propagate_exception=False
        )

        # Fall back to mock visual input for text-only samples
        if pixel_values is None:
            pixel_values, _ = self.visual_tokenizer.mock_input()

        input_ids = input_ids[:self.text_max_length]
        labels = labels[:self.text_max_length]

        return dict(
            pixel_values=pixel_values,
            input_ids=input_ids,
            labels=labels
        )


class DataCollatorForMultimodalDatasetGPTQ:
    def __init__(self, text_tokenizer):
        self.text_tokenizer = text_tokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        pixel_values, input_ids, labels = tuple([instance[key] for instance in instances]
                                                for key in ("pixel_values", "input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.text_tokenizer.pad_token_id)
        attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(
            labels,
            batch_first=True,
            padding_value=IGNORE_ID)

        num_valid_label = torch.not_equal(labels, IGNORE_ID).sum().item()
        if num_valid_label == 0:
            logging.warning(
                f'[DataCollatorForMultimodalDatasetGPTQ] All labels are ignored, which may cause training instability\n{input_ids=}\n{attention_mask=}\n{labels=}')

        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            pixel_values=pixel_values
        )


class MyDataLoader(DataLoader):
    def __len__(self):
        return len(self.dataset) // self.batch_size  # must set drop_last=True


# Prepare your own calibration samples here
data_list = [
    {
        "image": "path/to/image/of/this/sample",
        "conversations": [
            {
                "from": "human",
                "value": "<image>\n[Your sample prompt]"
            },
            {
                "from": "gpt",
                "value": "[Anything]"
            }
        ]
    }
]
train_dataset = CalibrationDataset(model, text_max_length=832, data_list=data_list)
print("Dataset Loaded!")
print(f"Total length of the training set: {len(train_dataset)}")

train_loader = MyDataLoader(
    train_dataset,
    collate_fn=DataCollatorForMultimodalDatasetGPTQ(model.get_text_tokenizer()),
    shuffle=False,
    batch_size=4,
    drop_last=True,
    pin_memory=True,
    num_workers=8
)
print("Dataloader Loaded!")


# Start quantizing
model.quantize(train_loader, cache_examples_on_gpu=False, samples_dtype=torch.bfloat16)  # do not change samples_dtype
print("Model Quantized! Now Saving...")

model.save_quantized(quantize_save_path, use_safetensors=True)
print("ALL Done!")
```
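
Once `save_quantized` finishes, the checkpoint can be loaded back for inference. The snippet below is a minimal sketch, not part of the original recipe: it assumes your local save loads the same way as the released GPTQ checkpoint (via `trust_remote_code`); depending on your AutoGPTQ fork, you may instead need to load it through `OvisGPTQForCausalLM`.

```python
import torch
from transformers import AutoModelForCausalLM

# Assumption: the locally saved quantized checkpoint follows the same
# loading recipe as the released Ovis GPTQ models.
quantized_model = AutoModelForCausalLM.from_pretrained(
    "path/to/save/quantized/model",  # the quantize_save_path used above
    torch_dtype=torch.float16,
    multimodal_max_length=8192,
    trust_remote_code=True
).cuda()

text_tokenizer = quantized_model.get_text_tokenizer()
visual_tokenizer = quantized_model.get_visual_tokenizer()
```

From here, inference proceeds exactly as in the usage example earlier in this README.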
## Performance
Here we report the performance of Ovis1.6-Gemma2-9B-GPTQ-Int4. The results were obtained with VLMEvalKit.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/645cb4b4a03f3ebb0bde20e0/pSKiBhCy1S6Fb1QODY_ZZ.png)