</details>

## Quantize Your Own Ovis Model with AutoGPTQ
We provide a demonstration code snippet for quantizing your own fine-tuned Ovis model. Before running it, **follow the installation steps above** to set up an environment for quantization.
```python
from typing import Dict, Sequence, Union, List
import copy
import logging

from auto_gptq import BaseQuantizeConfig
from auto_gptq.modeling import OvisGPTQForCausalLM
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image


# Specify paths and hyperparameters for quantization
model_path = "path/to/finetuned/model"
quantize_save_path = "path/to/save/quantized/model"
IGNORE_ID = -100
device_idx = 2  # customize to your GPU
torch.cuda.set_device(device_idx)
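# GPTQ hyperparameters: bits sets the target weight precision, group_size is the
# number of weights that share one quantization scale, and damp_percent adds
# dampening to the Hessian diagonal for numerical stability.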
quantize_config = BaseQuantizeConfig(
    bits=4,  # 4 or 8
    group_size=128,
    damp_percent=0.1,
    desc_act=False,  # False significantly speeds up inference, at a slight cost in perplexity
    static_groups=False,
    sym=True,
    true_sequential=True,
)


# Load model
model = OvisGPTQForCausalLM.from_pretrained(
    model_path,
    quantize_config,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    trust_remote_code=True
)
print("Model Loaded!")


# prepare calibration samples
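# GPTQ is post-training quantization: it only needs a small set of representative
# samples to collect per-layer activation statistics, so the calibration set can be
# much smaller than a fine-tuning dataset. Samples should resemble the data the
# fine-tuned model sees at inference time.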
class CalibrationDataset(Dataset):
    """
    Dataset class for calibration. Initialize with the loaded Ovis model, and a sample list in the following format:
    data_list = [
        {
            "image": "path/to/image/of/this/sample",
            "conversations": [
                {
                    "from": "human",
                    "value": "<image>\n[Your sample prompt]"
                },
                {
                    "from": "gpt",
                    "value": "[Anything]"
                }
            ]
        },
        ...
    ]
    """
    def __init__(self, model, text_max_length, data_list: List[Dict]):
        self.data = data_list
        self.model = model
        self.visual_tokenizer = model.get_visual_tokenizer()
        self.text_max_length = text_max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i: int) -> Dict[str, torch.Tensor]:
        sample = self.data[i]
        conversations = copy.deepcopy(sample["conversations"])
        images = [Image.open(sample['image'])]
        max_partition = 9

        prompt, input_ids, pixel_values, labels = self.model.preprocess_inputs(
            conversations,
            images,
            max_partition=max_partition,
            generation_preface=None,
            return_labels=True,
            propagate_exception=False
        )

        if pixel_values is None:
            pixel_values, _ = self.visual_tokenizer.mock_input()

        input_ids = input_ids[:self.text_max_length]
        labels = labels[:self.text_max_length]

        return dict(
            pixel_values=pixel_values,
            input_ids=input_ids,
            labels=labels
        )
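# The collator pads input_ids with the tokenizer's pad token and labels with IGNORE_ID
# so that variable-length samples can be batched; pixel_values are passed through as a
# per-sample sequence of tensors.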
class DataCollatorForMultimodalDatasetGPTQ:
    def __init__(self, text_tokenizer):
        self.text_tokenizer = text_tokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        pixel_values, input_ids, labels = tuple([instance[key] for instance in instances]
                                                for key in ("pixel_values", "input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.text_tokenizer.pad_token_id)
        attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(
            labels,
            batch_first=True,
            padding_value=IGNORE_ID)

        num_valid_label = torch.not_equal(labels, IGNORE_ID).sum().item()
        if num_valid_label == 0:
            logging.warning(
                f'[DataCollatorForMultimodalDatasetGPTQ] All labels are ignored, which may cause instability\n{input_ids=}\n{attention_mask=}\n{labels=}')

        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            pixel_values=pixel_values
        )


class MyDataLoader(DataLoader):
    def __len__(self):
        return len(self.dataset) // self.batch_size  # requires drop_last=True


# prepare your own calibration samples here
data_list = [
    {
        "image": "path/to/image/of/this/sample",
        "conversations": [
            {
                "from": "human",
                "value": "<image>\n[Your sample prompt]"
            },
            {
                "from": "gpt",
                "value": "[Anything]"
            }
        ]
    }
]
train_dataset = CalibrationDataset(model, text_max_length=832, data_list=data_list)
print("Dataset Loaded!")
print(f"Total length of the training set: {len(train_dataset)}")

train_loader = MyDataLoader(
    train_dataset,
    collate_fn=DataCollatorForMultimodalDatasetGPTQ(model.get_text_tokenizer()),
    shuffle=False,
    batch_size=4,
    drop_last=True,
    pin_memory=True,
    num_workers=8
)
print("Dataloader Loaded!")


# start quantizing
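# Quantization runs layer by layer over the calibration batches; keeping
# cache_examples_on_gpu=False holds the cached layer inputs off the GPU,
# trading some speed for lower GPU memory use.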
model.quantize(train_loader, cache_examples_on_gpu=False, samples_dtype=torch.bfloat16)  # do not change samples_dtype
print("Model Quantized! Now Saving...")

model.save_quantized(quantize_save_path, use_safetensors=True)
print("ALL Done!")
```
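Once saving completes, the quantized checkpoint is meant to be used for inference like the released Ovis1.6-Gemma2-9B-GPTQ-Int4 model in the usage section above. The snippet below is only a minimal loading sketch under that assumption: the path is a placeholder, `torch.float16` is an assumed dtype, and the exact loading arguments should be kept consistent with the usage example earlier in this README.

```python
import torch
from transformers import AutoModelForCausalLM

# Placeholder: the directory passed to save_quantized() above
quantized_model_path = "path/to/save/quantized/model"

# Assumption: the locally saved checkpoint loads the same way as the released
# GPTQ model, i.e. through transformers with trust_remote_code enabled.
model = AutoModelForCausalLM.from_pretrained(
    quantized_model_path,
    torch_dtype=torch.float16,  # assumed; match the dtype used in the usage example above
    multimodal_max_length=8192,
    trust_remote_code=True
).cuda()

text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
# From here, build prompts and call generate exactly as in the usage example above.
```

If loading fails, check that the same environment used for quantization (including its AutoGPTQ installation) is active.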
## Performance

Here we report the performance of Ovis1.6-Gemma2-9B-GPTQ-Int4. The results were obtained with VLMEvalKit.

![image/png](https://cdn-uploads.huggingface.co/production/uploads/645cb4b4a03f3ebb0bde20e0/pSKiBhCy1S6Fb1QODY_ZZ.png)