Commit 0b209bf (verified) · committed by mgoin · Parent(s): af74262

Create README.md

Files changed (1): README.md (+37, -0)
README.md ADDED
## Creation

```python
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class

MODEL_ID = "Qwen/Qwen2-VL-72B-Instruct"

# Load model.
model_class = wrap_hf_model_class(Qwen2VLForConditionalGeneration)
model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to fp8 with per-channel scales via PTQ
#   * quantize the activations to fp8 with dynamic per-token scales
# The lm_head and the vision tower (visual.*) are left unquantized.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["re:.*lm_head", "re:visual.*"],
)

# Apply quantization and save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-dynamic"
oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
processor.save_pretrained(SAVE_DIR)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")
```
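
The saved compressed-tensors checkpoint can then be served with an inference engine that understands the format, such as vLLM. Below is a minimal sketch, not part of the original creation script: it assumes the `SAVE_DIR` directory produced above is available locally and that your vLLM build supports FP8 compressed-tensors; the `tensor_parallel_size` value is an assumption sized for a 72B FP8 model and should be adjusted to your hardware.

```python
from vllm import LLM, SamplingParams

# Assumption: path produced by the creation script above, run from the same directory.
model_path = "Qwen2-VL-72B-Instruct-FP8-dynamic"

# vLLM reads the compressed-tensors quantization config from the checkpoint,
# so no quantization flags are needed here.
llm = LLM(model=model_path, tensor_parallel_size=4)

sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["Hello my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```

Because the scheme is `FP8_DYNAMIC`, activation scales are computed per token at runtime, which is why the creation recipe above requires no calibration data.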