Update README.md
README.md CHANGED
````diff
@@ -37,13 +37,19 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
 model_id = "Qwen/Qwen3-32B"
 
+## Step 1: Convert to float8
 from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
 quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
 quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    quantization_config=quantization_config,
+)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-
+## Step 2: Sanity check
 prompt = "Give me a short introduction to large language model."
 messages = [
     {"role": "user", "content": prompt}
````
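The hunks leave the generation code between them unchanged, so the diff does not show it. For context, here is a minimal sketch of what that elided section plausibly contains, modeled on the stock Qwen3 usage example; the `enable_thinking` flag, the `max_new_tokens` value, and the token id `151668` (`</think>`) come from that example, not from this diff:

```python
# Sketch of the unchanged generation code between the two hunks (assumed, not
# taken verbatim from this diff). It must define output_ids, index,
# thinking_content, and content, which the diff context references.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,  # assumption: thinking mode on, since thinking_content is printed
)
model_inputs = tokenizer([text], return_tensors="pt").to(quantized_model.device)
generated_ids = quantized_model.generate(**model_inputs, max_new_tokens=32768)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# Split the reasoning from the final answer at the last </think> token.
try:
    index = len(output_ids) - output_ids[::-1].index(151668)  # assumption: 151668 is </think> in Qwen3's vocab
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
```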
````diff
@@ -75,6 +81,14 @@ content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
 
 print("thinking content:", thinking_content)
 print("content:", content)
+
+
+# Step 3: Upload to HF
+USER_ID = "YOUR_USER_ID"
+MODEL_NAME = model_id.split("/")[-1]
+save_to = f"{USER_ID}/{MODEL_NAME}-float8dq"
+quantized_model.push_to_hub(save_to, safe_serialization=False)
+tokenizer.push_to_hub(save_to)
 ```
 
 # 4. Model Quality
````
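Note that `push_to_hub` is called with `safe_serialization=False`: torchao-quantized weights are tensor subclasses, which the safetensors format does not support, so the checkpoint is saved in the pickle-based format instead. Once uploaded, the model can be loaded back without re-quantizing, since the quantization config travels with the checkpoint. A minimal sketch, assuming a hypothetical repo id matching what `save_to` resolves to and a `transformers` install with torchao available:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical repo id: whatever f"{USER_ID}/{MODEL_NAME}-float8dq" resolved to in Step 3.
repo_id = "YOUR_USER_ID/Qwen3-32B-float8dq"

# No TorchAoConfig needed here: the quantization config is read from the checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
```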
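As a side note on the `PerRow()` granularity chosen in Step 1: every row of each weight matrix gets its own float8 scale, so one outlier row cannot blow up the quantization error of the others. A toy illustration of per-row e4m3 scaling (not torchao's actual implementation; 448.0 is the largest finite `float8_e4m3fn` value):

```python
import torch

def quantize_per_row_fp8(w: torch.Tensor):
    # One scale per row: map each row's absolute max onto the e4m3 max (448.0).
    amax = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-12)
    scale = amax / 448.0
    return (w / scale).to(torch.float8_e4m3fn), scale

def dequantize_fp8(w_fp8: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    return w_fp8.to(torch.float32) * scale

w = torch.randn(4, 8)
w_fp8, scale = quantize_per_row_fp8(w)
print((w - dequantize_fp8(w_fp8, scale)).abs().max())  # small round-trip error
```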