SocialLocalMobile committed
Commit b18ead3 · verified · 1 parent: cfd5a4d

Update README.md

Files changed (1):
  1. README.md +16 -2
README.md CHANGED
@@ -37,13 +37,19 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
 model_id = "Qwen/Qwen3-32B"
 
+## Step 1: Convert to float8
 from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
 quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
 quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    quantization_config=quantization_config,
+)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-
+## Step 2: Sanity check
 prompt = "Give me a short introduction to large language model."
 messages = [
     {"role": "user", "content": prompt}
@@ -75,6 +81,14 @@ content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
 
 print("thinking content:", thinking_content)
 print("content:", content)
+
+
+# Step 3: Upload to HF
+USER_ID = "YOUR_USER_ID"
+MODEL_NAME = model_id.split("/")[-1]
+save_to = f"{USER_ID}/{MODEL_NAME}-float8dq"
+quantized_model.push_to_hub(save_to, safe_serialization=False)
+tokenizer.push_to_hub(save_to)
 ```
 
 # 4. Model Quality
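Note for readers of this diff: the lines between the two hunks, where `output_ids`, `index`, `thinking_content`, and `content` are produced, are not shown above. As context, here is a minimal sketch of that middle section, assuming the README follows the standard Qwen3 model-card generation example; the exact README lines may differ.

```python
# Sketch of the generation code implied between the two hunks.
# Continues from the diff's snippet: assumes `quantized_model`, `tokenizer`,
# and `messages` are already defined as shown above.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,  # Qwen3 thinking mode
)
model_inputs = tokenizer([text], return_tensors="pt").to(quantized_model.device)

generated_ids = quantized_model.generate(**model_inputs, max_new_tokens=32768)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# Split the reasoning trace from the final answer at the </think> token
# (id 151668 in the Qwen3 vocabulary), scanning from the end.
try:
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
```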
 
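The new Step 3 covers only the upload side. As a usage note, below is a minimal sketch of what a downstream consumer might run, assuming the checkpoint was pushed exactly as in the diff; `YOUR_USER_ID/Qwen3-32B-float8dq` is the placeholder repo id from the diff, not a real upload.

```python
# Minimal sketch of consuming the checkpoint pushed in Step 3.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "YOUR_USER_ID/Qwen3-32B-float8dq"  # placeholder repo id

# The quantization settings travel with the checkpoint's config, so no
# TorchAoConfig is needed here; bfloat16 matches the dtype used at save time.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Quick generation check mirroring the README's sanity-check prompt.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Give me a short introduction to large language model."}],
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
output = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))
```

On the design choice in Step 3: `safe_serialization=False` stores the weights with torch's pickle-based serialization rather than safetensors, presumably because torchao's quantized tensor subclasses are not representable in the safetensors format.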