pytorch
/

Qwen3-32B-float8dq

@@ -18,13 +18,13 @@ pipeline_tag: text-generation
 # Inference with vLLM
 ```Shell
 # Server
-VLLM_DISABLE_COMPILE_CACHE=1 vllm serve SocialLocalMobile/Qwen3-32B-float8dq --tokenizer Qwen/Qwen3-32B -O3
 ```
 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-  "model": "SocialLocalMobile/Qwen3-32B-float8dq",
   "messages": [
     {"role": "user", "content": "Give me a short introduction to large language models."}
   ],
@@ -35,39 +35,23 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/jso
 }'
 ```
-# Quantization Recipe
-Install the required packages:
-```Shell
-pip install git+https://github.com/huggingface/transformers@main
-pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
-pip install torch
-pip install accelerate
-```
-Use the following code to get the quantized model:
 ```Py
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
-model_id = "Qwen/Qwen3-32B"
-## Step 1: Convert to float8
-from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
-quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
-quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-    quantization_config=quantization_config,
 )
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-## Step 2: Sanity check
 prompt = "Give me a short introduction to large language model."
 messages = [
     {"role": "user", "content": prompt}
@@ -78,10 +62,10 @@ text = tokenizer.apply_chat_template(
     add_generation_prompt=True,
     enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
 )
-model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
 # conduct text completion
-generated_ids = quantized_model.generate(
     **model_inputs,
     max_new_tokens=32768
 )
@@ -99,9 +83,40 @@ content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("
 print("thinking content:", thinking_content)
 print("content:", content)
-# Step 3: Upload to HF
 USER_ID = "YOUR_USER_ID"
 MODEL_NAME = model_id.split("/")[-1]
 save_to = f"{USER_ID}/{MODEL_NAME}-float8dq"

 # Inference with vLLM
 ```Shell
 # Server
+VLLM_DISABLE_COMPILE_CACHE=1 vllm serve pytorch/Qwen3-32B-float8dq --tokenizer Qwen/Qwen3-32B -O3
 ```
 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "pytorch/Qwen3-32B-float8dq",
   "messages": [
     {"role": "user", "content": "Give me a short introduction to large language models."}
   ],
 }'
 ```
+# Inference with transformers
 ```Py
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "pytorch/Qwen3-32B-float8dq"
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    device_map="auto"
 )
+# prepare the model input
 prompt = "Give me a short introduction to large language model."
 messages = [
     {"role": "user", "content": prompt}
     add_generation_prompt=True,
     enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
 )
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
 # conduct text completion
+generated_ids = model.generate(
     **model_inputs,
     max_new_tokens=32768
 )
 print("thinking content:", thinking_content)
 print("content:", content)
+```
+# Quantization Recipe
+Install the required packages:
+```Shell
+pip install git+https://github.com/huggingface/transformers@main
+pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+pip install torch
+pip install accelerate
+```
+Use the following code to get the float8 model using the torchao library:
+```Py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
+model_id = "Qwen/Qwen3-32B"
+from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
+quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+quantization_config = TorchAoConfig(quant_type=quant_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    quantization_config=quantization_config,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+```
+Optionally, upload the quantized model to your Hugging Face Hub account:
+```Py
 USER_ID = "YOUR_USER_ID"
 MODEL_NAME = model_id.split("/")[-1]
 save_to = f"{USER_ID}/{MODEL_NAME}-float8dq"