SocialLocalMobile committed
Commit c303d63 · verified · 1 Parent(s): ebfd887

Update README.md

Files changed (1):
  1. README.md +46 -31
README.md CHANGED
@@ -18,13 +18,13 @@ pipeline_tag: text-generation
  # Inference with vLLM
  ```Shell
  # Server
- VLLM_DISABLE_COMPILE_CACHE=1 vllm serve SocialLocalMobile/Qwen3-32B-float8dq --tokenizer Qwen/Qwen3-32B -O3
  ```

  ```Shell
  # Client
  curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-   "model": "SocialLocalMobile/Qwen3-32B-float8dq",
    "messages": [
      {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
@@ -35,39 +35,23 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/jso
  }'
  ```

-
- # Quantization Recipe
-
- Install the required packages:
-
- ```Shell
- pip install git+https://github.com/huggingface/transformers@main
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
- pip install torch
- pip install accelerate
- ```
-
- Use the following code to get the quantized model:

  ```Py
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

- model_id = "Qwen/Qwen3-32B"

- ## Step 1: Convert to float8
- from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
- quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
- quantization_config = TorchAoConfig(quant_type=quant_config)
- quantized_model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     device_map="auto",
-     torch_dtype=torch.bfloat16,
-     quantization_config=quantization_config,
  )
- tokenizer = AutoTokenizer.from_pretrained(model_id)

- ## Step 2: Sanity check
  prompt = "Give me a short introduction to large language model."
  messages = [
    {"role": "user", "content": prompt}
@@ -78,10 +62,10 @@ text = tokenizer.apply_chat_template(
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
  )
- model_inputs = tokenizer([text], return_tensors="pt").to("cuda")

  # conduct text completion
- generated_ids = quantized_model.generate(
    **model_inputs,
    max_new_tokens=32768
  )
@@ -99,9 +83,40 @@ content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("

  print("thinking content:", thinking_content)
  print("content:", content)


- # Step 3: Upload to HF
  USER_ID = "YOUR_USER_ID"
  MODEL_NAME = model_id.split("/")[-1]
  save_to = f"{USER_ID}/{MODEL_NAME}-float8dq"
  # Inference with vLLM
  ```Shell
  # Server
+ VLLM_DISABLE_COMPILE_CACHE=1 vllm serve pytorch/Qwen3-32B-float8dq --tokenizer Qwen/Qwen3-32B -O3
  ```

  ```Shell
  # Client
  curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+   "model": "pytorch/Qwen3-32B-float8dq",
    "messages": [
      {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
 
  }'
  ```
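The same server exposes an OpenAI-compatible API, so it can also be queried from Python. A minimal sketch, assuming the `openai` client package is installed and the server started above is listening on localhost:8000 (the `api_key` value is just a placeholder):

```Py
from openai import OpenAI

# point the client at the local vLLM server (OpenAI-compatible endpoint)
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="pytorch/Qwen3-32B-float8dq",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
)
print(response.choices[0].message.content)
```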

+ # Inference with transformers

  ```Py
  import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer

+ model_name = "pytorch/Qwen3-32B-float8dq"

+ # load the tokenizer and the model
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype="auto",
+     device_map="auto"
  )

+ # prepare the model input
  prompt = "Give me a short introduction to large language model."
  messages = [
    {"role": "user", "content": prompt}

    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
  )
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

  # conduct text completion
+ generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
  )
 
  print("thinking content:", thinking_content)
  print("content:", content)
+ ```
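The comment on `enable_thinking` above also points at a non-thinking mode. A minimal sketch of that path, reusing the `tokenizer`, `model`, and `messages` objects from the block above (the 512-token budget is an arbitrary choice for illustration):

```Py
# build the prompt without a thinking block
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False  # no <think> ... </think> trace is generated
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(**model_inputs, max_new_tokens=512)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
print(tokenizer.decode(output_ids, skip_special_tokens=True))
```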
+
+ # Quantization Recipe
+
+ Install the required packages:
+
+ ```Shell
+ pip install git+https://github.com/huggingface/transformers@main
+ pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+ pip install torch
+ pip install accelerate
+ ```

+ Use the following code to get the float8 model using the torchao library:

+ ```Py
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
+
+ model_id = "Qwen/Qwen3-32B"
+ from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
+ quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+ quantization_config = TorchAoConfig(quant_type=quant_config)
+ quantized_model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     torch_dtype=torch.bfloat16,
+     quantization_config=quantization_config,
+ )
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ ```
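Before uploading, it can help to run a quick sanity check with the freshly quantized model, mirroring the transformers inference example above. A minimal sketch using the `quantized_model` and `tokenizer` just created (prompt and token budget are arbitrary):

```Py
# quick sanity check: a short generation with the quantized model
prompt = "Give me a short introduction to large language models."
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(quantized_model.device)
generated_ids = quantized_model.generate(**model_inputs, max_new_tokens=128)
print(tokenizer.decode(generated_ids[0][len(model_inputs.input_ids[0]):], skip_special_tokens=True))
```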
+
+ Optionally, upload it to your HF hub:
+ ```Py
  USER_ID = "YOUR_USER_ID"
  MODEL_NAME = model_id.split("/")[-1]
  save_to = f"{USER_ID}/{MODEL_NAME}-float8dq"
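# NOTE: assumed continuation (not shown above): push the quantized model and
# tokenizer to the Hub. safe_serialization=False is the usual setting for
# torchao-quantized checkpoints; adjust the call as needed.
quantized_model.push_to_hub(save_to, safe_serialization=False)
tokenizer.push_to_hub(save_to)
```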