Update app.py #1
opened by jerryzh168

app.py CHANGED
```diff
@@ -13,22 +13,25 @@ from torchao.quantization import (
     Int8DynamicActivationInt8WeightConfig,
     Float8WeightOnlyConfig,
     Float8DynamicActivationFloat8WeightConfig,
+    GemliteUIntXWeightOnlyConfig,
 )
 
 MAP_QUANT_TYPE_TO_NAME = {
-    "int4_weight_only": "int4wo",
-    "int8_weight_only": "int8wo",
-    "int8_dynamic_activation_int8_weight": "int8da8w8",
-    "float8_weight_only": "float8wo",
-    "float8_dynamic_activation_float8_weight": "float8da8w8",
+    "Int4WeightOnly": "int4wo",
+    "GemliteUIntXWeightOnly": "intxwo-gemlite",
+    "Int8WeightOnly": "int8wo",
+    "Int8DynamicActivationInt8Weight": "int8da8w8",
+    "Float8WeightOnly": "float8wo",
+    "Float8DynamicActivationFloat8Weight": "float8da8w8",
     "autoquant": "autoquant",
 }
 MAP_QUANT_TYPE_TO_CONFIG = {
-    "int4_weight_only": Int4WeightOnlyConfig,
-    "int8_weight_only": Int8WeightOnlyConfig,
-    "int8_dynamic_activation_int8_weight": Int8DynamicActivationInt8WeightConfig,
-    "float8_weight_only": Float8WeightOnlyConfig,
-    "float8_dynamic_activation_float8_weight": Float8DynamicActivationFloat8WeightConfig,
+    "Int4WeightOnly": Int4WeightOnlyConfig,
+    "GemliteUIntXWeightOnly": GemliteUIntXWeightOnlyConfig,
+    "Int8WeightOnly": Int8WeightOnlyConfig,
+    "Int8DynamicActivationInt8Weight": Int8DynamicActivationInt8WeightConfig,
+    "Float8WeightOnly": Float8WeightOnlyConfig,
+    "Float8DynamicActivationFloat8Weight": Float8DynamicActivationFloat8WeightConfig,
 }
 
 
```
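For context, a minimal sketch of how the two maps are used together downstream in this file: look up the config class, instantiate it, and wrap it in `TorchAoConfig`, as the later hunks do. The checkpoint name is a placeholder, not part of the PR.

```python
from transformers import AutoModelForCausalLM, TorchAoConfig
from torchao.quantization import Int8WeightOnlyConfig

MAP_QUANT_TYPE_TO_CONFIG = {"Int8WeightOnly": Int8WeightOnlyConfig}

quantization_type = "Int8WeightOnly"
quant_config = MAP_QUANT_TYPE_TO_CONFIG[quantization_type]()  # instantiate the torchao config class
quantization_config = TorchAoConfig(quant_config)  # wrap it for transformers

# Placeholder checkpoint; the weights are quantized while loading.
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",
    quantization_config=quantization_config,
    device_map="auto",
)
```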
```diff
@@ -56,8 +59,7 @@ def check_model_exists(
         repo_name = f"{username}/{quantized_model_name}"
     else:
         if (
-            quantization_type == "int4_weight_only"
-            or quantization_type == "int8_weight_only"
+            quantization_type in ["Int4WeightOnly", "GemliteUIntXWeightOnly"]
         ) and (group_size is not None):
             repo_name = f"{username}/{model_name.split('/')[-1]}-ao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}-gs{group_size}"
         else:
```
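The branch above encodes the repo naming rule for the two group-size-aware types. A worked example of the visible f-string, with the username and model name as placeholders and the short suffix (the `MAP_QUANT_TYPE_TO_NAME` value) inlined:

```python
username = "alice"  # placeholder
model_name = "facebook/opt-125m"  # placeholder
group_size = 128

short_name = "int4wo"  # MAP_QUANT_TYPE_TO_NAME value for "Int4WeightOnly"
repo_name = f"{username}/{model_name.split('/')[-1]}-ao-{short_name}-gs{group_size}"
print(repo_name)  # alice/opt-125m-ao-int4wo-gs128
```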
```diff
@@ -173,13 +175,13 @@ def quantize_model(
     print(f"Quantizing model: {quantization_type}")
     progress(0, desc="Preparing Quantization")
     if (
-        quantization_type == "int8_weight_only"
+        quantization_type == "GemliteUIntXWeightOnly"
     ):
         quant_config = MAP_QUANT_TYPE_TO_CONFIG[quantization_type](
             group_size=group_size
         )
         quantization_config = TorchAoConfig(quant_config)
-    elif quantization_type == "int4_weight_only":
+    elif quantization_type == "Int4WeightOnly":
         from torchao.dtypes import Int4CPULayout
 
         quant_config = MAP_QUANT_TYPE_TO_CONFIG[quantization_type](
```
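The `Int4WeightOnly` branch imports `Int4CPULayout` before building its config, but the hunk cuts off before the call's argument list, so the kwargs below are an assumption about the intent rather than the PR's actual arguments:

```python
from torchao.dtypes import Int4CPULayout
from torchao.quantization import Int4WeightOnlyConfig

# Assumed kwargs (the hunk ends before the argument list): pair the
# user-selected group size with the CPU-friendly int4 layout imported above.
quant_config = Int4WeightOnlyConfig(
    group_size=128,
    layout=Int4CPULayout(),
)
```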
```diff
@@ -233,8 +235,7 @@ def save_model(
         repo_name = f"{username}/{quantized_model_name}"
     else:
         if (
-            quantization_type == "int4_weight_only"
-            or quantization_type == "int8_weight_only"
+            quantization_type in ["Int4WeightOnly", "GemliteUIntXWeightOnly"]
         ) and (group_size is not None):
             repo_name = f"{username}/{model_name.split('/')[-1]}-ao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}-gs{group_size}"
         else:
```
```diff
@@ -318,7 +319,7 @@ def quantize_and_save(
         return """
         <div class="error-box">
             <h3>❌ Group Size Error</h3>
-            <p>Group Size is a parameter for int4_weight_only or int8_weight_only</p>
+            <p>Group Size is a parameter for Int4WeightOnly or GemliteUIntXWeightOnly</p>
         </div>
         """
 
```
```diff
@@ -492,11 +493,12 @@ with gr.Blocks(css=css) as demo:
             quantization_type = gr.Dropdown(
                 info="Select the Quantization method",
                 choices=[
-                    "int4_weight_only",
-                    "int8_weight_only",
-                    "int8_dynamic_activation_int8_weight",
-                    "float8_weight_only",
-                    "float8_dynamic_activation_float8_weight",
+                    "Int4WeightOnly",
+                    "GemliteUIntXWeightOnly",
+                    "Int8WeightOnly",
+                    "Int8DynamicActivationInt8Weight",
+                    "Float8WeightOnly",
+                    "Float8DynamicActivationFloat8Weight",
                     "autoquant",
                 ],
                 value="int8_weight_only",
```
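Note that the unchanged default `value="int8_weight_only"` no longer matches any of the new CamelCase choices. A minimal `gr.Dropdown` sketch with a default drawn from the new list; the corrected default is an assumption, not something this PR contains:

```python
import gradio as gr

quantization_type = gr.Dropdown(
    info="Select the Quantization method",
    choices=["Int4WeightOnly", "Int8WeightOnly", "autoquant"],  # abbreviated list
    value="Int8WeightOnly",  # assumed default; gr.Dropdown expects value to be one of choices
)
```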
```diff
@@ -549,11 +551,18 @@ with gr.Blocks(css=css) as demo:
     ## Quantization Options
 
     ### Quantization Types
-    - **int4_weight_only**: 4-bit weight-only quantization
-    - **int8_weight_only**: 8-bit weight-only quantization
-    - **int8_dynamic_activation_int8_weight**: 8-bit quantization for both weights and activations
-    - **float8_weight_only**: float8-bit weight-only quantization
-    - **float8_dynamic_activation_float8_weight**: float8-bit quantization for both weights and activations
+    "Int4WeightOnly",
+    "GemliteUIntXWeightOnly",
+    "Int8WeightOnly",
+    "Int8DynamicActivationInt8Weight",
+    "Float8WeightOnly",
+    "Float8DynamicActivationFloat8Weight",
+    - **Int4WeightOnly**: 4-bit weight-only quantization
+    - **GemliteUIntXWeightOnly**: uintx gemlite quantization (default to 4 bit only for now)
+    - **Int8WeightOnly**: 8-bit weight-only quantization
+    - **Int8DynamicActivationInt8Weight**: 8-bit quantization for both weights and activations
+    - **Float8WeightOnly**: float8-bit weight-only quantization
+    - **Float8DynamicActivationFloat8Weight**: float8-bit quantization for both weights and activations
     - **autoquant**: automatic quantization (uses the best quantization method for the model)
 
     ### Group Size
```