Build (AArch64)
- build/torch26-cxx11-cu126-aarch64-linux/quantization/__init__.py +2 -2
- build/torch26-cxx11-cu126-aarch64-linux/quantization/_ops.py +3 -3
- build/torch26-cxx11-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so} +2 -2
- build/torch26-cxx11-cu126-aarch64-linux/quantization/compressed_tensors.py +34 -33
- build/torch26-cxx11-cu126-aarch64-linux/quantization/cutlass.py +10 -16
- build/torch26-cxx11-cu126-aarch64-linux/quantization/marlin.py +40 -74
- build/torch26-cxx11-cu126-aarch64-linux/quantization/platforms.py +69 -0
- build/torch26-cxx11-cu126-aarch64-linux/quantization/scalar_type.py +19 -2
- build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils.py +231 -170
- build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py +282 -0
- build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py +90 -68
- build/torch26-cxx98-cu126-aarch64-linux/quantization/__init__.py +2 -2
- build/torch26-cxx98-cu126-aarch64-linux/quantization/_ops.py +3 -3
- build/torch26-cxx98-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so} +2 -2
- build/torch26-cxx98-cu126-aarch64-linux/quantization/compressed_tensors.py +34 -33
- build/torch26-cxx98-cu126-aarch64-linux/quantization/cutlass.py +10 -16
- build/torch26-cxx98-cu126-aarch64-linux/quantization/marlin.py +40 -74
- build/torch26-cxx98-cu126-aarch64-linux/quantization/platforms.py +69 -0
- build/torch26-cxx98-cu126-aarch64-linux/quantization/scalar_type.py +19 -2
- build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils.py +231 -170
- build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py +282 -0
- build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py +90 -68
- build/torch27-cxx11-cu126-aarch64-linux/quantization/__init__.py +2 -2
- build/torch27-cxx11-cu126-aarch64-linux/quantization/_ops.py +3 -3
- build/torch27-cxx11-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so} +2 -2
- build/torch27-cxx11-cu126-aarch64-linux/quantization/compressed_tensors.py +34 -33
- build/torch27-cxx11-cu126-aarch64-linux/quantization/cutlass.py +10 -16
- build/torch27-cxx11-cu126-aarch64-linux/quantization/marlin.py +40 -74
- build/torch27-cxx11-cu126-aarch64-linux/quantization/platforms.py +69 -0
- build/torch27-cxx11-cu126-aarch64-linux/quantization/scalar_type.py +19 -2
- build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils.py +231 -170
- build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py +282 -0
- build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py +90 -68
- build/torch27-cxx11-cu128-aarch64-linux/quantization/__init__.py +2 -2
- build/torch27-cxx11-cu128-aarch64-linux/quantization/_ops.py +3 -3
- build/torch27-cxx11-cu128-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so} +2 -2
- build/torch27-cxx11-cu128-aarch64-linux/quantization/compressed_tensors.py +34 -33
- build/torch27-cxx11-cu128-aarch64-linux/quantization/cutlass.py +10 -16
- build/torch27-cxx11-cu128-aarch64-linux/quantization/marlin.py +40 -74
- build/torch27-cxx11-cu128-aarch64-linux/quantization/platforms.py +69 -0
- build/torch27-cxx11-cu128-aarch64-linux/quantization/scalar_type.py +19 -2
- build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils.py +231 -170
- build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils_fp4.py +282 -0
- build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils_fp8.py +90 -68
build/torch26-cxx11-cu126-aarch64-linux/quantization/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 from .compressed_tensors import scaled_fp8_quant, scaled_int8_quant
 from .cutlass import (
+    cutlass_scaled_mm_supports_block_fp8,
     cutlass_scaled_mm_supports_fp8,
     cutlass_scaled_mm,
     cutlass_scaled_mm_azp,
 )
 from .marlin import (
     awq_marlin_repack,
-    fp8_marlin_gemm,
     gptq_marlin_gemm,
     gptq_marlin_repack,
     gptq_marlin_24_gemm,
@@ -25,8 +25,8 @@ __all__ = [
     "awq_marlin_repack",
     "cutlass_scaled_mm",
     "cutlass_scaled_mm_azp",
+    "cutlass_scaled_mm_supports_block_fp8",
     "cutlass_scaled_mm_supports_fp8",
-    "fp8_marlin_gemm",
     "gptq_marlin_24_gemm",
     "gptq_marlin_gemm",
     "gptq_marlin_repack",
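In short, fp8_marlin_gemm is dropped from the public API and a block-FP8 CUTLASS capability probe is added. A small sketch of probing the new export (the `quantization` package name matches the `import quantization as ops` usage later in this build; a CUDA device and the installed wheel are assumed):

    import torch
    from quantization import (cutlass_scaled_mm_supports_fp8,
                              cutlass_scaled_mm_supports_block_fp8)

    # The CUTLASS capability probes take the device capability as <major><minor>.
    major, minor = torch.cuda.get_device_capability()
    capability = major * 10 + minor
    print("fp8 cutlass_scaled_mm:", cutlass_scaled_mm_supports_fp8(capability))
    print("block-fp8 cutlass_scaled_mm:",
          cutlass_scaled_mm_supports_block_fp8(capability))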
build/torch26-cxx11-cu126-aarch64-linux/quantization/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_0435ccb
-ops = torch.ops._quantization_0435ccb
+from . import _quantization_9035540
+ops = torch.ops._quantization_9035540

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_0435ccb::{op_name}"
+    return f"_quantization_9035540::{op_name}"
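The helper simply prepends the versioned extension namespace, which the fake-op registrations in marlin.py rely on. An illustrative check (assumes the built wheel is installed so the extension module can load):

    from quantization._ops import add_op_namespace_prefix

    assert add_op_namespace_prefix("gptq_marlin_gemm") == \
        "_quantization_9035540::gptq_marlin_gemm"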
build/torch26-cxx11-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:aee128710f3a8587386120a226a6caddd5e77cd7a0296a1f7fad51b4028550b1
+size 159934120
build/torch26-cxx11-cu126-aarch64-linux/quantization/compressed_tensors.py
CHANGED
Summary of changes:
- The try/except ImportError fallback that imported a locally built `_quantization` module is removed; the op table is now imported directly with `from ._ops import ops`.
- `scaled_fp8_quant(...)` gains an optional preallocated `output` tensor argument and an explicit `tuple[torch.Tensor, torch.Tensor]` return annotation. The FP8 output dtype now comes from `current_platform.fp8_dtype()` instead of a hard-coded `torch.float8_e4m3fn`, so ROCm (MI300) builds can use their own FP8 format.
- When `output` is supplied, `num_token_padding` must be None and `output.dtype` must match the platform FP8 dtype; otherwise the output is allocated as `torch.empty(shape, device=input.device, dtype=out_dtype)`.
- The dynamic per-token path allocates a `(num_tokens, 1)` float32 scale tensor and calls `ops.dynamic_per_token_scaled_fp8_quant(output, input.contiguous(), scale, scale_ub)`; the static path asserts `scale.numel() == 1 and num_token_padding is None`.
- `scaled_int8_quant(...)` is annotated to return `tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]` (int8 output, scales, optional azp). The dynamic per-token path allocates `input_scales` of shape `(input.numel() // input.shape[-1], 1)` plus an optional int32 `input_azp`, and calls `ops.dynamic_scaled_int8_quant(output, input.contiguous(), input_scales, input_azp)`.
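A minimal usage sketch of the updated quantization entry points (hypothetical shapes; assumes a CUDA GPU and the installed `quantization` wheel):

    import torch
    from quantization import scaled_fp8_quant, scaled_int8_quant

    x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")

    # Dynamic per-tensor FP8 quantization (scale computed by the kernel).
    q, scale = scaled_fp8_quant(x)

    # Dynamic per-token FP8 quantization into a caller-provided buffer.
    out = torch.empty_like(x, dtype=q.dtype)
    q, per_token_scale = scaled_fp8_quant(
        x, use_per_token_if_dynamic=True, output=out)

    # Dynamic per-token INT8 quantization (symmetric, so azp is None).
    q8, scales8, azp = scaled_int8_quant(x)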
build/torch26-cxx11-cu126-aarch64-linux/quantization/cutlass.py
CHANGED
Summary of changes:
- The try/except ImportError fallback is removed; the module now imports `from ._ops import ops` and `from .platforms import current_platform`.
- New probe `cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool`, forwarding to `ops.cutlass_scaled_mm_supports_block_fp8`.
- `cutlass_scaled_mm` now computes `cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)` and, when the B matrix is not CUTLASS-compatible, falls back to `triton_scaled_mm` from `.triton_scaled_mm` (this fallback previously existed only as a comment).
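The fallback decision depends only on the B matrix shape. A standalone restatement of the predicate, for illustration:

    import torch

    def b_is_cutlass_compatible(b: torch.Tensor) -> bool:
        # cutlass_scaled_mm needs both B dimensions to be multiples of 16;
        # otherwise the wrapper now dispatches to the Triton scaled-mm kernel.
        return b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0

    print(b_is_cutlass_compatible(torch.empty(4096, 4096)))  # True
    print(b_is_cutlass_compatible(torch.empty(4096, 4100)))  # False -> Triton path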
build/torch26-cxx11-cu126-aarch64-linux/quantization/marlin.py
CHANGED
Summary of changes:
- `from typing import TYPE_CHECKING` becomes `from typing import TYPE_CHECKING, Optional`.
- The `fp8_marlin_gemm` wrapper and its `_fp8_marlin_gemm_fake` registration are removed.
- `gptq_marlin_gemm` gets an extended signature: `gptq_marlin_gemm(a, c: Optional[torch.Tensor], b_q_weight, b_scales, global_scale: Optional[torch.Tensor], b_zeros: Optional[torch.Tensor], g_idx: Optional[torch.Tensor], perm: Optional[torch.Tensor], workspace, b_q_type: ScalarType, size_m, size_n, size_k, is_k_full=True, use_atomic_add=False, use_fp32_reduce=False, is_zp_float=False)`. It forwards to `ops.gptq_marlin_gemm(..., b_q_type.id, ...)`; the previous boolean `has_zp` argument is gone, since zero points and the global scale are now passed (or omitted) as optional tensors.
- `_gptq_marlin_gemm_fake` is re-registered with the matching argument list (taking `b_q_type_id: int` and `torch.SymInt` sizes) and still returns an empty `(size_m, size_n)` tensor with the input's dtype and device, so shape inference under torch.compile keeps working.
build/torch26-cxx11-cu126-aarch64-linux/quantization/platforms.py
ADDED
@@ -0,0 +1,69 @@
from abc import ABC, abstractmethod
from functools import lru_cache
from typing import NamedTuple

import torch

IS_ROCM = torch.version.hip is not None


class DeviceCapability(NamedTuple):
    major: int
    minor: int

    def as_version_str(self) -> str:
        return f"{self.major}.{self.minor}"

    def to_int(self) -> int:
        """
        Express device capability as an integer ``<major><minor>``.

        It is assumed that the minor version is always a single digit.
        """
        assert 0 <= self.minor < 10
        return self.major * 10 + self.minor


class Platform(ABC):
    simple_compile_backend: str = "inductor"

    @classmethod
    @abstractmethod
    def get_device_name(cls, device_id: int = 0) -> str: ...

    @abstractmethod
    def is_rocm(self): ...


class CudaPlatform(Platform):
    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(0)

    def is_rocm(self):
        return False


class RocmPlatform(Platform):
    @classmethod
    @lru_cache(maxsize=8)
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(device_id)

    def is_rocm(self):
        return True


current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
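A small sketch of how the rest of the package can use this abstraction (module path follows this build's layout; a GPU visible to torch is assumed):

    from quantization.platforms import current_platform

    if current_platform.is_rocm():
        print("ROCm build:", current_platform.get_device_name())
    else:
        cap = current_platform.get_device_capability()
        print(f"CUDA build, SM {cap.as_version_str()} (int form {cap.to_int()})")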
build/torch26-cxx11-cu126-aarch64-linux/quantization/scalar_type.py
CHANGED
Summary of changes:
- SPDX license headers (`Apache-2.0`, vLLM project copyright) are added at the top of the file.
- A module-level registry `_SCALAR_TYPES_ID_MAP = {}` is introduced; computing a type's `id` now records `_SCALAR_TYPES_ID_MAP[val] = self` before returning, and a new classmethod `ScalarType.from_id(scalar_type_id)` looks the type back up, raising `ValueError` for unknown ids.
- The overflow assertion in the signed `min()` branch is restored as a single statement: `assert (not self.is_signed() or self.size_bits <= 64), "Cannot represent min as a int64_t"`.
- A new FP4 type is added alongside the fp6 types: `float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)`, following the OCP microscaling formats spec (https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
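The id registry makes a `ScalarType` round-trippable through the integer id that the C++ ops receive. A minimal sketch (module path assumed from this build's layout):

    from quantization.scalar_type import ScalarType, scalar_types

    t = scalar_types.float4_e2m1f
    type_id = t.id                       # integer form handed to the kernels
    assert ScalarType.from_id(type_id) == t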
build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils.py
CHANGED
Summary of changes:
- SPDX license headers and `from typing import Optional` are added at the top.
- `query_marlin_supported_quant_types(has_zp: Optional[bool] = None, include_fp_type: bool = True, device_capability: Optional[int] = None)`: passing `has_zp=None` now returns the union of the zero-point and non-zero-point type lists. The AWQ-style branch returns `[scalar_types.uint4]`; the GPTQ-style branch returns `[scalar_types.uint4b8, scalar_types.uint8b128]` and, when `include_fp_type` is set, also `[scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]`.
- `_check_marlin_supported` now returns `tuple[bool, Optional[str]]`, reports group_size, device_capability and zero-point mode in its error message, and rejects `group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES`.
- `check_marlin_supported`, `verify_marlin_supported`, `verify_marlin_supports_shape`, `check_marlin_supports_shape`, `marlin_make_workspace`, `marlin_repeat_scales_on_all_ranks`, `marlin_make_empty_g_idx`, `marlin_make_empty_zp`, `marlin_sort_g_idx`, `get_scale_perms`, `marlin_permute_scales`, `marlin_zero_points`, `awq_to_marlin_zero_points` and `moe_awq_to_marlin_zero_points` keep their behavior but are re-typed and reformatted (e.g. `tuple[...]` return annotations, `list[int]` scale-permutation buffers, clearer error messages).
- New `marlin_make_workspace_new(device, max_blocks_per_sm=1)` sizes the workspace from the device's streaming-multiprocessor count times `max_blocks_per_sm` instead of from `output_size_per_partition`.
- New helpers `maybe_warn_marlin_atomic_add`, `maybe_warn_marlin_atomic_add_env` and `should_use_atomic_add_reduce(m, n, k, device, dtype)`. The atomicAdd reduction is used only when n < 2048, k >= 2048, the device is CUDA, the `VLLM_MARLIN_USE_ATOMIC_ADD` environment flag is enabled, and the dtype is not bfloat16 on pre-SM90 GPUs (sm8x has no native bf16 atomicAdd); otherwise an informational message may be logged.
- `apply_gptq_marlin_linear` and `apply_awq_marlin_linear` are rewritten to call the new `ops.gptq_marlin_gemm` signature (passing `c=None`, `global_scale=None`, the computed `use_atomic_add`, `use_fp32_reduce`, and `is_zp_float=False`), then apply the optional bias in place and reshape to `input.shape[:-1] + (output_size_per_partition,)`.
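A hedged sketch of how the new workspace and atomic-add helpers fit together (hypothetical shapes; assumes a CUDA device, the installed wheel, and that the module's VLLM_MARLIN_USE_ATOMIC_ADD plumbing is available in this build):

    import torch
    from quantization.utils.marlin_utils import (
        marlin_make_workspace_new, should_use_atomic_add_reduce)

    device = torch.device("cuda")

    # The workspace is now sized from the SM count, not the output partition.
    workspace = marlin_make_workspace_new(device)

    # Heuristic: the atomicAdd reduction only pays off for small n and large k,
    # and only when VLLM_MARLIN_USE_ATOMIC_ADD is enabled.
    use_atomic_add = should_use_atomic_add_reduce(
        m=16, n=1024, k=8192, device=device, dtype=torch.float16)
    print(workspace.numel(), use_atomic_add)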
build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py
ADDED
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# SPDX-License-Identifier: Apache-2.0
|
2 |
+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
3 |
+
|
4 |
+
from typing import Optional
|
5 |
+
|
6 |
+
import torch
|
7 |
+
|
8 |
+
import quantization as ops
|
9 |
+
|
10 |
+
from .marlin_utils import (
|
11 |
+
USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
|
12 |
+
should_use_atomic_add_reduce)
|
13 |
+
from quantization.scalar_type import scalar_types
|
14 |
+
|
15 |
+
FP4_MARLIN_SUPPORTED_GROUP_SIZES = [16]
|
16 |
+
|
17 |
+
|
18 |
+
def is_fp4_marlin_supported():
|
19 |
+
capability = torch.cuda.get_device_capability()
|
20 |
+
capability = capability[0] * 10 + capability[1]
|
21 |
+
return capability >= 80
|
22 |
+
|
23 |
+
|
24 |
+
def fp4_marlin_process_scales(marlin_scales):
|
25 |
+
if not (marlin_scales >= 0).all():
|
26 |
+
logger.warning_once(
|
27 |
+
"NVFP4 Marlin assumes the scales to be >=0, but has encountered "
|
28 |
+
"negative scales. Accuracy will likely be degraded. This is "
|
29 |
+
"because it changes the scales from FP8-S1E4M3 to a special "
|
30 |
+
"FP8-S0E5M3 format to speedup the dequantization.")
|
31 |
+
|
32 |
+
# convert to half first, we would convert to fp8 later
|
33 |
+
marlin_scales = marlin_scales.to(torch.half)
|
34 |
+
|
35 |
+
# 8 is the number of scale number using by one thread
|
36 |
+
marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
|
37 |
+
marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
|
38 |
+
marlin_scales.size(0) * 2, -1)
|
39 |
+
|
40 |
+
# fit the layout of fp8 dequantization
|
41 |
+
marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
|
42 |
+
marlin_scales.size(0), -1)
|
43 |
+
|
44 |
+
# We assume that weight_scale (FP8-S1E4M3) is always greater
|
45 |
+
# than or equal to 0. So we can convert
|
46 |
+
# (weight_scale * (2 ** 7) to a special FP8-S0E5M3 format.
|
47 |
+
# After multiplying by 2 ** 7, the top bit of FP8-S0E5M3 would always be 1
|
48 |
+
# when weight_scale > 0. This allows us to have an exponent bias
|
49 |
+
# closer to zero after dequantization.
|
50 |
+
|
51 |
+
marlin_scales = (marlin_scales * (2**7)).view(torch.int16) << 1
|
52 |
+
marlin_scales = marlin_scales.view(torch.float8_e4m3fn)
|
53 |
+
marlin_scales = marlin_scales[:, 1::2].contiguous()
|
54 |
+
|
55 |
+
return marlin_scales
|
56 |
+
|
57 |
+
|
58 |
+
def fp4_marlin_process_global_scale(global_scale):
|
59 |
+
assert global_scale.dtype in [torch.half, torch.bfloat16]
|
60 |
+
fp4_exponent = 2
|
61 |
+
if global_scale.dtype == torch.half:
|
62 |
+
target_exponent = 5
|
63 |
+
elif global_scale.dtype == torch.bfloat16:
|
64 |
+
target_exponent = 8
|
65 |
+
# exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
|
66 |
+
# exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
|
67 |
+
exponent_bias = 2**(target_exponent - 1) - 2**(fp4_exponent - 1)
|
68 |
+
return global_scale * (2.0**(exponent_bias - 7))
|
69 |
+
|
70 |
+
|
71 |
+
def apply_fp4_marlin_linear(
|
72 |
+
input: torch.Tensor,
|
73 |
+
weight: torch.Tensor,
|
74 |
+
weight_scale: torch.Tensor,
|
75 |
+
weight_scale_2: torch.Tensor,
|
76 |
+
workspace: torch.Tensor,
|
77 |
+
size_n: int,
|
78 |
+
size_k: int,
|
79 |
+
bias: Optional[torch.Tensor] = None,
|
80 |
+
use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
|
81 |
+
# For GPUs that lack FP4 hardware support, we can leverage the
|
82 |
+
# Marlin kernel for fast weight-only FP4 quantization
|
83 |
+
|
84 |
+
reshaped_x = input.reshape(-1, input.shape[-1])
|
85 |
+
out_shape = input.shape[:-1] + (size_n, )
|
86 |
+
|
87 |
+
use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
|
88 |
+
n=size_n,
|
89 |
+
k=size_k,
|
90 |
+
device=input.device,
|
91 |
+
dtype=input.dtype)
|
92 |
+
|
93 |
+
output = ops.gptq_marlin_gemm(a=reshaped_x,
|
94 |
+
c=None,
|
95 |
+
b_q_weight=weight,
|
96 |
+
b_scales=weight_scale,
|
97 |
+
global_scale=weight_scale_2,
|
98 |
+
b_zeros=None,
|
99 |
+
g_idx=None,
|
100 |
+
perm=None,
|
101 |
+
workspace=workspace,
|
102 |
+
b_q_type=scalar_types.float4_e2m1f,
|
103 |
+
size_m=reshaped_x.size(0),
|
104 |
+
size_n=size_n,
|
105 |
+
size_k=size_k,
|
106 |
+
use_atomic_add=use_atomic_add,
|
107 |
+
use_fp32_reduce=use_fp32_reduce)
|
108 |
+
|
109 |
+
if bias is not None:
|
110 |
+
output.add_(bias) # In-place add
|
111 |
+
|
112 |
+
return output.reshape(out_shape)
|
113 |
+
|
114 |
+
|
115 |
+
def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
|
116 |
+
logger.warning_once(
|
117 |
+
"Your GPU does not have native support for FP4 computation but "
|
118 |
+
"FP4 quantization is being used. Weight-only FP4 compression will "
|
119 |
+
"be used leveraging the Marlin kernel. This may degrade "
|
120 |
+
"performance for compute-heavy workloads.")
|
121 |
+
|
122 |
+
part_size_n = layer.output_size_per_partition
|
123 |
+
part_size_k = layer.input_size_per_partition
|
124 |
+
param_dtype = layer.params_dtype
|
125 |
+
|
126 |
+
assert layer.weight.shape == (part_size_n, part_size_k // 2)
|
127 |
+
|
128 |
+
device = layer.weight.device
|
129 |
+
|
130 |
+
# WORKSPACE
|
131 |
+
layer.workspace = marlin_make_workspace_new(device)
|
132 |
+
|
133 |
+
# WEIGHT
|
134 |
+
+    # Repack weights to marlin format
+    perm = torch.empty(0, dtype=torch.int, device=device)
+    qweight = layer.weight.view(torch.int32).T.contiguous()
+
+    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
+                                            perm=perm,
+                                            size_k=part_size_k,
+                                            size_n=part_size_n,
+                                            num_bits=4)
+    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
+
+    # WEIGHT SCALES
+    # Permute scales
+    weight_scale = layer.weight_scale.T.to(param_dtype)
+    weight_scale = marlin_permute_scales(s=weight_scale,
+                                         size_k=part_size_k,
+                                         size_n=part_size_n,
+                                         group_size=16)
+    weight_scale = fp4_marlin_process_scales(weight_scale)
+    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
+
+    weight_scale_2 = layer.weight_scale_2.to(param_dtype)
+    weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
+    layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
+                                              requires_grad=False)
+
+    return
+
+
+def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
+    logger.warning_once(
+        "Your GPU does not have native support for FP4 computation but "
+        "FP4 quantization is being used. Weight-only FP4 compression will "
+        "be used leveraging the Marlin kernel. This may degrade "
+        "performance for compute-heavy workloads.")
+
+    e = layer.num_experts
+    k = layer.hidden_size
+    n = layer.intermediate_size_per_partition
+
+    # WORKSPACE
+    device = layer.w13_weight.device
+    param_dtype = layer.params_dtype
+    layer.workspace = marlin_make_workspace_new(device, 4)
+    perm = torch.empty(0, dtype=torch.int, device=device)
+
+    # WEIGHT
+    # Repack weights to marlin format
+    for name in ["w13_weight", "w2_weight"]:
+        weight = getattr(layer, name)
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = n * 2, k
+        else:
+            size_n, size_k = k, n
+
+        assert weight.shape == (e, size_n, size_k // 2)
+
+        for i in range(e):
+            qweight = weight[i].view(torch.int32).T.contiguous()
+
+            marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
+                                                    perm=perm,
+                                                    size_k=size_k,
+                                                    size_n=size_n,
+                                                    num_bits=4)
+            tensor_list.append(marlin_qweight)
+
+        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        weight = torch.nn.Parameter(weight, requires_grad=False)
+
+        setattr(layer, name, weight)
+
+    # WEIGHT SCALES
+    # Permute scales
+    for name in ["w13", "w2"]:
+        scales = getattr(layer, name + "_weight_scale").to(param_dtype)
+        global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)
+
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = n * 2, k
+        else:
+            size_n, size_k = k, n
+
+        for i in range(e):
+            marlin_scales = marlin_permute_scales(s=scales[i].T,
+                                                  size_k=size_k,
+                                                  size_n=size_n,
+                                                  group_size=16)
+            marlin_scales = fp4_marlin_process_scales(marlin_scales)
+            tensor_list.append(marlin_scales)
+
+        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        scales = torch.nn.Parameter(scales, requires_grad=False)
+        setattr(layer, name + "_weight_scale", scales)
+
+        global_scale = fp4_marlin_process_global_scale(global_scale)
+        global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
+        setattr(layer, name + "_weight_scale_2", global_scale)
+
+
+def rand_marlin_weight_fp4_like(weight, group_size):
+    assert group_size > 0
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 6
+    global_scale = scales.max() / 448
+    scales = (scales / global_scale).to(torch.float8_e4m3fn)
+
+    fp4_weight = torch.randint(0,
+                               256, (size_n, size_k // 2),
+                               dtype=torch.uint8,
+                               device=weight.device)
+    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
+                         ((fp4_weight & 0b01110000) >> 2))
+    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
+    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)
+
+    fp4_weight2 = fp4_weight << 4
+    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
+                         ((fp4_weight2 & 0b01110000) >> 2))
+    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
+    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)
+
+    weight_ref = torch.cat(
+        [fp4_weight_part_2.unsqueeze(2),
+         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
+    weight_ref = weight_ref * global_scale.to(weight.dtype) * \
+        scales.repeat_interleave(group_size, 1).to(weight.dtype)
+
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=4,
+    )
+
+    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
+                                          size_k=size_k,
+                                          size_n=size_n,
+                                          group_size=group_size)
+    marlin_scales = fp4_marlin_process_scales(marlin_scales)
+
+    global_scale = fp4_marlin_process_global_scale(global_scale)
+
+    return weight_ref.T, marlin_qweight, marlin_scales, global_scale
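Note (not part of the diff above): rand_marlin_weight_fp4_like derives its scales in two stages — per-group scales target the FP4 (e2m1) maximum magnitude of 6, and one global scale keeps the per-group scales inside the FP8 (e4m3) range of 448. A minimal standalone sketch of just that scale math, with illustrative tensor shapes (this is a sketch, not code from the diff):

import torch

group_size = 16
w = torch.randn(4, 64, dtype=torch.half)                  # (size_n, size_k)
scales = w.view(4, -1, group_size).abs().max(-1)[0] / 6   # one scale per group of 16 values
global_scale = scales.max() / 448                         # keeps per-group scales in fp8 range
scales_fp8 = (scales / global_scale).to(torch.float8_e4m3fn)
print(scales_fp8.shape)                                   # torch.Size([4, 4])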
build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py
CHANGED
@@ -1,10 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 from typing import Optional

 import torch

 import quantization as ops

-from .marlin_utils import marlin_make_workspace, marlin_permute_scales
+from .marlin_utils import USE_FP32_REDUCE_DEFAULT, marlin_make_workspace, marlin_permute_scales


 def is_fp8_marlin_supported():
@@ -13,88 +16,107 @@ def is_fp8_marlin_supported():
     return capability >= 80


+def fp8_fused_exponent_bias_into_scales(scales):
+    fp8_exponent = 4
+    if scales.dtype == torch.half:
+        target_exponent = 5
+    elif scales.dtype == torch.bfloat16:
+        target_exponent = 8
+    # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
+    # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
+    exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
+    s = torch.ones_like(scales) * 2
+    s = s**exponent_bias
+    return scales * s
+
+
 def apply_fp8_marlin_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        workspace: torch.Tensor,
+        size_n: int,
+        size_k: int,
+        bias: Optional[torch.Tensor],
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     # For GPUs that lack FP8 hardware support, we can leverage the
     # Marlin kernel for fast weight-only FP8 quantization

     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (size_n,)
+    out_shape = input.shape[:-1] + (size_n, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=size_n,
+                                                  k=size_k,
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(a=reshaped_x,
+                                  c=None,
+                                  b_q_weight=weight,
+                                  b_scales=weight_scale,
+                                  global_scale=None,
+                                  b_zeros=None,
+                                  g_idx=None,
+                                  perm=None,
+                                  workspace=workspace,
+                                  b_q_type=scalar_types.float8_e4m3fn,
+                                  size_m=reshaped_x.size(0),
+                                  size_n=size_n,
+                                  size_k=size_k,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce)

     if bias is not None:
         output.add_(bias)  # In-place add

     return output.reshape(out_shape)

-        layer: torch.nn.Module, strategy: str = "tensor"
-) -> None:
-    part_size_n = layer.output_size_per_partition
-    part_size_k = layer.input_size_per_partition
-
-    device = layer.weight.device
-
-    # WORKSPACE
-    layer.workspace = marlin_make_workspace(part_size_n, device)
-
-    # WEIGHT
-    # Repack weights to marlin format
-    marlin_qweight = ops.gptq_marlin_repack(
-        b_q_weight=pack_fp8_to_int32(layer.weight),
-        perm=torch.empty(0, dtype=torch.int, device=device),
-        size_k=part_size_k,
-        size_n=part_size_n,
-        num_bits=8,
-    )
-    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
-
-    # WEIGHT SCALES
-    scales = layer.weight_scale.to(layer.orig_dtype)
-    # Permute scales
-    marlin_scales = marlin_permute_scales(
-        s=scales, size_k=part_size_k, size_n=part_size_n, group_size=-1
-    )
-    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
-
-
-def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
+def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
+                      size_k_first: bool = True) -> torch.Tensor:
     """
     Repack FP8 weights to gptq format (packed int32 elements)
     """
     assert fp8_tensor.dtype == torch.float8_e4m3fn
-    packed = (
-        byte_tensor[:, 0].to(torch.int32)
-        | (byte_tensor[:, 1].to(torch.int32) << 8)
-        | (byte_tensor[:, 2].to(torch.int32) << 16)
-        | (byte_tensor[:, 3].to(torch.int32) << 24)
-    )
+    assert fp8_tensor.ndim == 2
+
+    fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
+    fp8_tensor = fp8_tensor.contiguous()
+    # fp8_tensor is contiguous and have shape (N, K) now
+    # with `.view(torch.int32)`, it become (N, K // 4)
+    int32_tensor = fp8_tensor.view(torch.int32)
+    return int32_tensor.T.contiguous() if size_k_first else int32_tensor
+
+
+def marlin_quant_fp8_torch(weight, group_size):
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    if group_size != -1:
+        scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(group_size, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+    else:
+        scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(size_k, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+
+    packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=packed_weight,
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=8,
+    )
+
+    marlin_scales = marlin_permute_scales(s=scales.T,
+                                          size_k=size_k,
+                                          size_n=size_n,
+                                          group_size=group_size)
+
+    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
+
+    return weight_ref.T, marlin_qweight, marlin_scales
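Note (not part of the diff above): the new pack_fp8_to_int32 relies on dtype reinterpretation instead of manual byte shifts — four consecutive float8_e4m3fn values share the storage of one int32. A minimal sketch of that idea, assuming only a PyTorch build with float8 support:

import torch

t = torch.randn(8, 16).to(torch.float8_e4m3fn)   # (N, K), K divisible by 4
packed = t.contiguous().view(torch.int32)        # (N, K // 4): 4 fp8 bytes per int32
assert packed.shape == (8, 4)
# Reinterpreting back recovers exactly the same bytes (compared via a uint8 view).
assert torch.equal(packed.view(torch.float8_e4m3fn).view(torch.uint8),
                   t.view(torch.uint8))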
build/torch26-cxx98-cu126-aarch64-linux/quantization/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 from .compressed_tensors import scaled_fp8_quant, scaled_int8_quant
 from .cutlass import (
+    cutlass_scaled_mm_supports_block_fp8,
     cutlass_scaled_mm_supports_fp8,
     cutlass_scaled_mm,
     cutlass_scaled_mm_azp,
 )
 from .marlin import (
     awq_marlin_repack,
-    fp8_marlin_gemm,
     gptq_marlin_gemm,
     gptq_marlin_repack,
     gptq_marlin_24_gemm,
@@ -25,8 +25,8 @@ __all__ = [
     "awq_marlin_repack",
     "cutlass_scaled_mm",
     "cutlass_scaled_mm_azp",
+    "cutlass_scaled_mm_supports_block_fp8",
     "cutlass_scaled_mm_supports_fp8",
-    "fp8_marlin_gemm",
     "gptq_marlin_24_gemm",
     "gptq_marlin_gemm",
     "gptq_marlin_repack",
build/torch26-cxx98-cu126-aarch64-linux/quantization/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_0435ccb
-ops = torch.ops._quantization_0435ccb
+from . import _quantization_9035540
+ops = torch.ops._quantization_9035540

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_0435ccb::{op_name}"
+    return f"_quantization_9035540::{op_name}"
build/torch26-cxx98-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3685a434362226370f1956f59790a58d2f4c8999f9f35acafd25ca9d73bfc5ae
+size 159991696
build/torch26-cxx98-cu126-aarch64-linux/quantization/compressed_tensors.py
CHANGED
@@ -2,17 +2,7 @@ from typing import Optional, Tuple

 import torch

-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
-
+from ._ops import ops

 # fp8
 def scaled_fp8_quant(
@@ -21,7 +11,8 @@ def scaled_fp8_quant(
     num_token_padding: Optional[int] = None,
     scale_ub: Optional[torch.Tensor] = None,
     use_per_token_if_dynamic: bool = False,
+    output: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP8 and return quantized tensor and scale.
@@ -42,30 +33,36 @@ def scaled_fp8_quant(
        in the dynamic quantization case.

    Returns:
+        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
         scaling factor.
    """
    # This code assumes batch_dim and num_tokens are flattened
-    assert input.ndim == 2
-    # if current_platform.is_rocm() else torch.float8_e4m3fn
-    out_dtype = torch.float8_e4m3fn
+    assert (input.ndim == 2)
+    shape: Union[tuple[int, int], torch.Size] = input.shape
+    # For ROCm on MI300, the output fp8 dtype is torch.float_e3m3fnuz
+    out_dtype: torch.dtype = current_platform.fp8_dtype()
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
+    if output is None:
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    else:
+        assert num_token_padding is None, \
+            "padding not supported if output passed in"
+        assert output.dtype == out_dtype

    if scale is None:
        if use_per_token_if_dynamic:
+            scale = torch.empty((shape[0], 1),
+                                device=input.device,
+                                dtype=torch.float32)
+            ops.dynamic_per_token_scaled_fp8_quant(
+                output, input.contiguous(), scale, scale_ub)
        else:
            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
            ops.dynamic_scaled_fp8_quant(output, input, scale)
    else:
        # num_token_padding not implemented for this case
-        assert scale.numel() == 1
+        assert (scale.numel() == 1 and num_token_padding is None)
        ops.static_scaled_fp8_quant(output, input, scale)

    return output, scale
@@ -76,8 +73,8 @@ def scaled_int8_quant(
    input: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    azp: Optional[torch.Tensor] = None,
    symmetric: bool = True
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    """
    Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.
@@ -90,21 +87,25 @@ def scaled_int8_quant(
    symmetric: Whether to use symmetric quantization (scale only, azp ignored).

    Returns:
+        tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
    """
    output = torch.empty_like(input, dtype=torch.int8)
    if scale is not None:
        # static-per-tensor quantization.
        assert symmetric == (
            azp
+            is None), "azp must only be provided for asymmetric quantization."
        ops.static_scaled_int8_quant(output, input, scale, azp)
        return output, scale, azp

    # dynamic-per-token quantization.
-    input_scales = torch.empty(
-    input_azp = None if symmetric else torch.empty_like(input_scales,
+    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
+                               device=input.device,
+                               dtype=torch.float32)
+    input_azp = None if symmetric else torch.empty_like(input_scales,
+                                                        dtype=torch.int32)
+    ops.dynamic_scaled_int8_quant(output, input.contiguous(),
+                                  input_scales, input_azp)
    return output, input_scales, input_azp
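Note (not part of the diff above): the dynamic-per-token path added to scaled_int8_quant allocates one scale per token and lets the custom op fill both tensors. A rough pure-PyTorch restatement of the symmetric case, as an approximation only (it does not reproduce the op's exact rounding behavior):

import torch

x = torch.randn(8, 64)
scales = x.abs().amax(dim=-1, keepdim=True) / 127.0     # one scale per token (row)
q = torch.clamp((x / scales).round(), -128, 127).to(torch.int8)
assert q.shape == x.shape and scales.shape == (8, 1)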
build/torch26-cxx98-cu126-aarch64-linux/quantization/cutlass.py
CHANGED
@@ -2,22 +2,18 @@ from typing import Optional

 import torch

-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
+from ._ops import ops
+from .platforms import current_platform


 def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)


+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+    return ops.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
+
+
 def cutlass_scaled_mm(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -33,12 +29,10 @@ def cutlass_scaled_mm(
     m = a.shape[0]
     n = b.shape[1]

-    # triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-    # return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    if not cutlass_compatible_b:
+        from .triton_scaled_mm import triton_scaled_mm
+        return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
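Note (not part of the diff above): the new guard in cutlass_scaled_mm only dispatches to CUTLASS when both dimensions of b are 16-aligned and otherwise falls back to the Triton kernel shipped with the package. A trivial restatement of the check, for reference:

def cutlass_compatible(b_rows: int, b_cols: int) -> bool:
    # The CUTLASS path used here expects both dims of b to be multiples of 16.
    return b_rows % 16 == 0 and b_cols % 16 == 0

assert cutlass_compatible(4096, 4096)
assert not cutlass_compatible(4000, 4096)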
build/torch26-cxx98-cu126-aarch64-linux/quantization/marlin.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional

 import torch

@@ -30,58 +30,30 @@ except ImportError as e:
 from .scalar_type import ScalarType


-# fp8 marlin
-def fp8_marlin_gemm(
-    a: torch.Tensor,
-    b_q_weight: torch.Tensor,
-    b_scales: torch.Tensor,
-    workspace: torch.Tensor,
-    num_bits: int,
-    size_m: int,
-    size_n: int,
-    size_k: int,
-) -> torch.Tensor:
-    return ops.fp8_marlin_gemm(
-        a, b_q_weight, b_scales, workspace, num_bits, size_m, size_n, size_k
-    )
-
-
 # gptq_marlin
+def gptq_marlin_gemm(a: torch.Tensor,
+                     c: Optional[torch.Tensor],
+                     b_q_weight: torch.Tensor,
+                     b_scales: torch.Tensor,
+                     global_scale: Optional[torch.Tensor],
+                     b_zeros: Optional[torch.Tensor],
+                     g_idx: Optional[torch.Tensor],
+                     perm: Optional[torch.Tensor],
+                     workspace: torch.Tensor,
+                     b_q_type: ScalarType,
+                     size_m: int,
+                     size_n: int,
+                     size_k: int,
+                     is_k_full: bool = True,
+                     use_atomic_add: bool = False,
+                     use_fp32_reduce: bool = False,
+                     is_zp_float: bool = False) -> torch.Tensor:
+    return ops.gptq_marlin_gemm(a, c, b_q_weight, b_scales,
+                                global_scale, b_zeros, g_idx, perm,
+                                workspace, b_q_type.id, size_m,
+                                size_n, size_k, is_k_full,
+                                use_atomic_add, use_fp32_reduce,
+                                is_zp_float)

 # gptq_marlin
 def gptq_marlin_repack(
@@ -153,14 +125,6 @@ def marlin_qqq_gemm(
 # Fake ops

 if hasattr(ops, "gptq_marlin_24_gemm"):
-    @register_fake(add_op_namespace_prefix("fp8_marlin_gemm"))
-    def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
-                              b_scales: torch.Tensor, workspace: torch.Tensor,
-                              num_bits: int, size_m: torch.SymInt,
-                              size_n: torch.SymInt,
-                              size_k: torch.SymInt) -> torch.Tensor:
-        return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device)
-
     @register_fake(add_op_namespace_prefix("gptq_marlin_24_gemm"))
     def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
                                   b_meta: torch.Tensor, b_scales: torch.Tensor,
@@ -172,20 +136,22 @@ if hasattr(ops, "gptq_marlin_24_gemm"):

     @register_fake(add_op_namespace_prefix("gptq_marlin_gemm"))
     def _gptq_marlin_gemm_fake(a: torch.Tensor,
+                               c: Optional[torch.Tensor],
+                               b_q_weight: torch.Tensor,
+                               b_scales: torch.Tensor,
+                               global_scale: Optional[torch.Tensor],
+                               b_zeros: Optional[torch.Tensor],
+                               g_idx: Optional[torch.Tensor],
+                               perm: Optional[torch.Tensor],
+                               workspace: torch.Tensor,
+                               b_q_type_id: int,
+                               size_m: torch.SymInt,
+                               size_n: torch.SymInt,
+                               size_k: torch.SymInt,
+                               is_k_full: bool = True,
+                               use_atomic_add: bool = False,
+                               use_fp32_reduce: bool = False,
+                               is_zp_float: bool = False) -> torch.Tensor:
         return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)

     @register_fake(add_op_namespace_prefix("marlin_qqq_gemm"))
build/torch26-cxx98-cu126-aarch64-linux/quantization/platforms.py
ADDED
@@ -0,0 +1,69 @@
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from typing import NamedTuple
+
+import torch
+
+IS_ROCM = torch.version.hip is not None
+
+
+class DeviceCapability(NamedTuple):
+    major: int
+    minor: int
+
+    def as_version_str(self) -> str:
+        return f"{self.major}.{self.minor}"
+
+    def to_int(self) -> int:
+        """
+        Express device capability as an integer ``<major><minor>``.
+
+        It is assumed that the minor version is always a single digit.
+        """
+        assert 0 <= self.minor < 10
+        return self.major * 10 + self.minor
+
+
+class Platform(ABC):
+    simple_compile_backend: str = "inductor"
+
+    @classmethod
+    @abstractmethod
+    def get_device_name(cls, device_id: int = 0) -> str: ...
+
+    @abstractmethod
+    def is_rocm(self): ...
+
+
+class CudaPlatform(Platform):
+    @classmethod
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(0)
+
+    def is_rocm(self):
+        return False
+
+
+class RocmPlatform(Platform):
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(device_id)
+
+    def is_rocm(self):
+        return True
+
+
+current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
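Note (not part of the diff above): a hypothetical usage sketch of the new platforms.py, assuming the package root is importable as `quantization` and a CUDA device is present; DeviceCapability.to_int() encodes (major, minor) as a two-digit integer, so an SM86 GPU reports 86.

from quantization.platforms import current_platform

cap = current_platform.get_device_capability()    # e.g. DeviceCapability(major=8, minor=6)
print(cap.as_version_str(), cap.to_int())         # "8.6" 86
print(current_platform.is_rocm())                 # False on a CUDA build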
build/torch26-cxx98-cu126-aarch64-linux/quantization/scalar_type.py
CHANGED
@@ -1,9 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import functools
 import struct
 from dataclasses import dataclass
 from enum import Enum
 from typing import Optional, Union

+_SCALAR_TYPES_ID_MAP = {}
+

 # Mirrors enum in `core/scalar_type.hpp`
 class NanRepr(Enum):
@@ -121,8 +126,8 @@ class ScalarType:
             min_raw = max_raw | sign_bit_double
             return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
         else:
-            assert (not self.is_signed() or
+            assert (not self.is_signed() or self.size_bits
+                    <= 64), "Cannot represent min as a int64_t"

             if self.is_signed():
                 return -(1 << (self.size_bits - 1))
@@ -156,6 +161,8 @@ class ScalarType:
         assert offset <= 64, \
             f"ScalarType fields too big {offset} to fit into an int64"

+        _SCALAR_TYPES_ID_MAP[val] = self
+
         return val

     @property
@@ -293,6 +300,13 @@ class ScalarType:
         ret.id  # noqa B018: make sure the id is cached
         return ret

+    @classmethod
+    def from_id(cls, scalar_type_id: int):
+        if scalar_type_id not in _SCALAR_TYPES_ID_MAP:
+            raise ValueError(
+                f"scalar_type_id {scalar_type_id} doesn't exists.")
+        return _SCALAR_TYPES_ID_MAP[scalar_type_id]
+

 # naming generally follows: https://github.com/jax-ml/ml_dtypes
 # for floating point types (leading f) the scheme is:
@@ -319,6 +333,9 @@ class scalar_types:
     # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
     float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)

+    # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)
+
     # "gptq" types
     uint2b2 = ScalarType.uint(2, 2)
     uint3b4 = ScalarType.uint(3, 4)
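Note (not part of the diff above): the _SCALAR_TYPES_ID_MAP / from_id addition is an id-to-instance registry, so the integer id that crosses the custom-op boundary can be turned back into a ScalarType object. A standalone sketch of the pattern (names below are illustrative, not from the file):

_ID_MAP: dict = {}

class Packed:
    def __init__(self, ident: int):
        self.id = ident
        _ID_MAP[ident] = self   # register on construction, like ScalarType does when its id is computed

    @classmethod
    def from_id(cls, ident: int) -> "Packed":
        if ident not in _ID_MAP:
            raise ValueError(f"id {ident} doesn't exist.")
        return _ID_MAP[ident]

t = Packed(42)
assert Packed.from_id(42) is t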
build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils.py
CHANGED
@@ -1,4 +1,7 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional

 import numpy
 import torch
@@ -42,7 +45,9 @@ USE_FP32_REDUCE_DEFAULT = True
 # without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
 # TODO: we may want to move this into the C++ so its closer to the actual impl
 def query_marlin_supported_quant_types(
+    has_zp: Optional[bool] = None,
+    include_fp_type: bool = True,
+    device_capability: Optional[int] = None,
 ):
     if device_capability is None:
         capability_tuple = torch.cuda.get_device_capability()
@@ -51,137 +56,141 @@ def query_marlin_supported_quant_types(
     if device_capability < 80:
         return []

+    # - has_zp is True: return quant_types that has zero points
+    # - has_zp is False: return quant_types that has not zero points
+    # - has_zp is None: both
+    if has_zp is None:
+        types0 = query_marlin_supported_quant_types(False, include_fp_type,
+                                                    device_capability)
+        types1 = query_marlin_supported_quant_types(True, include_fp_type,
+                                                    device_capability)
+        return types0 + types1
+
     if has_zp:
         # AWQ style, unsigned + runtime zero-point
+        return [scalar_types.uint4]
     else:
         # GPTQ style, unsigned + symmetric bias
+        res = [scalar_types.uint4b8, scalar_types.uint8b128]
+        if include_fp_type:
+            res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]
+        return res


 def _check_marlin_supported(
+        quant_type: ScalarType,
+        group_size: Optional[int],
+        has_zp: bool,
+        device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]:

     if device_capability is None:
         capability_tuple = torch.cuda.get_device_capability()
         device_capability = capability_tuple[0] * 10 + capability_tuple[1]

     supported_types = query_marlin_supported_quant_types(
+        has_zp, True, device_capability)

     if quant_type not in supported_types:
+        return (False, f"Marlin does not support weight_bits = {quant_type}. "
+                f"Only types = {supported_types} "
+                f"are supported (for group_size = {group_size}, "
+                f"device_capability = {device_capability}, zp = {has_zp}).")
+    if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES):
+        return (False, f"Marlin does not support group_size = {group_size}. "
+                f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
+                "are supported.")

     return True, None


+def check_marlin_supported(quant_type: ScalarType,
+                           group_size: int,
+                           has_zp: bool = False,
+                           device_capability: Optional[int] = None) -> bool:
+    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp,
+                                      device_capability)
     return cond


+def verify_marlin_supported(quant_type: ScalarType,
+                            group_size: int,
+                            has_zp: bool = False) -> None:
     cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
     if not cond:
         assert err_msg is not None
         raise ValueError(err_msg)


+def verify_marlin_supports_shape(output_size_per_partition: int,
+                                 input_size_per_partition: int,
+                                 input_size: int, group_size: int) -> None:

     # Validate output_size_per_partition
     if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
+        raise ValueError(f"Weight output_size_per_partition = "
+                         f"{output_size_per_partition} is not divisible by "
+                         f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
+                         "Consider reducing tensor_parallel_size or running "
+                         "with --quantization gptq.")

     # Validate input_size_per_partition
     if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
+        raise ValueError(f"Weight input_size_per_partition = "
+                         f"{input_size_per_partition} is not divisible "
+                         f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
+                         "Consider reducing tensor_parallel_size or running "
+                         "with --quantization gptq.")
+
+    if (group_size < input_size
+            and input_size_per_partition % group_size != 0):
         raise ValueError(
             f"Weight input_size_per_partition = {input_size_per_partition}"
-            f" is not divisible by group_size = {group_size}."
+            f" is not divisible by group_size = {group_size}. "
             "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq.")


+def check_marlin_supports_shape(output_size_per_partition: int,
+                                input_size_per_partition: int,
+                                input_size: int, group_size: int) \
+                                    -> tuple[bool, Optional[str]]:
     try:
+        verify_marlin_supports_shape(output_size_per_partition,
+                                     input_size_per_partition, input_size,
+                                     group_size)
     except ValueError as e:
         return False, e.__str__()
     return True, None


+def marlin_make_workspace(output_size_per_partition: int,
+                          device: torch.device) -> torch.Tensor:
+    max_workspace_size = (output_size_per_partition //
+                          GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
+
+    return torch.zeros(max_workspace_size,
+                       dtype=torch.int,
+                       device=device,
+                       requires_grad=False)
+
+
+def marlin_make_workspace_new(device: torch.device,
+                              max_blocks_per_sm: int = 1) -> torch.Tensor:
+    # In the new marlin kernel, we use the num of threadblocks as workspace
+    # size. The num of threadblocks is is sms_count * max_blocks_per_sm.
+    sms = torch.cuda.get_device_properties(device).multi_processor_count
+    return torch.zeros(sms * max_blocks_per_sm,
+                       dtype=torch.int,
+                       device=device,
+                       requires_grad=False)


 def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
     return (not act_order) or (act_order and not is_row_parallel)


+def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
+                                      is_row_parallel: bool) -> bool:
     # Need to repeat scales on every rank if act_ordering or
     # channelwise and RowParallelLinear
     is_channelwise = group_size == -1
@@ -189,35 +198,34 @@ def marlin_repeat_scales_on_all_ranks(


 def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
+    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
+                              requires_grad=False)


 def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
+    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
+                              requires_grad=False)


+def marlin_sort_g_idx(
+        g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
     return g_idx[g_idx_sort_indices], g_idx_sort_indices


 def get_scale_perms():
+    scale_perm: list[int] = []
     for i in range(8):
         scale_perm.extend([i + 8 * j for j in range(8)])
+    scale_perm_single: list[int] = []
     for i in range(4):
+        scale_perm_single.extend(
+            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
     return scale_perm, scale_perm_single


+def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
+                          group_size: int) -> torch.Tensor:

     scale_perm, scale_perm_single = get_scale_perms()
     if group_size < size_k and group_size != -1:
@@ -247,9 +255,8 @@ def marlin_moe_permute_scales(
     return output


+def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
+                       num_bits: int) -> torch.Tensor:
     # Permute zero-points in a similar way to scales, but do not use the
     # "single" permutation, since zero-points are applied on every MMA
     scale_perm, _ = get_scale_perms()
@@ -270,9 +277,8 @@ def marlin_zero_points(
     return zp


+def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
+                              size_n: int, num_bits: int) -> torch.Tensor:
     # AWQ zero-points are quantized and packed on the column dim.
     # In addition, the values are permuted based on dequantizer.
     # Here we undo both of these, and then apply marlin permutation
@@ -294,9 +300,8 @@ def awq_to_marlin_zero_points(
     return marlin_zp


+def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
+                                  size_n: int, num_bits: int):
     num_experts = q_zp_packed.shape[0]
     output = torch.empty(
         (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
@@ -304,45 +309,97 @@ def moe_awq_to_marlin_zero_points(
         dtype=q_zp_packed.dtype,
     )
     for e in range(num_experts):
+        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
+                                              num_bits)
     return output


+def maybe_warn_marlin_atomic_add(device, dtype):
+    if torch.compiler.is_dynamo_compiling():
+        return
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        logger.info_once(
+            "You are running Marlin kernel with bf16 on GPUs before SM90. "
+            "You can consider change to fp16 to achieve better performance "
+            "if possible.")
+
+
+def maybe_warn_marlin_atomic_add_env():
+    if torch.compiler.is_dynamo_compiling():
+        return
+    if envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+        return
+    logger.info_once(
+        "Marlin kernel can achieve better performance for small size_n "
+        "with experimental use_atomic_add feature. "
+        "You can consider set environment variable "
+        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible.")
+
+
+def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
+                                 dtype: torch.dtype) -> bool:
+
+    # the performance of atomicAdd is better than global reduce
+    # only when m*n is small and k is large
+    if n >= 2048 or k < 2048 or device.type != "cuda":
+        return False
+
+    # disable atomicAdd reduce by default,
+    # one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
+    if not envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+        maybe_warn_marlin_atomic_add_env()
+        return False
+
+    # sm8x doesn't support atomicAdd + bfloat16 natively
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        maybe_warn_marlin_atomic_add(device, dtype)
+        return False
+
+    return True
+
+
 def apply_gptq_marlin_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_zp: torch.Tensor,
+        g_idx: torch.Tensor,
+        g_idx_sort_indices: torch.Tensor,
+        workspace: torch.Tensor,
+        wtype: ScalarType,
+        output_size_per_partition: int,
+        input_size_per_partition: int,
+        is_k_full: bool,
+        bias: Optional[torch.Tensor] = None,
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (output_size_per_partition,)
+    out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=output_size_per_partition,
+                                                  k=reshaped_x.size(1),
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(reshaped_x,
+                                  None,
+                                  weight,
+                                  weight_scale,
+                                  None,
+                                  weight_zp,
+                                  g_idx,
+                                  g_idx_sort_indices,
+                                  workspace,
+                                  wtype,
+                                  size_m=reshaped_x.shape[0],
+                                  size_n=output_size_per_partition,
+                                  size_k=input_size_per_partition,
+                                  is_k_full=is_k_full,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)

     if bias is not None:
         output.add_(bias)  # In-place add
@@ -351,39 +408,43 @@ def apply_gptq_marlin_linear(


 def apply_awq_marlin_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_zp: torch.Tensor,
+        g_idx: torch.Tensor,
+        g_idx_sort_indices: torch.Tensor,
+        workspace: torch.Tensor,
+        quant_type: ScalarType,
+        output_size_per_partition: int,
+        input_size_per_partition: int,
+        bias: Optional[torch.Tensor] = None,
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (output_size_per_partition,)
+    out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=output_size_per_partition,
+                                                  k=reshaped_x.size(1),
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(reshaped_x,
+                                  None,
+                                  weight,
+                                  weight_scale,
+                                  None,
+                                  weight_zp,
+                                  g_idx,
+                                  g_idx_sort_indices,
+                                  workspace,
+                                  quant_type,
+                                  size_m=reshaped_x.shape[0],
+                                  size_n=output_size_per_partition,
+                                  size_k=input_size_per_partition,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)

     if bias is not None:
         output.add_(bias)  # In-place add
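Note (not part of the diff above): should_use_atomic_add_reduce gates the new use_atomic_add flag on problem shape before anything reaches the kernel. A toy restatement of just the shape heuristic (the real function additionally checks the device type, the VLLM_MARLIN_USE_ATOMIC_ADD environment flag, and bf16 on pre-SM90 GPUs):

def atomic_add_shape_ok(n: int, k: int) -> bool:
    # atomicAdd-based reduction only pays off for narrow outputs with a deep K dimension
    return n < 2048 and k >= 2048

assert atomic_add_shape_ok(1024, 4096)
assert not atomic_add_shape_ok(4096, 4096)
assert not atomic_add_shape_ok(1024, 1024)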
build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py
ADDED
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import torch

import quantization as ops

from .marlin_utils import (
    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
    should_use_atomic_add_reduce)
from quantization.scalar_type import scalar_types

FP4_MARLIN_SUPPORTED_GROUP_SIZES = [16]


def is_fp4_marlin_supported():
    capability = torch.cuda.get_device_capability()
    capability = capability[0] * 10 + capability[1]
    return capability >= 80


def fp4_marlin_process_scales(marlin_scales):
    if not (marlin_scales >= 0).all():
        logger.warning_once(
            "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
            "negative scales. Accuracy will likely be degraded. This is "
            "because it changes the scales from FP8-S1E4M3 to a special "
            "FP8-S0E5M3 format to speedup the dequantization.")

    # convert to half first; we will convert to fp8 later
    marlin_scales = marlin_scales.to(torch.half)

    # 8 is the number of scales used by one thread
    marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
    marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
        marlin_scales.size(0) * 2, -1)

    # fit the layout of fp8 dequantization
    marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
        marlin_scales.size(0), -1)

    # We assume that weight_scale (FP8-S1E4M3) is always greater
    # than or equal to 0. So we can convert
    # weight_scale * (2 ** 7) to a special FP8-S0E5M3 format.
    # After multiplying by 2 ** 7, the top bit of FP8-S0E5M3 would always be 1
    # when weight_scale > 0. This allows us to have an exponent bias
    # closer to zero after dequantization.

    marlin_scales = (marlin_scales * (2**7)).view(torch.int16) << 1
    marlin_scales = marlin_scales.view(torch.float8_e4m3fn)
    marlin_scales = marlin_scales[:, 1::2].contiguous()

    return marlin_scales


def fp4_marlin_process_global_scale(global_scale):
    assert global_scale.dtype in [torch.half, torch.bfloat16]
    fp4_exponent = 2
    if global_scale.dtype == torch.half:
        target_exponent = 5
    elif global_scale.dtype == torch.bfloat16:
        target_exponent = 8
    # exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
    # exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
    exponent_bias = 2**(target_exponent - 1) - 2**(fp4_exponent - 1)
    return global_scale * (2.0**(exponent_bias - 7))


def apply_fp4_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_scale_2: torch.Tensor,
        workspace: torch.Tensor,
        size_n: int,
        size_k: int,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    # For GPUs that lack FP4 hardware support, we can leverage the
    # Marlin kernel for fast weight-only FP4 quantization

    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (size_n, )

    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
                                                  n=size_n,
                                                  k=size_k,
                                                  device=input.device,
                                                  dtype=input.dtype)

    output = ops.gptq_marlin_gemm(a=reshaped_x,
                                  c=None,
                                  b_q_weight=weight,
                                  b_scales=weight_scale,
                                  global_scale=weight_scale_2,
                                  b_zeros=None,
                                  g_idx=None,
                                  perm=None,
                                  workspace=workspace,
                                  b_q_type=scalar_types.float4_e2m1f,
                                  size_m=reshaped_x.size(0),
                                  size_n=size_n,
                                  size_k=size_k,
                                  use_atomic_add=use_atomic_add,
                                  use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        output.add_(bias)  # In-place add

    return output.reshape(out_shape)


def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    part_size_n = layer.output_size_per_partition
    part_size_k = layer.input_size_per_partition
    param_dtype = layer.params_dtype

    assert layer.weight.shape == (part_size_n, part_size_k // 2)

    device = layer.weight.device

    # WORKSPACE
    layer.workspace = marlin_make_workspace_new(device)

    # WEIGHT
    # Repack weights to marlin format
    perm = torch.empty(0, dtype=torch.int, device=device)
    qweight = layer.weight.view(torch.int32).T.contiguous()

    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                            perm=perm,
                                            size_k=part_size_k,
                                            size_n=part_size_n,
                                            num_bits=4)
    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)

    # WEIGHT SCALES
    # Permute scales
    weight_scale = layer.weight_scale.T.to(param_dtype)
    weight_scale = marlin_permute_scales(s=weight_scale,
                                         size_k=part_size_k,
                                         size_n=part_size_n,
                                         group_size=16)
    weight_scale = fp4_marlin_process_scales(weight_scale)
    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)

    weight_scale_2 = layer.weight_scale_2.to(param_dtype)
    weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
    layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
                                              requires_grad=False)

    return


def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    e = layer.num_experts
    k = layer.hidden_size
    n = layer.intermediate_size_per_partition

    # WORKSPACE
    device = layer.w13_weight.device
    param_dtype = layer.params_dtype
    layer.workspace = marlin_make_workspace_new(device, 4)
    perm = torch.empty(0, dtype=torch.int, device=device)

    # WEIGHT
    # Repack weights to marlin format
    for name in ["w13_weight", "w2_weight"]:
        weight = getattr(layer, name)
        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        assert weight.shape == (e, size_n, size_k // 2)

        for i in range(e):
            qweight = weight[i].view(torch.int32).T.contiguous()

            marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                                    perm=perm,
                                                    size_k=size_k,
                                                    size_n=size_n,
                                                    num_bits=4)
            tensor_list.append(marlin_qweight)

        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        weight = torch.nn.Parameter(weight, requires_grad=False)

        setattr(layer, name, weight)

    # WEIGHT SCALES
    # Permute scales
    for name in ["w13", "w2"]:
        scales = getattr(layer, name + "_weight_scale").to(param_dtype)
        global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)

        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        for i in range(e):
            marlin_scales = marlin_permute_scales(s=scales[i].T,
                                                  size_k=size_k,
                                                  size_n=size_n,
                                                  group_size=16)
            marlin_scales = fp4_marlin_process_scales(marlin_scales)
            tensor_list.append(marlin_scales)

        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        scales = torch.nn.Parameter(scales, requires_grad=False)
        setattr(layer, name + "_weight_scale", scales)

        global_scale = fp4_marlin_process_global_scale(global_scale)
        global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
        setattr(layer, name + "_weight_scale_2", global_scale)


def rand_marlin_weight_fp4_like(weight, group_size):
    assert group_size > 0
    size_n, size_k = weight.shape
    device = weight.device

    scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 6
    global_scale = scales.max() / 448
    scales = (scales / global_scale).to(torch.float8_e4m3fn)

    fp4_weight = torch.randint(0,
                               256, (size_n, size_k // 2),
                               dtype=torch.uint8,
                               device=weight.device)
    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
                         ((fp4_weight & 0b01110000) >> 2))
    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)

    fp4_weight2 = fp4_weight << 4
    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
                         ((fp4_weight2 & 0b01110000) >> 2))
    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)

    weight_ref = torch.cat(
        [fp4_weight_part_2.unsqueeze(2),
         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
    weight_ref = weight_ref * global_scale.to(weight.dtype) * \
        scales.repeat_interleave(group_size, 1).to(weight.dtype)

    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
        perm=torch.empty(0, dtype=torch.int, device=device),
        size_k=size_k,
        size_n=size_n,
        num_bits=4,
    )

    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
                                          size_k=size_k,
                                          size_n=size_n,
                                          group_size=group_size)
    marlin_scales = fp4_marlin_process_scales(marlin_scales)

    global_scale = fp4_marlin_process_global_scale(global_scale)

    return weight_ref.T, marlin_qweight, marlin_scales, global_scale
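
Note that the listing above references `logger` without importing it here; it is expected to be supplied by the surrounding package. As a rough illustration of how these FP4 helpers fit together (a sketch, not a test shipped with this repo; it assumes the built `quantization` wheel is importable and an SM80+ CUDA GPU is available), the random-weight generator can be paired with `apply_fp4_marlin_linear` and compared against the dequantized reference:

import torch
from quantization.utils.marlin_utils import marlin_make_workspace_new
from quantization.utils.marlin_utils_fp4 import (
    apply_fp4_marlin_linear, is_fp4_marlin_supported,
    rand_marlin_weight_fp4_like)

if is_fp4_marlin_supported():
    size_k, size_n = 2048, 4096
    weight = torch.randn(size_n, size_k, dtype=torch.half, device="cuda")
    # weight_ref comes back dequantized and transposed to (size_k, size_n).
    weight_ref, qweight, scales, global_scale = rand_marlin_weight_fp4_like(
        weight, group_size=16)

    x = torch.randn(8, size_k, dtype=torch.half, device="cuda")
    workspace = marlin_make_workspace_new(x.device)
    out = apply_fp4_marlin_linear(x, qweight, scales, global_scale,
                                  workspace, size_n=size_n, size_k=size_k)
    # Compare against a plain fp16 matmul with the dequantized weight.
    print((out - x @ weight_ref).abs().max())
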
build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py
CHANGED
@@ -1,10 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 from typing import Optional

 import torch

 import quantization as ops

-from .marlin_utils import marlin_make_workspace, marlin_permute_scales
+from .marlin_utils import USE_FP32_REDUCE_DEFAULT, marlin_make_workspace, marlin_permute_scales


 def is_fp8_marlin_supported():
@@ -13,88 +16,107 @@ def is_fp8_marlin_supported():
     return capability >= 80


+def fp8_fused_exponent_bias_into_scales(scales):
+    fp8_exponent = 4
+    if scales.dtype == torch.half:
+        target_exponent = 5
+    elif scales.dtype == torch.bfloat16:
+        target_exponent = 8
+    # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
+    # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
+    exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
+    s = torch.ones_like(scales) * 2
+    s = s**exponent_bias
+    return scales * s
+
+
 def apply_fp8_marlin_linear(
-) -> torch.Tensor:
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        workspace: torch.Tensor,
+        size_n: int,
+        size_k: int,
+        bias: Optional[torch.Tensor],
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     # For GPUs that lack FP8 hardware support, we can leverage the
     # Marlin kernel for fast weight-only FP8 quantization

     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (size_n,)
+    out_shape = input.shape[:-1] + (size_n, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=size_n,
+                                                  k=size_k,
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(a=reshaped_x,
+                                  c=None,
+                                  b_q_weight=weight,
+                                  b_scales=weight_scale,
+                                  global_scale=None,
+                                  b_zeros=None,
+                                  g_idx=None,
+                                  perm=None,
+                                  workspace=workspace,
+                                  b_q_type=scalar_types.float8_e4m3fn,
+                                  size_m=reshaped_x.size(0),
+                                  size_n=size_n,
+                                  size_k=size_k,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce)

     if bias is not None:
         output.add_(bias)  # In-place add

     return output.reshape(out_shape)

-    layer: torch.nn.Module, strategy: str = "tensor"
-) -> None:
-    part_size_n = layer.output_size_per_partition
-    part_size_k = layer.input_size_per_partition
-
-    device = layer.weight.device
-
-    # WORKSPACE
-    layer.workspace = marlin_make_workspace(part_size_n, device)
-
-    # WEIGHT
-    # Repack weights to marlin format
-    marlin_qweight = ops.gptq_marlin_repack(
-        b_q_weight=pack_fp8_to_int32(layer.weight),
-        perm=torch.empty(0, dtype=torch.int, device=device),
-        size_k=part_size_k,
-        size_n=part_size_n,
-        num_bits=8,
-    )
-    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
-
-    # WEIGHT SCALES
-    scales = layer.weight_scale.to(layer.orig_dtype)
-    # Permute scales
-    marlin_scales = marlin_permute_scales(
-        s=scales, size_k=part_size_k, size_n=part_size_n, group_size=-1
-    )
-    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
-
-
-def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
+def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
+                      size_k_first: bool = True) -> torch.Tensor:
     """
     Repack FP8 weights to gptq format (packed int32 elements)
     """
     assert fp8_tensor.dtype == torch.float8_e4m3fn
-    assert fp8_tensor.
-    packed = (
-        byte_tensor[:, 0].to(torch.int32)
-        | (byte_tensor[:, 1].to(torch.int32) << 8)
-        | (byte_tensor[:, 2].to(torch.int32) << 16)
-        | (byte_tensor[:, 3].to(torch.int32) << 24)
-    )
-    return
+    assert fp8_tensor.ndim == 2
+
+    fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
+    fp8_tensor = fp8_tensor.contiguous()
+    # fp8_tensor is contiguous and has shape (N, K) now
+    # with `.view(torch.int32)`, it becomes (N, K // 4)
+    int32_tensor = fp8_tensor.view(torch.int32)
+    return int32_tensor.T.contiguous() if size_k_first else int32_tensor
+
+
+def marlin_quant_fp8_torch(weight, group_size):
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    if group_size != -1:
+        scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(group_size, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+    else:
+        scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(size_k, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+
+    packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=packed_weight,
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=8,
+    )

+    marlin_scales = marlin_permute_scales(s=scales.T,
+                                          size_k=size_k,
+                                          size_n=size_n,
+                                          group_size=group_size)

+    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)

+    return weight_ref.T, marlin_qweight, marlin_scales
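
The new `pack_fp8_to_int32` no longer assembles int32 words byte by byte; it reinterprets four consecutive FP8 bytes as one int32 via `Tensor.view`. A self-contained sketch of that equivalence (standalone code, not imported from the built package; requires a PyTorch version that provides `torch.float8_e4m3fn`):

import torch

n, k = 4, 8
w = torch.randn(n, k).to(torch.float8_e4m3fn)   # (N, K) fp8 weight

packed = w.contiguous().view(torch.int32)        # (N, K // 4)
assert packed.shape == (n, k // 4)

# Each int32 is just four consecutive fp8 bytes of a row, so viewing back
# recovers the original bit pattern exactly.
restored = packed.view(torch.float8_e4m3fn)
assert torch.equal(restored.view(torch.uint8), w.view(torch.uint8))

# size_k_first=True in the helper above only adds a transpose before and
# after this view, so the packed tensor is returned K-major, which is the
# layout gptq_marlin_repack expects.
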
build/torch27-cxx11-cu126-aarch64-linux/quantization/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 from .compressed_tensors import scaled_fp8_quant, scaled_int8_quant
 from .cutlass import (
+    cutlass_scaled_mm_supports_block_fp8,
     cutlass_scaled_mm_supports_fp8,
     cutlass_scaled_mm,
     cutlass_scaled_mm_azp,
 )
 from .marlin import (
     awq_marlin_repack,
-    fp8_marlin_gemm,
     gptq_marlin_gemm,
     gptq_marlin_repack,
     gptq_marlin_24_gemm,
@@ -25,8 +25,8 @@ __all__ = [
     "awq_marlin_repack",
     "cutlass_scaled_mm",
     "cutlass_scaled_mm_azp",
+    "cutlass_scaled_mm_supports_block_fp8",
     "cutlass_scaled_mm_supports_fp8",
-    "fp8_marlin_gemm",
     "gptq_marlin_24_gemm",
     "gptq_marlin_gemm",
     "gptq_marlin_repack",
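
The net effect on the public surface: `fp8_marlin_gemm` is no longer exported, and a block-wise FP8 capability probe joins the CUTLASS helpers. A quick downstream check might look like this (a sketch; assumes the built wheel is installed and a CUDA device is visible):

import torch
import quantization

major, minor = torch.cuda.get_device_capability()
capability = major * 10 + minor

if quantization.cutlass_scaled_mm_supports_block_fp8(capability):
    print("block-quantized FP8 CUTLASS path available")

# FP8 weight-only GEMMs now go through quantization.gptq_marlin_gemm
# (with an FP8 scalar type) instead of the removed fp8_marlin_gemm.
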
build/torch27-cxx11-cu126-aarch64-linux/quantization/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_0435ccb
-ops = torch.ops._quantization_0435ccb
+from . import _quantization_9035540
+ops = torch.ops._quantization_9035540

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_0435ccb::{op_name}"
+    return f"_quantization_9035540::{op_name}"
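
The helper simply qualifies an op name with the versioned extension namespace, which is what the `torch.library` utilities (for example `register_fake`) expect. A minimal sketch (assumes the built wheel is importable):

from quantization._ops import add_op_namespace_prefix

qualified = add_op_namespace_prefix("gptq_marlin_gemm")
# -> "_quantization_9035540::gptq_marlin_gemm", the fully qualified name
#    under which the compiled custom op is registered with torch.
print(qualified)
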
build/torch27-cxx11-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:219fc94b48e46777769dd2cd61785791b4fd00c58824d6de5252defbf48c30e5
+size 159999608
build/torch27-cxx11-cu126-aarch64-linux/quantization/compressed_tensors.py
CHANGED
@@ -2,17 +2,7 @@ from typing import Optional, Tuple

 import torch

-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
-
+from ._ops import ops

 # fp8
 def scaled_fp8_quant(
@@ -21,7 +11,8 @@ def scaled_fp8_quant(
     num_token_padding: Optional[int] = None,
     scale_ub: Optional[torch.Tensor] = None,
     use_per_token_if_dynamic: bool = False,
+    output: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP8 and return quantized tensor and scale.

@@ -42,30 +33,36 @@ def scaled_fp8_quant(
         in the dynamic quantization case.

     Returns:
+        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
         scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert input.ndim == 2
-    shape: Union[
-    # if current_platform.is_rocm() else torch.float8_e4m3fn
-    out_dtype = torch.float8_e4m3fn
+    assert (input.ndim == 2)
+    shape: Union[tuple[int, int], torch.Size] = input.shape
+    # For ROCm on MI300, the output fp8 dtype is torch.float8_e4m3fnuz
+    out_dtype: torch.dtype = current_platform.fp8_dtype()
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
-    output
+    if output is None:
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    else:
+        assert num_token_padding is None, \
+            "padding not supported if output passed in"
+        assert output.dtype == out_dtype

     if scale is None:
         if use_per_token_if_dynamic:
+            scale = torch.empty((shape[0], 1),
+                                device=input.device,
+                                dtype=torch.float32)
+            ops.dynamic_per_token_scaled_fp8_quant(
+                output, input.contiguous(), scale, scale_ub)
         else:
             scale = torch.zeros(1, device=input.device, dtype=torch.float32)
             ops.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
-        assert scale.numel() == 1
+        assert (scale.numel() == 1 and num_token_padding is None)
         ops.static_scaled_fp8_quant(output, input, scale)

     return output, scale
@@ -76,8 +73,8 @@ def scaled_int8_quant(
     input: torch.Tensor,
     scale: Optional[torch.Tensor] = None,
     azp: Optional[torch.Tensor] = None,
-    symmetric: bool = True
-) ->
+    symmetric: bool = True
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     """
     Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.

@@ -90,21 +87,25 @@ def scaled_int8_quant(
     symmetric: Whether to use symmetric quantization (scale only, azp ignored).

     Returns:
+        tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
     """
     output = torch.empty_like(input, dtype=torch.int8)
     if scale is not None:
         # static-per-tensor quantization.
         assert symmetric == (
-            azp
+            azp
+            is None), "azp must only be provided for asymmetric quantization."
         ops.static_scaled_int8_quant(output, input, scale, azp)
         return output, scale, azp

     # dynamic-per-token quantization.
-    input_scales = torch.empty(
-    input_azp = None if symmetric else torch.empty_like(input_scales,
+    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
+                               device=input.device,
+                               dtype=torch.float32)
+    input_azp = None if symmetric else torch.empty_like(input_scales,
+                                                        dtype=torch.int32)
+    ops.dynamic_scaled_int8_quant(output, input.contiguous(),
+                                  input_scales, input_azp)
     return output, input_scales, input_azp
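
A usage sketch for the updated `scaled_fp8_quant` signature (hypothetical shapes; assumes the built wheel and a CUDA device). The caller-provided output buffer and the per-token dynamic path are the parts this diff adds:

import torch
from quantization import scaled_fp8_quant

x = torch.randn(16, 4096, dtype=torch.half, device="cuda")

# Dynamic per-tensor quantization: the kernel fills in the scale.
q, scale = scaled_fp8_quant(x)

# Dynamic per-token quantization: one scale per row of x.
q_tok, tok_scales = scaled_fp8_quant(x, use_per_token_if_dynamic=True)
assert tok_scales.shape == (16, 1)

# New in this diff: write into a preallocated buffer (padding not allowed).
out = torch.empty_like(q)
q2, scale2 = scaled_fp8_quant(x, output=out)
assert q2 is out
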
build/torch27-cxx11-cu126-aarch64-linux/quantization/cutlass.py
CHANGED
@@ -2,22 +2,18 @@ from typing import Optional

 import torch

-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
+from ._ops import ops
+from .platforms import current_platform


 def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)


+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+    return ops.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
+
+
 def cutlass_scaled_mm(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -33,12 +29,10 @@ def cutlass_scaled_mm(
     m = a.shape[0]
     n = b.shape[1]

-    # triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-    # return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    if not cutlass_compatible_b:
+        from .triton_scaled_mm import triton_scaled_mm
+        return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
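
The new guard is purely a shape check: the CUTLASS scaled-mm path here requires both dimensions of `b` to be multiples of 16, and anything else is routed to the Triton fallback. Restated standalone (a sketch, not imported from the package):

import torch

def cutlass_compatible(b: torch.Tensor) -> bool:
    # Mirrors the gate added above: both dims of the (K, N) weight must be
    # 16-aligned for the CUTLASS scaled-mm kernels.
    return b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0

print(cutlass_compatible(torch.empty(64, 128)))   # True
print(cutlass_compatible(torch.empty(64, 120)))   # False -> Triton fallback
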
build/torch27-cxx11-cu126-aarch64-linux/quantization/marlin.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional

 import torch

@@ -30,58 +30,30 @@ except ImportError as e:
 from .scalar_type import ScalarType


-# fp8 marlin
-def fp8_marlin_gemm(
-    a: torch.Tensor,
-    b_q_weight: torch.Tensor,
-    b_scales: torch.Tensor,
-    workspace: torch.Tensor,
-    num_bits: int,
-    size_m: int,
-    size_n: int,
-    size_k: int,
-) -> torch.Tensor:
-    return ops.fp8_marlin_gemm(
-        a, b_q_weight, b_scales, workspace, num_bits, size_m, size_n, size_k
-    )
-
-
 # gptq_marlin
-def gptq_marlin_gemm(
-) -> torch.Tensor:
-    return ops.gptq_marlin_gemm(
-        perm,
-        workspace,
-        b_q_type.id,
-        size_m,
-        size_n,
-        size_k,
-        is_k_full,
-        has_zp,
-        use_fp32_reduce,
-        is_zp_float,
-    )
+def gptq_marlin_gemm(a: torch.Tensor,
+                     c: Optional[torch.Tensor],
+                     b_q_weight: torch.Tensor,
+                     b_scales: torch.Tensor,
+                     global_scale: Optional[torch.Tensor],
+                     b_zeros: Optional[torch.Tensor],
+                     g_idx: Optional[torch.Tensor],
+                     perm: Optional[torch.Tensor],
+                     workspace: torch.Tensor,
+                     b_q_type: ScalarType,
+                     size_m: int,
+                     size_n: int,
+                     size_k: int,
+                     is_k_full: bool = True,
+                     use_atomic_add: bool = False,
+                     use_fp32_reduce: bool = False,
+                     is_zp_float: bool = False) -> torch.Tensor:
+    return ops.gptq_marlin_gemm(a, c, b_q_weight, b_scales,
+                                global_scale, b_zeros, g_idx, perm,
+                                workspace, b_q_type.id, size_m,
+                                size_n, size_k, is_k_full,
+                                use_atomic_add, use_fp32_reduce,
+                                is_zp_float)

 # gptq_marlin
 def gptq_marlin_repack(
@@ -153,14 +125,6 @@ def marlin_qqq_gemm(
 # Fake ops

 if hasattr(ops, "gptq_marlin_24_gemm"):
-    @register_fake(add_op_namespace_prefix("fp8_marlin_gemm"))
-    def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
-                              b_scales: torch.Tensor, workspace: torch.Tensor,
-                              num_bits: int, size_m: torch.SymInt,
-                              size_n: torch.SymInt,
-                              size_k: torch.SymInt) -> torch.Tensor:
-        return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device)
-
     @register_fake(add_op_namespace_prefix("gptq_marlin_24_gemm"))
     def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
                                   b_meta: torch.Tensor, b_scales: torch.Tensor,
@@ -172,20 +136,22 @@ if hasattr(ops, "gptq_marlin_24_gemm"):

     @register_fake(add_op_namespace_prefix("gptq_marlin_gemm"))
     def _gptq_marlin_gemm_fake(a: torch.Tensor,
+                               c: Optional[torch.Tensor],
+                               b_q_weight: torch.Tensor,
+                               b_scales: torch.Tensor,
+                               global_scale: Optional[torch.Tensor],
+                               b_zeros: Optional[torch.Tensor],
+                               g_idx: Optional[torch.Tensor],
+                               perm: Optional[torch.Tensor],
+                               workspace: torch.Tensor,
+                               b_q_type_id: int,
+                               size_m: torch.SymInt,
+                               size_n: torch.SymInt,
+                               size_k: torch.SymInt,
+                               is_k_full: bool = True,
+                               use_atomic_add: bool = False,
+                               use_fp32_reduce: bool = False,
+                               is_zp_float: bool = False) -> torch.Tensor:
         return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)

     @register_fake(add_op_namespace_prefix("marlin_qqq_gemm"))
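
Callers that previously used the removed `fp8_marlin_gemm` wrapper now express FP8 weight-only GEMMs through `gptq_marlin_gemm`. A hypothetical compatibility shim, written against the new wrapper's argument order shown above (a sketch only; `fp8_marlin_gemm_compat` is not part of the package):

import torch
from quantization import gptq_marlin_gemm
from quantization.scalar_type import scalar_types

def fp8_marlin_gemm_compat(a: torch.Tensor, b_q_weight: torch.Tensor,
                           b_scales: torch.Tensor, workspace: torch.Tensor,
                           size_m: int, size_n: int,
                           size_k: int) -> torch.Tensor:
    # num_bits is implied by the scalar type; c, global_scale, zeros, g_idx
    # and perm are unused for a plain FP8 weight-only GEMM.
    return gptq_marlin_gemm(a, None, b_q_weight, b_scales, None, None, None,
                            None, workspace, scalar_types.float8_e4m3fn,
                            size_m, size_n, size_k)
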
build/torch27-cxx11-cu126-aarch64-linux/quantization/platforms.py
ADDED
@@ -0,0 +1,69 @@
from abc import ABC, abstractmethod
from functools import lru_cache
from typing import NamedTuple

import torch

IS_ROCM = torch.version.hip is not None


class DeviceCapability(NamedTuple):
    major: int
    minor: int

    def as_version_str(self) -> str:
        return f"{self.major}.{self.minor}"

    def to_int(self) -> int:
        """
        Express device capability as an integer ``<major><minor>``.

        It is assumed that the minor version is always a single digit.
        """
        assert 0 <= self.minor < 10
        return self.major * 10 + self.minor


class Platform(ABC):
    simple_compile_backend: str = "inductor"

    @classmethod
    @abstractmethod
    def get_device_name(cls, device_id: int = 0) -> str: ...

    @abstractmethod
    def is_rocm(self): ...


class CudaPlatform(Platform):
    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(0)

    def is_rocm(self):
        return False


class RocmPlatform(Platform):
    @classmethod
    @lru_cache(maxsize=8)
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(device_id)

    def is_rocm(self):
        return True


current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
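
The singleton lets the rest of the package branch on the runtime platform without importing ROCm-specific modules. A short sketch of the intended call pattern (assumes the built wheel; the printed values are examples):

from quantization.platforms import current_platform

cap = current_platform.get_device_capability()   # e.g. DeviceCapability(9, 0)
print(cap.as_version_str(), cap.to_int())        # "9.0", 90
print(current_platform.get_device_name())
print(current_platform.is_rocm())
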
build/torch27-cxx11-cu126-aarch64-linux/quantization/scalar_type.py
CHANGED
@@ -1,9 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import functools
 import struct
 from dataclasses import dataclass
 from enum import Enum
 from typing import Optional, Union

+_SCALAR_TYPES_ID_MAP = {}
+

 # Mirrors enum in `core/scalar_type.hpp`
 class NanRepr(Enum):
@@ -121,8 +126,8 @@ class ScalarType:
             min_raw = max_raw | sign_bit_double
             return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
         else:
-            assert (not self.is_signed() or
+            assert (not self.is_signed() or self.size_bits
+                    <= 64), "Cannot represent min as a int64_t"

             if self.is_signed():
                 return -(1 << (self.size_bits - 1))
@@ -156,6 +161,8 @@ class ScalarType:
        assert offset <= 64, \
            f"ScalarType fields too big {offset} to fit into an int64"

+        _SCALAR_TYPES_ID_MAP[val] = self
+
        return val

    @property
@@ -293,6 +300,13 @@ class ScalarType:
        ret.id  # noqa B018: make sure the id is cached
        return ret

+    @classmethod
+    def from_id(cls, scalar_type_id: int):
+        if scalar_type_id not in _SCALAR_TYPES_ID_MAP:
+            raise ValueError(
+                f"scalar_type_id {scalar_type_id} doesn't exist.")
+        return _SCALAR_TYPES_ID_MAP[scalar_type_id]
+

 # naming generally follows: https://github.com/jax-ml/ml_dtypes
 # for floating point types (leading f) the scheme is:
@@ -319,6 +333,9 @@ class scalar_types:
    # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
    float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)

+    # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)
+
    # "gptq" types
    uint2b2 = ScalarType.uint(2, 2)
    uint3b4 = ScalarType.uint(3, 4)
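
The new id map lets Python-side code recover a `ScalarType` from the raw integer that is forwarded to the C++ kernels. A small round-trip sketch (assumes the built wheel is importable):

from quantization.scalar_type import ScalarType, scalar_types

wtype = scalar_types.float4_e2m1f            # the NVFP4 type added in this diff
type_id = wtype.id                           # the int passed across the op boundary
assert ScalarType.from_id(type_id) == wtype  # recovered on the Python side
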
build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils.py
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
-
|
|
|
|
|
|
|
2 |
|
3 |
import numpy
|
4 |
import torch
|
@@ -42,7 +45,9 @@ USE_FP32_REDUCE_DEFAULT = True
|
|
42 |
# without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
|
43 |
# TODO: we may want to move this into the C++ so its closer to the actual impl
|
44 |
def query_marlin_supported_quant_types(
|
45 |
-
has_zp:
|
|
|
|
|
46 |
):
|
47 |
if device_capability is None:
|
48 |
capability_tuple = torch.cuda.get_device_capability()
|
@@ -51,137 +56,141 @@ def query_marlin_supported_quant_types(
|
|
51 |
if device_capability < 80:
|
52 |
return []
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
if has_zp:
|
55 |
# AWQ style, unsigned + runtime zero-point
|
56 |
-
return [scalar_types.uint4
|
57 |
else:
|
58 |
# GPTQ style, unsigned + symmetric bias
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
62 |
|
63 |
|
64 |
def _check_marlin_supported(
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
) -> Tuple[bool, Optional[str]]:
|
70 |
|
71 |
if device_capability is None:
|
72 |
capability_tuple = torch.cuda.get_device_capability()
|
73 |
device_capability = capability_tuple[0] * 10 + capability_tuple[1]
|
74 |
|
75 |
-
supported_types = query_marlin_supported_quant_types(
|
|
|
76 |
|
77 |
if quant_type not in supported_types:
|
78 |
-
return (
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
return (
|
87 |
-
False,
|
88 |
-
f"Marlin does not support group_size = {group_size}. "
|
89 |
-
f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
|
90 |
-
"are supported.",
|
91 |
-
)
|
92 |
|
93 |
return True, None
|
94 |
|
95 |
|
96 |
-
def check_marlin_supported(
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
)
|
102 |
-
cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, device_capability)
|
103 |
return cond
|
104 |
|
105 |
|
106 |
-
def verify_marlin_supported(
|
107 |
-
|
108 |
-
) -> None:
|
109 |
cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
|
110 |
if not cond:
|
111 |
assert err_msg is not None
|
112 |
raise ValueError(err_msg)
|
113 |
|
114 |
|
115 |
-
def verify_marlin_supports_shape(
|
116 |
-
|
117 |
-
|
118 |
-
input_size: int,
|
119 |
-
group_size: int,
|
120 |
-
) -> None:
|
121 |
|
122 |
# Validate output_size_per_partition
|
123 |
if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
|
124 |
-
raise ValueError(
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
"with --quantization gptq."
|
130 |
-
)
|
131 |
|
132 |
# Validate input_size_per_partition
|
133 |
if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
|
134 |
-
raise ValueError(
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
if group_size < input_size and input_size_per_partition % group_size != 0:
|
143 |
raise ValueError(
|
144 |
f"Weight input_size_per_partition = {input_size_per_partition}"
|
145 |
-
f" is not divisible by group_size = {group_size}."
|
146 |
"Consider reducing tensor_parallel_size or running "
|
147 |
-
"with --quantization gptq."
|
148 |
-
)
|
149 |
|
150 |
|
151 |
-
def check_marlin_supports_shape(
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
group_size: int,
|
156 |
-
) -> Tuple[bool, Optional[str]]:
|
157 |
try:
|
158 |
-
verify_marlin_supports_shape(
|
159 |
-
|
160 |
-
|
161 |
except ValueError as e:
|
162 |
return False, e.__str__()
|
163 |
return True, None
|
164 |
|
165 |
|
166 |
-
def marlin_make_workspace(
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
|
171 |
-
) * GPTQ_MARLIN_MAX_PARALLEL
|
172 |
|
173 |
-
return torch.zeros(
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
|
177 |
|
178 |
def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
|
179 |
return (not act_order) or (act_order and not is_row_parallel)
|
180 |
|
181 |
|
182 |
-
def marlin_repeat_scales_on_all_ranks(
|
183 |
-
|
184 |
-
) -> bool:
|
185 |
# Need to repeat scales on every rank if act_ordering or
|
186 |
# channelwise and RowParallelLinear
|
187 |
is_channelwise = group_size == -1
|
@@ -189,35 +198,34 @@ def marlin_repeat_scales_on_all_ranks(
|
|
189 |
|
190 |
|
191 |
def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
|
192 |
-
return torch.nn.Parameter(
|
193 |
-
|
194 |
-
)
|
195 |
|
196 |
|
197 |
def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
|
198 |
-
return torch.nn.Parameter(
|
199 |
-
|
200 |
-
)
|
201 |
|
202 |
|
203 |
-
def marlin_sort_g_idx(
|
|
|
204 |
g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
|
205 |
return g_idx[g_idx_sort_indices], g_idx_sort_indices
|
206 |
|
207 |
|
208 |
def get_scale_perms():
|
209 |
-
scale_perm:
|
210 |
for i in range(8):
|
211 |
scale_perm.extend([i + 8 * j for j in range(8)])
|
212 |
-
scale_perm_single:
|
213 |
for i in range(4):
|
214 |
-
scale_perm_single.extend(
|
|
|
215 |
return scale_perm, scale_perm_single
|
216 |
|
217 |
|
218 |
-
def marlin_permute_scales(
|
219 |
-
|
220 |
-
) -> torch.Tensor:
|
221 |
|
222 |
scale_perm, scale_perm_single = get_scale_perms()
|
223 |
if group_size < size_k and group_size != -1:
|
@@ -247,9 +255,8 @@ def marlin_moe_permute_scales(
|
|
247 |
return output
|
248 |
|
249 |
|
250 |
-
def marlin_zero_points(
|
251 |
-
|
252 |
-
) -> torch.Tensor:
|
253 |
# Permute zero-points in a similar way to scales, but do not use the
|
254 |
# "single" permutation, since zero-points are applied on every MMA
|
255 |
scale_perm, _ = get_scale_perms()
|
@@ -270,9 +277,8 @@ def marlin_zero_points(
|
|
270 |
return zp
|
271 |
|
272 |
|
273 |
-
def awq_to_marlin_zero_points(
|
274 |
-
|
275 |
-
) -> torch.Tensor:
|
276 |
# AWQ zero-points are quantized and packed on the column dim.
|
277 |
# In addition, the values are permuted based on dequantizer.
|
278 |
# Here we undo both of these, and then apply marlin permutation
|
@@ -294,9 +300,8 @@ def awq_to_marlin_zero_points(
|
|
294 |
return marlin_zp
|
295 |
|
296 |
|
297 |
-
def moe_awq_to_marlin_zero_points(
|
298 |
-
|
299 |
-
):
|
300 |
num_experts = q_zp_packed.shape[0]
|
301 |
output = torch.empty(
|
302 |
(num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
|
@@ -304,45 +309,97 @@ def moe_awq_to_marlin_zero_points(
|
|
304 |
dtype=q_zp_packed.dtype,
|
305 |
)
|
306 |
for e in range(num_experts):
|
307 |
-
output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
|
|
|
308 |
return output
|
309 |
|
310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
def apply_gptq_marlin_linear(
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
) -> torch.Tensor:
|
326 |
reshaped_x = input.reshape(-1, input.shape[-1])
|
327 |
-
out_shape = input.shape[:-1] + (output_size_per_partition,)
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
|
347 |
if bias is not None:
|
348 |
output.add_(bias) # In-place add
|
@@ -351,39 +408,43 @@ def apply_gptq_marlin_linear(
|
|
351 |
|
352 |
|
353 |
def apply_awq_marlin_linear(
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
) -> torch.Tensor:
|
367 |
reshaped_x = input.reshape(-1, input.shape[-1])
|
368 |
-
out_shape = input.shape[:-1] + (output_size_per_partition,)
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
|
|
|
|
|
|
|
|
|
|
387 |
|
388 |
if bias is not None:
|
389 |
output.add_(bias) # In-place add
|
|
|
1 |
+
# SPDX-License-Identifier: Apache-2.0
|
2 |
+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
3 |
+
|
4 |
+
from typing import Optional
|
5 |
|
6 |
import numpy
|
7 |
import torch
|
|
|
45 |
# without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
|
46 |
# TODO: we may want to move this into the C++ so its closer to the actual impl
|
47 |
def query_marlin_supported_quant_types(
|
48 |
+
has_zp: Optional[bool] = None,
|
49 |
+
include_fp_type: bool = True,
|
50 |
+
device_capability: Optional[int] = None,
|
51 |
):
|
52 |
if device_capability is None:
|
53 |
capability_tuple = torch.cuda.get_device_capability()
|
|
|
56 |
if device_capability < 80:
|
57 |
return []
|
58 |
|
59 |
+
# - has_zp is True: return quant_types that has zero points
|
60 |
+
# - has_zp is False: return quant_types that has not zero points
|
61 |
+
# - has_zp is None: both
|
62 |
+
if has_zp is None:
|
63 |
+
types0 = query_marlin_supported_quant_types(False, include_fp_type,
|
64 |
+
device_capability)
|
65 |
+
types1 = query_marlin_supported_quant_types(True, include_fp_type,
|
66 |
+
device_capability)
|
67 |
+
return types0 + types1
|
68 |
+
|
69 |
if has_zp:
|
70 |
# AWQ style, unsigned + runtime zero-point
|
71 |
+
return [scalar_types.uint4]
|
72 |
else:
|
73 |
# GPTQ style, unsigned + symmetric bias
|
74 |
+
res = [scalar_types.uint4b8, scalar_types.uint8b128]
|
75 |
+
if include_fp_type:
|
76 |
+
res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]
|
77 |
+
return res
|
78 |
|
79 |
|
80 |
def _check_marlin_supported(
|
81 |
+
quant_type: ScalarType,
|
82 |
+
group_size: Optional[int],
|
83 |
+
has_zp: bool,
|
84 |
+
device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]:
|
|
|
85 |
|
86 |
if device_capability is None:
|
87 |
capability_tuple = torch.cuda.get_device_capability()
|
88 |
device_capability = capability_tuple[0] * 10 + capability_tuple[1]
|
89 |
|
90 |
+
supported_types = query_marlin_supported_quant_types(
|
91 |
+
has_zp, True, device_capability)
|
92 |
|
93 |
if quant_type not in supported_types:
|
94 |
+
return (False, f"Marlin does not support weight_bits = {quant_type}. "
|
95 |
+
f"Only types = {supported_types} "
|
96 |
+
f"are supported (for group_size = {group_size}, "
|
97 |
+
f"device_capability = {device_capability}, zp = {has_zp}).")
|
98 |
+
if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES):
|
99 |
+
return (False, f"Marlin does not support group_size = {group_size}. "
|
100 |
+
f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
|
101 |
+
"are supported.")
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
return True, None
|
104 |
|
105 |
|
106 |
+
def check_marlin_supported(quant_type: ScalarType,
|
107 |
+
group_size: int,
|
108 |
+
has_zp: bool = False,
|
109 |
+
device_capability: Optional[int] = None) -> bool:
|
110 |
+
cond, _ = _check_marlin_supported(quant_type, group_size, has_zp,
|
111 |
+
device_capability)
|
|
|
112 |
return cond
|
113 |
|
114 |
|
115 |
+
def verify_marlin_supported(quant_type: ScalarType,
|
116 |
+
group_size: int,
|
117 |
+
has_zp: bool = False) -> None:
|
118 |
cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
|
119 |
if not cond:
|
120 |
assert err_msg is not None
|
121 |
raise ValueError(err_msg)
|
122 |
|
123 |
|
124 |
+
def verify_marlin_supports_shape(output_size_per_partition: int,
|
125 |
+
input_size_per_partition: int,
|
126 |
+
input_size: int, group_size: int) -> None:
|
|
|
|
|
|
|
127 |
|
128 |
# Validate output_size_per_partition
|
129 |
if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
|
130 |
+
raise ValueError(f"Weight output_size_per_partition = "
|
131 |
+
f"{output_size_per_partition} is not divisible by "
|
132 |
+
f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
|
133 |
+
"Consider reducing tensor_parallel_size or running "
|
134 |
+
"with --quantization gptq.")
|
|
|
|
|
135 |
|
136 |
# Validate input_size_per_partition
|
137 |
if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
|
138 |
+
raise ValueError(f"Weight input_size_per_partition = "
|
139 |
+
f"{input_size_per_partition} is not divisible "
|
140 |
+
f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
|
141 |
+
"Consider reducing tensor_parallel_size or running "
|
142 |
+
"with --quantization gptq.")
|
143 |
+
|
144 |
+
if (group_size < input_size
|
145 |
+
and input_size_per_partition % group_size != 0):
|
|
|
146 |
raise ValueError(
|
147 |
f"Weight input_size_per_partition = {input_size_per_partition}"
|
148 |
+
f" is not divisible by group_size = {group_size}. "
|
149 |
"Consider reducing tensor_parallel_size or running "
|
150 |
+
"with --quantization gptq.")
|
|
|
151 |
|
152 |
|
153 |
+
def check_marlin_supports_shape(output_size_per_partition: int,
|
154 |
+
input_size_per_partition: int,
|
155 |
+
input_size: int, group_size: int) \
|
156 |
+
-> tuple[bool, Optional[str]]:
|
|
|
|
|
157 |
try:
|
158 |
+
verify_marlin_supports_shape(output_size_per_partition,
|
159 |
+
input_size_per_partition, input_size,
|
160 |
+
group_size)
|
161 |
except ValueError as e:
|
162 |
return False, e.__str__()
|
163 |
return True, None
|
164 |
|
165 |
|
166 |
+
def marlin_make_workspace(output_size_per_partition: int,
|
167 |
+
device: torch.device) -> torch.Tensor:
|
168 |
+
max_workspace_size = (output_size_per_partition //
|
169 |
+
GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
|
|
|
|
|
170 |
|
171 |
+
return torch.zeros(max_workspace_size,
|
172 |
+
dtype=torch.int,
|
173 |
+
device=device,
|
174 |
+
requires_grad=False)
|
175 |
+
|
176 |
+
|
177 |
+
def marlin_make_workspace_new(device: torch.device,
|
178 |
+
max_blocks_per_sm: int = 1) -> torch.Tensor:
|
179 |
+
# In the new marlin kernel, we use the num of threadblocks as workspace
|
180 |
+
# size. The num of threadblocks is is sms_count * max_blocks_per_sm.
|
181 |
+
sms = torch.cuda.get_device_properties(device).multi_processor_count
|
182 |
+
return torch.zeros(sms * max_blocks_per_sm,
|
183 |
+
dtype=torch.int,
|
184 |
+
device=device,
|
185 |
+
requires_grad=False)
|
186 |
|
187 |
|
188 |
def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
|
189 |
return (not act_order) or (act_order and not is_row_parallel)
|
190 |
|
191 |
|
192 |
+
def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
|
193 |
+
is_row_parallel: bool) -> bool:
|
|
|
194 |
# Need to repeat scales on every rank if act_ordering or
|
195 |
# channelwise and RowParallelLinear
|
196 |
is_channelwise = group_size == -1
|
|
|
198 |
|
199 |
|
200 |
def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
|
201 |
+
return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
|
202 |
+
requires_grad=False)
|
|
|
203 |
|
204 |
|
205 |
def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
|
206 |
+
return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
|
207 |
+
requires_grad=False)
|
|
|
208 |
|
209 |
|
210 |
+
def marlin_sort_g_idx(
|
211 |
+
g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
212 |
g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
|
213 |
return g_idx[g_idx_sort_indices], g_idx_sort_indices
|
214 |
|
215 |
|
216 |
def get_scale_perms():
|
217 |
+
scale_perm: list[int] = []
|
218 |
for i in range(8):
|
219 |
scale_perm.extend([i + 8 * j for j in range(8)])
|
220 |
+
scale_perm_single: list[int] = []
|
221 |
for i in range(4):
|
222 |
+
scale_perm_single.extend(
|
223 |
+
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
|
224 |
return scale_perm, scale_perm_single
|
225 |
|
226 |
|
227 |
+
def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
|
228 |
+
group_size: int) -> torch.Tensor:
|
|
|
229 |
|
230 |
scale_perm, scale_perm_single = get_scale_perms()
|
231 |
if group_size < size_k and group_size != -1:
|
|
|
255 |
return output
|
256 |
|
257 |
|
258 |
+
def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
|
259 |
+
                        num_bits: int) -> torch.Tensor:
     # Permute zero-points in a similar way to scales, but do not use the
     # "single" permutation, since zero-points are applied on every MMA
     scale_perm, _ = get_scale_perms()
     ...
     return zp


+def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
+                              size_n: int, num_bits: int) -> torch.Tensor:
     # AWQ zero-points are quantized and packed on the column dim.
     # In addition, the values are permuted based on dequantizer.
     # Here we undo both of these, and then apply marlin permutation
     ...
     return marlin_zp


+def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
+                                  size_n: int, num_bits: int):
     num_experts = q_zp_packed.shape[0]
     output = torch.empty(
         (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
         ...
         dtype=q_zp_packed.dtype,
     )
     for e in range(num_experts):
+        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
+                                              num_bits)
     return output


+def maybe_warn_marlin_atomic_add(device, dtype):
+    if torch.compiler.is_dynamo_compiling():
+        return
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        logger.info_once(
+            "You are running Marlin kernel with bf16 on GPUs before SM90. "
+            "You can consider change to fp16 to achieve better performance "
+            "if possible.")
+
+
+def maybe_warn_marlin_atomic_add_env():
+    if torch.compiler.is_dynamo_compiling():
+        return
+    if envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+        return
+    logger.info_once(
+        "Marlin kernel can achieve better performance for small size_n "
+        "with experimental use_atomic_add feature. "
+        "You can consider set environment variable "
+        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible.")
+
+
+def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
+                                 dtype: torch.dtype) -> bool:
+
+    # the performance of atomicAdd is better than global reduce
+    # only when m*n is small and k is large
+    if n >= 2048 or k < 2048 or device.type != "cuda":
+        return False
+
+    # disable atomicAdd reduce by default,
+    # one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
+    if not envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+        maybe_warn_marlin_atomic_add_env()
+        return False
+
+    # sm8x doesn't support atomicAdd + bfloat16 natively
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        maybe_warn_marlin_atomic_add(device, dtype)
+        return False
+
+    return True
+
+
 def apply_gptq_marlin_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_zp: torch.Tensor,
+        g_idx: torch.Tensor,
+        g_idx_sort_indices: torch.Tensor,
+        workspace: torch.Tensor,
+        wtype: ScalarType,
+        output_size_per_partition: int,
+        input_size_per_partition: int,
+        is_k_full: bool,
+        bias: Optional[torch.Tensor] = None,
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=output_size_per_partition,
+                                                  k=reshaped_x.size(1),
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(reshaped_x,
+                                  None,
+                                  weight,
+                                  weight_scale,
+                                  None,
+                                  weight_zp,
+                                  g_idx,
+                                  g_idx_sort_indices,
+                                  workspace,
+                                  wtype,
+                                  size_m=reshaped_x.shape[0],
+                                  size_n=output_size_per_partition,
+                                  size_k=input_size_per_partition,
+                                  is_k_full=is_k_full,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)

     if bias is not None:
         output.add_(bias)  # In-place add
     ...

 def apply_awq_marlin_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_zp: torch.Tensor,
+        g_idx: torch.Tensor,
+        g_idx_sort_indices: torch.Tensor,
+        workspace: torch.Tensor,
+        quant_type: ScalarType,
+        output_size_per_partition: int,
+        input_size_per_partition: int,
+        bias: Optional[torch.Tensor] = None,
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=output_size_per_partition,
+                                                  k=reshaped_x.size(1),
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(reshaped_x,
+                                  None,
+                                  weight,
+                                  weight_scale,
+                                  None,
+                                  weight_zp,
+                                  g_idx,
+                                  g_idx_sort_indices,
+                                  workspace,
+                                  quant_type,
+                                  size_m=reshaped_x.shape[0],
+                                  size_n=output_size_per_partition,
+                                  size_k=input_size_per_partition,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)

     if bias is not None:
         output.add_(bias)  # In-place add
build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py ADDED
@@ -0,0 +1,282 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import torch

import quantization as ops

from .marlin_utils import (
    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
    should_use_atomic_add_reduce)
from quantization.scalar_type import scalar_types

FP4_MARLIN_SUPPORTED_GROUP_SIZES = [16]


def is_fp4_marlin_supported():
    capability = torch.cuda.get_device_capability()
    capability = capability[0] * 10 + capability[1]
    return capability >= 80


def fp4_marlin_process_scales(marlin_scales):
    if not (marlin_scales >= 0).all():
        logger.warning_once(
            "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
            "negative scales. Accuracy will likely be degraded. This is "
            "because it changes the scales from FP8-S1E4M3 to a special "
            "FP8-S0E5M3 format to speedup the dequantization.")

    # convert to half first; we will convert to fp8 later
    marlin_scales = marlin_scales.to(torch.half)

    # 8 is the number of scales used by one thread
    marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
    marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
        marlin_scales.size(0) * 2, -1)

    # fit the layout of fp8 dequantization
    marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
        marlin_scales.size(0), -1)

    # We assume that weight_scale (FP8-S1E4M3) is always greater
    # than or equal to 0. So we can convert
    # weight_scale * (2 ** 7) into a special FP8-S0E5M3 format.
    # After multiplying by 2 ** 7, the top bit of FP8-S0E5M3 would always be 1
    # when weight_scale > 0. This allows us to have an exponent bias
    # closer to zero after dequantization.

    marlin_scales = (marlin_scales * (2**7)).view(torch.int16) << 1
    marlin_scales = marlin_scales.view(torch.float8_e4m3fn)
    marlin_scales = marlin_scales[:, 1::2].contiguous()

    return marlin_scales


def fp4_marlin_process_global_scale(global_scale):
    assert global_scale.dtype in [torch.half, torch.bfloat16]
    fp4_exponent = 2
    if global_scale.dtype == torch.half:
        target_exponent = 5
    elif global_scale.dtype == torch.bfloat16:
        target_exponent = 8
    # exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
    # exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
    exponent_bias = 2**(target_exponent - 1) - 2**(fp4_exponent - 1)
    return global_scale * (2.0**(exponent_bias - 7))


def apply_fp4_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_scale_2: torch.Tensor,
        workspace: torch.Tensor,
        size_n: int,
        size_k: int,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    # For GPUs that lack FP4 hardware support, we can leverage the
    # Marlin kernel for fast weight-only FP4 quantization

    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (size_n, )

    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
                                                  n=size_n,
                                                  k=size_k,
                                                  device=input.device,
                                                  dtype=input.dtype)

    output = ops.gptq_marlin_gemm(a=reshaped_x,
                                  c=None,
                                  b_q_weight=weight,
                                  b_scales=weight_scale,
                                  global_scale=weight_scale_2,
                                  b_zeros=None,
                                  g_idx=None,
                                  perm=None,
                                  workspace=workspace,
                                  b_q_type=scalar_types.float4_e2m1f,
                                  size_m=reshaped_x.size(0),
                                  size_n=size_n,
                                  size_k=size_k,
                                  use_atomic_add=use_atomic_add,
                                  use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        output.add_(bias)  # In-place add

    return output.reshape(out_shape)


def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    part_size_n = layer.output_size_per_partition
    part_size_k = layer.input_size_per_partition
    param_dtype = layer.params_dtype

    assert layer.weight.shape == (part_size_n, part_size_k // 2)

    device = layer.weight.device

    # WORKSPACE
    layer.workspace = marlin_make_workspace_new(device)

    # WEIGHT
    # Repack weights to marlin format
    perm = torch.empty(0, dtype=torch.int, device=device)
    qweight = layer.weight.view(torch.int32).T.contiguous()

    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                            perm=perm,
                                            size_k=part_size_k,
                                            size_n=part_size_n,
                                            num_bits=4)
    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)

    # WEIGHT SCALES
    # Permute scales
    weight_scale = layer.weight_scale.T.to(param_dtype)
    weight_scale = marlin_permute_scales(s=weight_scale,
                                         size_k=part_size_k,
                                         size_n=part_size_n,
                                         group_size=16)
    weight_scale = fp4_marlin_process_scales(weight_scale)
    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)

    weight_scale_2 = layer.weight_scale_2.to(param_dtype)
    weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
    layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
                                              requires_grad=False)

    return


def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    e = layer.num_experts
    k = layer.hidden_size
    n = layer.intermediate_size_per_partition

    # WORKSPACE
    device = layer.w13_weight.device
    param_dtype = layer.params_dtype
    layer.workspace = marlin_make_workspace_new(device, 4)
    perm = torch.empty(0, dtype=torch.int, device=device)

    # WEIGHT
    # Repack weights to marlin format
    for name in ["w13_weight", "w2_weight"]:
        weight = getattr(layer, name)
        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        assert weight.shape == (e, size_n, size_k // 2)

        for i in range(e):
            qweight = weight[i].view(torch.int32).T.contiguous()

            marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                                    perm=perm,
                                                    size_k=size_k,
                                                    size_n=size_n,
                                                    num_bits=4)
            tensor_list.append(marlin_qweight)

        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        weight = torch.nn.Parameter(weight, requires_grad=False)

        setattr(layer, name, weight)

    # WEIGHT SCALES
    # Permute scales
    for name in ["w13", "w2"]:
        scales = getattr(layer, name + "_weight_scale").to(param_dtype)
        global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)

        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        for i in range(e):
            marlin_scales = marlin_permute_scales(s=scales[i].T,
                                                  size_k=size_k,
                                                  size_n=size_n,
                                                  group_size=16)
            marlin_scales = fp4_marlin_process_scales(marlin_scales)
            tensor_list.append(marlin_scales)

        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        scales = torch.nn.Parameter(scales, requires_grad=False)
        setattr(layer, name + "_weight_scale", scales)

        global_scale = fp4_marlin_process_global_scale(global_scale)
        global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
        setattr(layer, name + "_weight_scale_2", global_scale)


def rand_marlin_weight_fp4_like(weight, group_size):
    assert group_size > 0
    size_n, size_k = weight.shape
    device = weight.device

    scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 6
    global_scale = scales.max() / 448
    scales = (scales / global_scale).to(torch.float8_e4m3fn)

    fp4_weight = torch.randint(0,
                               256, (size_n, size_k // 2),
                               dtype=torch.uint8,
                               device=weight.device)
    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
                         ((fp4_weight & 0b01110000) >> 2))
    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)

    fp4_weight2 = fp4_weight << 4
    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
                         ((fp4_weight2 & 0b01110000) >> 2))
    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)

    weight_ref = torch.cat(
        [fp4_weight_part_2.unsqueeze(2),
         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
    weight_ref = weight_ref * global_scale.to(weight.dtype) * \
        scales.repeat_interleave(group_size, 1).to(weight.dtype)

    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
        perm=torch.empty(0, dtype=torch.int, device=device),
        size_k=size_k,
        size_n=size_n,
        num_bits=4,
    )

    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
                                          size_k=size_k,
                                          size_n=size_n,
                                          group_size=group_size)
    marlin_scales = fp4_marlin_process_scales(marlin_scales)

    global_scale = fp4_marlin_process_global_scale(global_scale)

    return weight_ref.T, marlin_qweight, marlin_scales, global_scale
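The helpers in this file can be exercised end to end: `rand_marlin_weight_fp4_like` fabricates a packed FP4 weight together with matching Marlin scales and a dequantized reference, and `apply_fp4_marlin_linear` runs the weight-only GEMM against them. A hedged sketch, with illustrative shapes and tolerances; it assumes a CUDA device on which `is_fp4_marlin_supported()` is True and that the compiled `quantization` extension from this build is installed:

    import torch
    from quantization.utils.marlin_utils import marlin_make_workspace_new
    from quantization.utils.marlin_utils_fp4 import (
        apply_fp4_marlin_linear, is_fp4_marlin_supported,
        rand_marlin_weight_fp4_like)

    if torch.cuda.is_available() and is_fp4_marlin_supported():
        size_n, size_k = 256, 512
        weight = torch.randn(size_n, size_k, dtype=torch.half, device="cuda")

        # weight_ref comes back transposed, i.e. with shape (size_k, size_n)
        weight_ref, qweight, scales, global_scale = rand_marlin_weight_fp4_like(
            weight, group_size=16)

        x = torch.randn(8, size_k, dtype=torch.half, device="cuda")
        workspace = marlin_make_workspace_new(x.device)

        out = apply_fp4_marlin_linear(x, qweight, scales, global_scale,
                                      workspace, size_n=size_n, size_k=size_k)
        # loose tolerance: FP4 weights plus FP8 group scales are very coarse
        torch.testing.assert_close(out, x @ weight_ref, rtol=1e-1, atol=1e-1)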
build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py CHANGED
@@ -1,10 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 from typing import Optional

 import torch

 import quantization as ops

-from .marlin_utils import marlin_make_workspace, marlin_permute_scales
+from .marlin_utils import (USE_FP32_REDUCE_DEFAULT, marlin_make_workspace,
+                           marlin_permute_scales, should_use_atomic_add_reduce)
+from quantization.scalar_type import scalar_types


 def is_fp8_marlin_supported():
@@ -13,88 +16,107 @@ def is_fp8_marlin_supported():
     return capability >= 80


+def fp8_fused_exponent_bias_into_scales(scales):
+    fp8_exponent = 4
+    if scales.dtype == torch.half:
+        target_exponent = 5
+    elif scales.dtype == torch.bfloat16:
+        target_exponent = 8
+    # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
+    # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
+    exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
+    s = torch.ones_like(scales) * 2
+    s = s**exponent_bias
+    return scales * s
+
+
 def apply_fp8_marlin_linear(
-        ...
-) -> torch.Tensor:
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        workspace: torch.Tensor,
+        size_n: int,
+        size_k: int,
+        bias: Optional[torch.Tensor],
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     # For GPUs that lack FP8 hardware support, we can leverage the
     # Marlin kernel for fast weight-only FP8 quantization

     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (size_n,)
-    ...
+    out_shape = input.shape[:-1] + (size_n, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=size_n,
+                                                  k=size_k,
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(a=reshaped_x,
+                                  c=None,
+                                  b_q_weight=weight,
+                                  b_scales=weight_scale,
+                                  global_scale=None,
+                                  b_zeros=None,
+                                  g_idx=None,
+                                  perm=None,
+                                  workspace=workspace,
+                                  b_q_type=scalar_types.float8_e4m3fn,
+                                  size_m=reshaped_x.size(0),
+                                  size_n=size_n,
+                                  size_k=size_k,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce)

     if bias is not None:
         output.add_(bias)  # In-place add

     return output.reshape(out_shape)

-    ...
-        layer: torch.nn.Module, strategy: str = "tensor"
-) -> None:
-    part_size_n = layer.output_size_per_partition
-    part_size_k = layer.input_size_per_partition
-
-    device = layer.weight.device
-
-    # WORKSPACE
-    layer.workspace = marlin_make_workspace(part_size_n, device)
-
-    # WEIGHT
-    # Repack weights to marlin format
-    marlin_qweight = ops.gptq_marlin_repack(
-        b_q_weight=pack_fp8_to_int32(layer.weight),
-        perm=torch.empty(0, dtype=torch.int, device=device),
-        size_k=part_size_k,
-        size_n=part_size_n,
-        num_bits=8,
-    )
-    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
-
-    # WEIGHT SCALES
-    scales = layer.weight_scale.to(layer.orig_dtype)
-    # Permute scales
-    marlin_scales = marlin_permute_scales(
-        s=scales, size_k=part_size_k, size_n=part_size_n, group_size=-1
-    )
-    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
-
-
-def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
+def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
+                      size_k_first: bool = True) -> torch.Tensor:
     """
     Repack FP8 weights to gptq format (packed int32 elements)
     """
     assert fp8_tensor.dtype == torch.float8_e4m3fn
-    assert fp8_tensor.
-    ...
-    packed = (
-        byte_tensor[:, 0].to(torch.int32)
-        | (byte_tensor[:, 1].to(torch.int32) << 8)
-        | (byte_tensor[:, 2].to(torch.int32) << 16)
-        | (byte_tensor[:, 3].to(torch.int32) << 24)
-    )
-
-    return
+    assert fp8_tensor.ndim == 2
+
+    fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
+    fp8_tensor = fp8_tensor.contiguous()
+    # fp8_tensor is contiguous and has shape (N, K) now;
+    # with `.view(torch.int32)` it becomes (N, K // 4)
+    int32_tensor = fp8_tensor.view(torch.int32)
+    return int32_tensor.T.contiguous() if size_k_first else int32_tensor


+def marlin_quant_fp8_torch(weight, group_size):
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    if group_size != -1:
+        scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(group_size, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+    else:
+        scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(size_k, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+
+    packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=packed_weight,
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=8,
+    )
+
+    marlin_scales = marlin_permute_scales(s=scales.T,
+                                          size_k=size_k,
+                                          size_n=size_n,
+                                          group_size=group_size)
+
+    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
+
+    return weight_ref.T, marlin_qweight, marlin_scales
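The new `pack_fp8_to_int32` replaces the old explicit shift-and-or packing with a dtype reinterpretation: four consecutive FP8 bytes along the k dimension are viewed as one int32. A small CPU-only sketch (assumes a PyTorch build with the storage-only fp8 dtypes and a little-endian host, which is what the old byte-shift code produced as well):

    import torch

    k, n = 8, 4
    fp8 = torch.randn(k, n).to(torch.float8_e4m3fn)     # size_k-first, as in GPTQ

    # equivalent of pack_fp8_to_int32(fp8, size_k_first=True)
    packed = fp8.T.contiguous().view(torch.int32)        # (n, k // 4)
    packed = packed.T.contiguous()                       # back to k-first: (k // 4, n)

    print(packed.shape)                                  # torch.Size([2, 4])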
build/torch27-cxx11-cu128-aarch64-linux/quantization/__init__.py CHANGED
@@ -1,12 +1,12 @@
 from .compressed_tensors import scaled_fp8_quant, scaled_int8_quant
 from .cutlass import (
+    cutlass_scaled_mm_supports_block_fp8,
     cutlass_scaled_mm_supports_fp8,
     cutlass_scaled_mm,
     cutlass_scaled_mm_azp,
 )
 from .marlin import (
     awq_marlin_repack,
-    fp8_marlin_gemm,
     gptq_marlin_gemm,
     gptq_marlin_repack,
     gptq_marlin_24_gemm,
@@ -25,8 +25,8 @@ __all__ = [
     "awq_marlin_repack",
     "cutlass_scaled_mm",
     "cutlass_scaled_mm_azp",
+    "cutlass_scaled_mm_supports_block_fp8",
     "cutlass_scaled_mm_supports_fp8",
-    "fp8_marlin_gemm",
     "gptq_marlin_24_gemm",
     "gptq_marlin_gemm",
     "gptq_marlin_repack",
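After this change the package no longer exports `fp8_marlin_gemm`; FP8 weights go through the unified `gptq_marlin_gemm` entry point instead, and the block-FP8 CUTLASS capability probe is newly exported. A quick import check, assuming the wheel from this build is installed under the name `quantization`:

    import quantization
    from quantization import (awq_marlin_repack, gptq_marlin_gemm,
                              cutlass_scaled_mm_supports_block_fp8)

    assert not hasattr(quantization, "fp8_marlin_gemm")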
build/torch27-cxx11-cu128-aarch64-linux/quantization/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _quantization_9035540
+ops = torch.ops._quantization_9035540

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_quantization_9035540::{op_name}"
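The op namespace embeds the build hash, so fake-op registration and direct dispatch both resolve against the same extension library. A small sketch (assumes the `_quantization_9035540` extension from this build is importable):

    from quantization._ops import ops, add_op_namespace_prefix

    print(add_op_namespace_prefix("gptq_marlin_gemm"))
    # '_quantization_9035540::gptq_marlin_gemm'
    print(hasattr(ops, "gptq_marlin_gemm"))  # True once the .so has loaded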
build/torch27-cxx11-cu128-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:4d670f7d449a8d177ce46784fb4617dcb0edc30f8d8a62305ed1213310256167
+size 296561248
build/torch27-cxx11-cu128-aarch64-linux/quantization/compressed_tensors.py CHANGED
@@ -2,17 +2,7 @@ from typing import Optional, Tuple

 import torch

-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
+from ._ops import ops

 # fp8
 def scaled_fp8_quant(
@@ -21,7 +11,8 @@ def scaled_fp8_quant(
     num_token_padding: Optional[int] = None,
     scale_ub: Optional[torch.Tensor] = None,
     use_per_token_if_dynamic: bool = False,
-    ...
+    output: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP8 and return quantized tensor and scale.

@@ -42,30 +33,36 @@ def scaled_fp8_quant(
        in the dynamic quantization case.

     Returns:
-        ...
+        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
        scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert input.ndim == 2
-    shape: Union[
-    # For
-    ...
-    # if current_platform.is_rocm() else torch.float8_e4m3fn
-    out_dtype = torch.float8_e4m3fn
+    assert (input.ndim == 2)
+    shape: Union[tuple[int, int], torch.Size] = input.shape
+    # For ROCm on MI300, the output fp8 dtype is torch.float8_e4m3fnuz
+    out_dtype: torch.dtype = current_platform.fp8_dtype()
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
-    output
+    if output is None:
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    else:
+        assert num_token_padding is None, \
+            "padding not supported if output passed in"
+        assert output.dtype == out_dtype

     if scale is None:
         if use_per_token_if_dynamic:
-            scale = torch.empty((shape[0], 1),
-            ...
+            scale = torch.empty((shape[0], 1),
+                                device=input.device,
+                                dtype=torch.float32)
+            ops.dynamic_per_token_scaled_fp8_quant(
+                output, input.contiguous(), scale, scale_ub)
         else:
             scale = torch.zeros(1, device=input.device, dtype=torch.float32)
             ops.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
-        assert scale.numel() == 1
+        assert (scale.numel() == 1 and num_token_padding is None)
         ops.static_scaled_fp8_quant(output, input, scale)

     return output, scale
@@ -76,8 +73,8 @@ def scaled_int8_quant(
     input: torch.Tensor,
     scale: Optional[torch.Tensor] = None,
     azp: Optional[torch.Tensor] = None,
-    symmetric: bool = True
-) ->
+    symmetric: bool = True
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     """
     Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.

@@ -90,21 +87,25 @@ def scaled_int8_quant(
       symmetric: Whether to use symmetric quantization (scale only, azp ignored).

     Returns:
-        ...
+        tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
     """
     output = torch.empty_like(input, dtype=torch.int8)
     if scale is not None:
         # static-per-tensor quantization.
         assert symmetric == (
-            azp
-            ...
+            azp
+            is None), "azp must only be provided for asymmetric quantization."
         ops.static_scaled_int8_quant(output, input, scale, azp)
         return output, scale, azp

     # dynamic-per-token quantization.
-    input_scales = torch.empty(
-    ...
-    input_azp = None if symmetric else torch.empty_like(input_scales,
-    ...
+    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
+                               device=input.device,
+                               dtype=torch.float32)
+    input_azp = None if symmetric else torch.empty_like(input_scales,
+                                                        dtype=torch.int32)
+    ops.dynamic_scaled_int8_quant(output, input.contiguous(),
+                                  input_scales, input_azp)
     return output, input_scales, input_azp
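A hedged usage sketch of the updated `scaled_fp8_quant` (dynamic per-token scales plus the new optional preallocated output buffer); shapes are illustrative and it assumes a CUDA device with this build's extension installed:

    import torch
    from quantization import scaled_fp8_quant

    if torch.cuda.is_available():
        x = torch.randn(16, 4096, dtype=torch.half, device="cuda")

        # dynamic per-token quantization: one fp32 scale per row
        q, scales = scaled_fp8_quant(x, use_per_token_if_dynamic=True)
        print(q.dtype, scales.shape)          # platform fp8 dtype, (16, 1)

        # caller-provided output buffer (num_token_padding is not allowed here)
        out = torch.empty_like(q)
        q2, scales2 = scaled_fp8_quant(x, output=out,
                                       use_per_token_if_dynamic=True)
        assert q2 is out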
build/torch27-cxx11-cu128-aarch64-linux/quantization/cutlass.py CHANGED
@@ -2,22 +2,18 @@ from typing import Optional

 import torch

-try:
-    ...
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
+from ._ops import ops
+from .platforms import current_platform


 def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)


+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+    return ops.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
+
+
 def cutlass_scaled_mm(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -33,12 +29,10 @@ def cutlass_scaled_mm(
     m = a.shape[0]
     n = b.shape[1]

-    ...
-    # triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-    # return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    if not cutlass_compatible_b:
+        from .triton_scaled_mm import triton_scaled_mm
+        return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
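The change above gates the CUTLASS path on the shape of `b` (both dimensions must be multiples of 16) and otherwise routes to the Triton fallback. A small capability/shape probe, assuming a CUDA device and this build's `quantization` package:

    import torch
    from quantization import (cutlass_scaled_mm_supports_fp8,
                              cutlass_scaled_mm_supports_block_fp8)

    if torch.cuda.is_available():
        major, minor = torch.cuda.get_device_capability()
        cap = major * 10 + minor
        print(cutlass_scaled_mm_supports_fp8(cap))
        print(cutlass_scaled_mm_supports_block_fp8(cap))

        # the same shape gate cutlass_scaled_mm applies before dispatching
        def cutlass_compatible(b: torch.Tensor) -> bool:
            return b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0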
build/torch27-cxx11-cu128-aarch64-linux/quantization/marlin.py CHANGED
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional

 import torch

@@ -30,58 +30,30 @@ except ImportError as e:
 from .scalar_type import ScalarType


-# fp8 marlin
-def fp8_marlin_gemm(
-    a: torch.Tensor,
-    b_q_weight: torch.Tensor,
-    b_scales: torch.Tensor,
-    workspace: torch.Tensor,
-    num_bits: int,
-    size_m: int,
-    size_n: int,
-    size_k: int,
-) -> torch.Tensor:
-    return ops.fp8_marlin_gemm(
-        a, b_q_weight, b_scales, workspace, num_bits, size_m, size_n, size_k
-    )
-
-
 # gptq_marlin
-def gptq_marlin_gemm(
-    ...
-) -> torch.Tensor:
-    return ops.gptq_marlin_gemm(
-        ...
-        perm,
-        workspace,
-        b_q_type.id,
-        size_m,
-        size_n,
-        size_k,
-        is_k_full,
-        has_zp,
-        use_fp32_reduce,
-        is_zp_float,
-    )
+def gptq_marlin_gemm(a: torch.Tensor,
+                     c: Optional[torch.Tensor],
+                     b_q_weight: torch.Tensor,
+                     b_scales: torch.Tensor,
+                     global_scale: Optional[torch.Tensor],
+                     b_zeros: Optional[torch.Tensor],
+                     g_idx: Optional[torch.Tensor],
+                     perm: Optional[torch.Tensor],
+                     workspace: torch.Tensor,
+                     b_q_type: ScalarType,
+                     size_m: int,
+                     size_n: int,
+                     size_k: int,
+                     is_k_full: bool = True,
+                     use_atomic_add: bool = False,
+                     use_fp32_reduce: bool = False,
+                     is_zp_float: bool = False) -> torch.Tensor:
+    return ops.gptq_marlin_gemm(a, c, b_q_weight, b_scales,
+                                global_scale, b_zeros, g_idx, perm,
+                                workspace, b_q_type.id, size_m,
+                                size_n, size_k, is_k_full,
+                                use_atomic_add, use_fp32_reduce,
+                                is_zp_float)

 # gptq_marlin
 def gptq_marlin_repack(
@@ -153,14 +125,6 @@ def marlin_qqq_gemm(
 # Fake ops

 if hasattr(ops, "gptq_marlin_24_gemm"):
-    @register_fake(add_op_namespace_prefix("fp8_marlin_gemm"))
-    def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
-                              b_scales: torch.Tensor, workspace: torch.Tensor,
-                              num_bits: int, size_m: torch.SymInt,
-                              size_n: torch.SymInt,
-                              size_k: torch.SymInt) -> torch.Tensor:
-        return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device)
-
     @register_fake(add_op_namespace_prefix("gptq_marlin_24_gemm"))
     def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
                                   b_meta: torch.Tensor, b_scales: torch.Tensor,
@@ -172,20 +136,22 @@ if hasattr(ops, "gptq_marlin_24_gemm"):

     @register_fake(add_op_namespace_prefix("gptq_marlin_gemm"))
     def _gptq_marlin_gemm_fake(a: torch.Tensor,
-                               ...
+                               c: Optional[torch.Tensor],
+                               b_q_weight: torch.Tensor,
+                               b_scales: torch.Tensor,
+                               global_scale: Optional[torch.Tensor],
+                               b_zeros: Optional[torch.Tensor],
+                               g_idx: Optional[torch.Tensor],
+                               perm: Optional[torch.Tensor],
+                               workspace: torch.Tensor,
+                               b_q_type_id: int,
+                               size_m: torch.SymInt,
+                               size_n: torch.SymInt,
+                               size_k: torch.SymInt,
+                               is_k_full: bool = True,
+                               use_atomic_add: bool = False,
+                               use_fp32_reduce: bool = False,
+                               is_zp_float: bool = False) -> torch.Tensor:
         return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)

     @register_fake(add_op_namespace_prefix("marlin_qqq_gemm"))
build/torch27-cxx11-cu128-aarch64-linux/quantization/platforms.py ADDED
@@ -0,0 +1,69 @@
from abc import ABC, abstractmethod
from functools import lru_cache
from typing import NamedTuple

import torch

IS_ROCM = torch.version.hip is not None


class DeviceCapability(NamedTuple):
    major: int
    minor: int

    def as_version_str(self) -> str:
        return f"{self.major}.{self.minor}"

    def to_int(self) -> int:
        """
        Express device capability as an integer ``<major><minor>``.

        It is assumed that the minor version is always a single digit.
        """
        assert 0 <= self.minor < 10
        return self.major * 10 + self.minor


class Platform(ABC):
    simple_compile_backend: str = "inductor"

    @classmethod
    @abstractmethod
    def get_device_name(cls, device_id: int = 0) -> str: ...

    @abstractmethod
    def is_rocm(self): ...


class CudaPlatform(Platform):
    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(0)

    def is_rocm(self):
        return False


class RocmPlatform(Platform):
    @classmethod
    @lru_cache(maxsize=8)
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(device_id)

    def is_rocm(self):
        return True


current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
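A short usage sketch of the platform shim above; `DeviceCapability` is pure Python, and `is_rocm()` does not require a GPU, so this runs anywhere the package imports:

    from quantization.platforms import DeviceCapability, current_platform

    print(current_platform.is_rocm())          # False on the CUDA builds here

    cap = DeviceCapability(major=9, minor=0)   # hypothetical SM90 device
    print(cap.as_version_str(), cap.to_int())  # "9.0" 90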
build/torch27-cxx11-cu128-aarch64-linux/quantization/scalar_type.py CHANGED
@@ -1,9 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import functools
 import struct
 from dataclasses import dataclass
 from enum import Enum
 from typing import Optional, Union

+_SCALAR_TYPES_ID_MAP = {}
+

 # Mirrors enum in `core/scalar_type.hpp`
 class NanRepr(Enum):
@@ -121,8 +126,8 @@ class ScalarType:
             min_raw = max_raw | sign_bit_double
             return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
         else:
-            assert (not self.is_signed() or
-                    ...
+            assert (not self.is_signed() or self.size_bits
+                    <= 64), "Cannot represent min as a int64_t"

             if self.is_signed():
                 return -(1 << (self.size_bits - 1))
@@ -156,6 +161,8 @@ class ScalarType:
         assert offset <= 64, \
             f"ScalarType fields too big {offset} to fit into an int64"

+        _SCALAR_TYPES_ID_MAP[val] = self
+
         return val

     @property
@@ -293,6 +300,13 @@ class ScalarType:
         ret.id  # noqa B018: make sure the id is cached
         return ret

+    @classmethod
+    def from_id(cls, scalar_type_id: int):
+        if scalar_type_id not in _SCALAR_TYPES_ID_MAP:
+            raise ValueError(
+                f"scalar_type_id {scalar_type_id} doesn't exists.")
+        return _SCALAR_TYPES_ID_MAP[scalar_type_id]
+

 # naming generally follows: https://github.com/jax-ml/ml_dtypes
 # for floating point types (leading f) the scheme is:
@@ -319,6 +333,9 @@ class scalar_types:
     # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
     float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)

+    # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)
+
     # "gptq" types
     uint2b2 = ScalarType.uint(2, 2)
     uint3b4 = ScalarType.uint(3, 4)
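Because computing an id now also records the type in `_SCALAR_TYPES_ID_MAP`, the packed integer id passed through the C++ ops can be mapped back to its Python `ScalarType`. A small round-trip sketch (only assumes this build's `quantization` package is importable):

    from quantization.scalar_type import ScalarType, scalar_types

    t = scalar_types.float4_e2m1f
    assert ScalarType.from_id(t.id) == t   # ids are cached into _SCALAR_TYPES_ID_MAP
    print(t.size_bits)                     # 4: sign + 2 exponent + 1 mantissa bits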
build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils.py
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
-
|
|
|
|
|
|
|
2 |
|
3 |
import numpy
|
4 |
import torch
|
@@ -42,7 +45,9 @@ USE_FP32_REDUCE_DEFAULT = True
|
|
42 |
# without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
|
43 |
# TODO: we may want to move this into the C++ so its closer to the actual impl
|
44 |
def query_marlin_supported_quant_types(
|
45 |
-
has_zp:
|
|
|
|
|
46 |
):
|
47 |
if device_capability is None:
|
48 |
capability_tuple = torch.cuda.get_device_capability()
|
@@ -51,137 +56,141 @@ def query_marlin_supported_quant_types(
|
|
51 |
if device_capability < 80:
|
52 |
return []
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
if has_zp:
|
55 |
# AWQ style, unsigned + runtime zero-point
|
56 |
-
return [scalar_types.uint4
|
57 |
else:
|
58 |
# GPTQ style, unsigned + symmetric bias
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
62 |
|
63 |
|
64 |
def _check_marlin_supported(
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
) -> Tuple[bool, Optional[str]]:
|
70 |
|
71 |
if device_capability is None:
|
72 |
capability_tuple = torch.cuda.get_device_capability()
|
73 |
device_capability = capability_tuple[0] * 10 + capability_tuple[1]
|
74 |
|
75 |
-
supported_types = query_marlin_supported_quant_types(
|
|
|
76 |
|
77 |
if quant_type not in supported_types:
|
78 |
-
return (
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
return (
|
87 |
-
False,
|
88 |
-
f"Marlin does not support group_size = {group_size}. "
|
89 |
-
f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
|
90 |
-
"are supported.",
|
91 |
-
)
|
92 |
|
93 |
return True, None
|
94 |
|
95 |
|
96 |
-
def check_marlin_supported(
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
)
|
102 |
-
cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, device_capability)
|
103 |
return cond
|
104 |
|
105 |
|
106 |
-
def verify_marlin_supported(
|
107 |
-
|
108 |
-
) -> None:
|
109 |
cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
|
110 |
if not cond:
|
111 |
assert err_msg is not None
|
112 |
raise ValueError(err_msg)
|
113 |
|
114 |
|
115 |
-
def verify_marlin_supports_shape(
|
116 |
-
|
117 |
-
|
118 |
-
input_size: int,
|
119 |
-
group_size: int,
|
120 |
-
) -> None:
|
121 |
|
122 |
# Validate output_size_per_partition
|
123 |
if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
|
124 |
-
raise ValueError(
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
"with --quantization gptq."
|
130 |
-
)
|
131 |
|
132 |
# Validate input_size_per_partition
|
133 |
if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
|
134 |
-
raise ValueError(
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
if group_size < input_size and input_size_per_partition % group_size != 0:
|
143 |
raise ValueError(
|
144 |
f"Weight input_size_per_partition = {input_size_per_partition}"
|
145 |
-
f" is not divisible by group_size = {group_size}."
|
146 |
"Consider reducing tensor_parallel_size or running "
|
147 |
-
"with --quantization gptq."
|
148 |
-
)
|
149 |
|
150 |
|
151 |
-
def check_marlin_supports_shape(
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
group_size: int,
|
156 |
-
) -> Tuple[bool, Optional[str]]:
|
157 |
try:
|
158 |
-
verify_marlin_supports_shape(
|
159 |
-
|
160 |
-
|
161 |
except ValueError as e:
|
162 |
return False, e.__str__()
|
163 |
return True, None
|
164 |
|
165 |
|
166 |
-
def marlin_make_workspace(
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
|
171 |
-
) * GPTQ_MARLIN_MAX_PARALLEL
|
172 |
|
173 |
-
return torch.zeros(
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
|
177 |
|
178 |
def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
|
179 |
return (not act_order) or (act_order and not is_row_parallel)
|
180 |
|
181 |
|
182 |
-
def marlin_repeat_scales_on_all_ranks(
|
183 |
-
|
184 |
-
) -> bool:
|
185 |
# Need to repeat scales on every rank if act_ordering or
|
186 |
# channelwise and RowParallelLinear
|
187 |
is_channelwise = group_size == -1
|
@@ -189,35 +198,34 @@ def marlin_repeat_scales_on_all_ranks(
|
|
189 |
|
190 |
|
191 |
def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
|
192 |
-
return torch.nn.Parameter(
|
193 |
-
|
194 |
-
)
|
195 |
|
196 |
|
197 |
def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
|
198 |
-
return torch.nn.Parameter(
|
199 |
-
|
200 |
-
)
|
201 |
|
202 |
|
203 |
-
def marlin_sort_g_idx(
|
|
|
204 |
g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
|
205 |
return g_idx[g_idx_sort_indices], g_idx_sort_indices
|
206 |
|
207 |
|
208 |
def get_scale_perms():
|
209 |
-
scale_perm:
|
210 |
for i in range(8):
|
211 |
scale_perm.extend([i + 8 * j for j in range(8)])
|
212 |
-
scale_perm_single:
|
213 |
for i in range(4):
|
214 |
-
scale_perm_single.extend(
|
|
|
215 |
return scale_perm, scale_perm_single
|
216 |
|
217 |
|
218 |
-
def marlin_permute_scales(
|
219 |
-
|
220 |
-
) -> torch.Tensor:
|
221 |
|
222 |
scale_perm, scale_perm_single = get_scale_perms()
|
223 |
if group_size < size_k and group_size != -1:
|
@@ -247,9 +255,8 @@ def marlin_moe_permute_scales(
|
|
247 |
return output
|
248 |
|
249 |
|
250 |
-
def marlin_zero_points(
|
251 |
-
|
252 |
-
) -> torch.Tensor:
|
253 |
# Permute zero-points in a similar way to scales, but do not use the
|
254 |
# "single" permutation, since zero-points are applied on every MMA
|
255 |
scale_perm, _ = get_scale_perms()
|
@@ -270,9 +277,8 @@ def marlin_zero_points(
|
|
270 |
return zp
|
271 |
|
272 |
|
273 |
-
def awq_to_marlin_zero_points(
|
274 |
-
|
275 |
-
) -> torch.Tensor:
|
276 |
# AWQ zero-points are quantized and packed on the column dim.
|
277 |
# In addition, the values are permuted based on dequantizer.
|
278 |
# Here we undo both of these, and then apply marlin permutation
|
@@ -294,9 +300,8 @@ def awq_to_marlin_zero_points(
|
|
294 |
return marlin_zp
|
295 |
|
296 |
|
297 |
-
def moe_awq_to_marlin_zero_points(
|
298 |
-
|
299 |
-
):
|
300 |
num_experts = q_zp_packed.shape[0]
|
301 |
output = torch.empty(
|
302 |
(num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
|
@@ -304,45 +309,97 @@ def moe_awq_to_marlin_zero_points(
|
|
304 |
dtype=q_zp_packed.dtype,
|
305 |
)
|
306 |
for e in range(num_experts):
|
307 |
-
output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
|
|
|
308 |
return output
|
309 |
|
310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
def apply_gptq_marlin_linear(
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
) -> torch.Tensor:
|
326 |
reshaped_x = input.reshape(-1, input.shape[-1])
|
327 |
-
out_shape = input.shape[:-1] + (output_size_per_partition,)
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
|
347 |
if bias is not None:
|
348 |
output.add_(bias) # In-place add
|
@@ -351,39 +408,43 @@ def apply_gptq_marlin_linear(
|
|
351 |
|
352 |
|
353 |
def apply_awq_marlin_linear(
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
) -> torch.Tensor:
|
367 |
reshaped_x = input.reshape(-1, input.shape[-1])
|
368 |
-
out_shape = input.shape[:-1] + (output_size_per_partition,)
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
|
|
|
|
|
|
|
|
|
|
387 |
|
388 |
if bias is not None:
|
389 |
output.add_(bias) # In-place add
|
|
|
1 |
+
# SPDX-License-Identifier: Apache-2.0
|
2 |
+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
3 |
+
|
4 |
+
from typing import Optional
|
5 |
|
6 |
import numpy
|
7 |
import torch
|
|
|
45 |
# without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
|
46 |
# TODO: we may want to move this into the C++ so its closer to the actual impl
|
47 |
def query_marlin_supported_quant_types(
|
48 |
+
has_zp: Optional[bool] = None,
|
49 |
+
include_fp_type: bool = True,
|
50 |
+
device_capability: Optional[int] = None,
|
51 |
):
|
52 |
if device_capability is None:
|
53 |
capability_tuple = torch.cuda.get_device_capability()
|
|
|
56 |
if device_capability < 80:
|
57 |
return []
|
58 |
|
59 |
+
# - has_zp is True: return quant_types that has zero points
|
60 |
+
# - has_zp is False: return quant_types that has not zero points
|
61 |
+
# - has_zp is None: both
|
62 |
+
if has_zp is None:
|
63 |
+
types0 = query_marlin_supported_quant_types(False, include_fp_type,
|
64 |
+
device_capability)
|
65 |
+
types1 = query_marlin_supported_quant_types(True, include_fp_type,
|
66 |
+
device_capability)
|
67 |
+
return types0 + types1
|
68 |
+
|
69 |
if has_zp:
|
70 |
# AWQ style, unsigned + runtime zero-point
|
71 |
+
return [scalar_types.uint4]
|
72 |
else:
|
73 |
# GPTQ style, unsigned + symmetric bias
|
74 |
+
res = [scalar_types.uint4b8, scalar_types.uint8b128]
|
75 |
+
if include_fp_type:
|
76 |
+
res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]
|
77 |
+
return res
|
78 |
|
79 |
|
80 |
def _check_marlin_supported(
|
81 |
+
quant_type: ScalarType,
|
82 |
+
group_size: Optional[int],
|
83 |
+
has_zp: bool,
|
84 |
+
device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]:
|
|
|
85 |
|
86 |
if device_capability is None:
|
87 |
capability_tuple = torch.cuda.get_device_capability()
|
88 |
device_capability = capability_tuple[0] * 10 + capability_tuple[1]
|
89 |
|
90 |
+
supported_types = query_marlin_supported_quant_types(
|
91 |
+
has_zp, True, device_capability)
|
92 |
|
93 |
if quant_type not in supported_types:
|
94 |
+
return (False, f"Marlin does not support weight_bits = {quant_type}. "
|
95 |
+
f"Only types = {supported_types} "
|
96 |
+
f"are supported (for group_size = {group_size}, "
|
97 |
+
f"device_capability = {device_capability}, zp = {has_zp}).")
|
98 |
+
if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES):
|
99 |
+
return (False, f"Marlin does not support group_size = {group_size}. "
|
100 |
+
f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
|
101 |
+
"are supported.")
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
return True, None
|
104 |
|
105 |
|
106 |
+
def check_marlin_supported(quant_type: ScalarType,
|
107 |
+
group_size: int,
|
108 |
+
has_zp: bool = False,
|
109 |
+
device_capability: Optional[int] = None) -> bool:
|
110 |
+
cond, _ = _check_marlin_supported(quant_type, group_size, has_zp,
|
111 |
+
device_capability)
|
|
|
112 |
return cond
|
113 |
|
114 |
|
115 |
+
def verify_marlin_supported(quant_type: ScalarType,
|
116 |
+
group_size: int,
|
117 |
+
has_zp: bool = False) -> None:
|
118 |
cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
|
119 |
if not cond:
|
120 |
assert err_msg is not None
|
121 |
raise ValueError(err_msg)


def verify_marlin_supports_shape(output_size_per_partition: int,
                                 input_size_per_partition: int,
                                 input_size: int, group_size: int) -> None:

    # Validate output_size_per_partition
    if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
        raise ValueError(f"Weight output_size_per_partition = "
                         f"{output_size_per_partition} is not divisible by "
                         f"min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
                         "Consider reducing tensor_parallel_size or running "
                         "with --quantization gptq.")

    # Validate input_size_per_partition
    if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
        raise ValueError(f"Weight input_size_per_partition = "
                         f"{input_size_per_partition} is not divisible "
                         f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
                         "Consider reducing tensor_parallel_size or running "
                         "with --quantization gptq.")

    if (group_size < input_size
            and input_size_per_partition % group_size != 0):
        raise ValueError(
            f"Weight input_size_per_partition = {input_size_per_partition}"
            f" is not divisible by group_size = {group_size}. "
            "Consider reducing tensor_parallel_size or running "
            "with --quantization gptq.")


def check_marlin_supports_shape(output_size_per_partition: int,
                                input_size_per_partition: int,
                                input_size: int, group_size: int) \
        -> tuple[bool, Optional[str]]:
    try:
        verify_marlin_supports_shape(output_size_per_partition,
                                     input_size_per_partition, input_size,
                                     group_size)
    except ValueError as e:
        return False, e.__str__()
    return True, None


def marlin_make_workspace(output_size_per_partition: int,
                          device: torch.device) -> torch.Tensor:
    max_workspace_size = (output_size_per_partition //
                          GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL

    return torch.zeros(max_workspace_size,
                       dtype=torch.int,
                       device=device,
                       requires_grad=False)


def marlin_make_workspace_new(device: torch.device,
                              max_blocks_per_sm: int = 1) -> torch.Tensor:
    # In the new marlin kernel, we use the number of threadblocks as the
    # workspace size; the number of threadblocks is sms_count * max_blocks_per_sm.
    sms = torch.cuda.get_device_properties(device).multi_processor_count
    return torch.zeros(sms * max_blocks_per_sm,
                       dtype=torch.int,
                       device=device,
                       requires_grad=False)
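

# Illustrative sketch (not part of the shipped module): how the two workspace
# helpers above size their buffers. The constant values and the SM count below
# are assumed stand-ins, not the GPTQ_MARLIN_* constants defined in this file.
def _workspace_sizing_example() -> None:
    min_thread_n, max_parallel = 64, 16        # assumed example values
    output_size_per_partition = 4096
    sms, max_blocks_per_sm = 108, 1            # e.g. an A100-class GPU
    legacy_size = (output_size_per_partition // min_thread_n) * max_parallel
    new_size = sms * max_blocks_per_sm
    print(legacy_size, new_size)               # 1024 vs. 108 int entries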


def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
    return (not act_order) or (act_order and not is_row_parallel)


def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
                                      is_row_parallel: bool) -> bool:
    # Scales need to be repeated on every rank if act_order is used, or if
    # the weights are channelwise-quantized and the layer is RowParallelLinear.
    is_channelwise = group_size == -1
    ...


def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
                              requires_grad=False)


def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
                              requires_grad=False)


def marlin_sort_g_idx(
        g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
    return g_idx[g_idx_sort_indices], g_idx_sort_indices


def get_scale_perms():
    scale_perm: list[int] = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    scale_perm_single: list[int] = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return scale_perm, scale_perm_single


def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
                          group_size: int) -> torch.Tensor:

    scale_perm, scale_perm_single = get_scale_perms()
    if group_size < size_k and group_size != -1:
        ...
    return output
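

# Illustrative check (not part of the shipped module): marlin_sort_g_idx sorts
# the act-order group index and also returns the permutation it applied, so
# the corresponding weight rows can be shuffled consistently.
def _sort_g_idx_example() -> None:
    g_idx = torch.tensor([2, 0, 3, 1], dtype=torch.int)
    sorted_g_idx, perm = marlin_sort_g_idx(g_idx)
    assert sorted_g_idx.tolist() == [0, 1, 2, 3]
    assert perm.tolist() == [1, 3, 0, 2]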


def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
                       num_bits: int) -> torch.Tensor:
    # Permute zero-points in a similar way to scales, but do not use the
    # "single" permutation, since zero-points are applied on every MMA
    scale_perm, _ = get_scale_perms()
    ...
    return zp


def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
                              size_n: int, num_bits: int) -> torch.Tensor:
    # AWQ zero-points are quantized and packed on the column dim.
    # In addition, the values are permuted based on the dequantizer.
    # Here we undo both of these, and then apply the marlin permutation.
    ...
    return marlin_zp


def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
                                  size_n: int, num_bits: int):
    num_experts = q_zp_packed.shape[0]
    output = torch.empty(
        (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
        dtype=q_zp_packed.dtype,
    )
    for e in range(num_experts):
        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
                                              num_bits)
    return output


def maybe_warn_marlin_atomic_add(device, dtype):
    if torch.compiler.is_dynamo_compiling():
        return
    device_capability = torch.cuda.get_device_capability(device)
    if device_capability[0] < 9 and dtype == torch.bfloat16:
        logger.info_once(
            "You are running the Marlin kernel with bf16 on GPUs before SM90. "
            "You can consider changing to fp16 to achieve better performance "
            "if possible.")


def maybe_warn_marlin_atomic_add_env():
    if torch.compiler.is_dynamo_compiling():
        return
    if envs.VLLM_MARLIN_USE_ATOMIC_ADD:
        return
    logger.info_once(
        "The Marlin kernel can achieve better performance for small size_n "
        "with the experimental use_atomic_add feature. "
        "You can consider setting the environment variable "
        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible.")


def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
                                 dtype: torch.dtype) -> bool:

    # the performance of atomicAdd is better than global reduce
    # only when m*n is small and k is large
    if n >= 2048 or k < 2048 or device.type != "cuda":
        return False

    # disable atomicAdd reduce by default,
    # one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
    if not envs.VLLM_MARLIN_USE_ATOMIC_ADD:
        maybe_warn_marlin_atomic_add_env()
        return False

    # sm8x doesn't support atomicAdd + bfloat16 natively
    device_capability = torch.cuda.get_device_capability(device)
    if device_capability[0] < 9 and dtype == torch.bfloat16:
        maybe_warn_marlin_atomic_add(device, dtype)
        return False

    return True
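

# Illustrative sketch (not part of the shipped module): the checks above mean
# atomicAdd-based reduction is only used for skinny GEMMs (small n, large k)
# on CUDA devices, and only when VLLM_MARLIN_USE_ATOMIC_ADD=1 is set; bf16
# additionally requires SM90 or newer. The sizes below are made up.
def _atomic_add_decision_example(device: torch.device) -> None:
    # n >= 2048 -> always False, regardless of the environment variable
    assert not should_use_atomic_add_reduce(
        m=16, n=4096, k=8192, device=device, dtype=torch.half)
    # small n, large k -> True only if the env var (and dtype/arch) allow it
    decision = should_use_atomic_add_reduce(
        m=16, n=1024, k=8192, device=device, dtype=torch.half)
    print("use atomicAdd reduce:", decision)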


def apply_gptq_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_zp: torch.Tensor,
        g_idx: torch.Tensor,
        g_idx_sort_indices: torch.Tensor,
        workspace: torch.Tensor,
        wtype: ScalarType,
        output_size_per_partition: int,
        input_size_per_partition: int,
        is_k_full: bool,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (output_size_per_partition, )

    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
                                                  n=output_size_per_partition,
                                                  k=reshaped_x.size(1),
                                                  device=input.device,
                                                  dtype=input.dtype)

    output = ops.gptq_marlin_gemm(reshaped_x,
                                  None,
                                  weight,
                                  weight_scale,
                                  None,
                                  weight_zp,
                                  g_idx,
                                  g_idx_sort_indices,
                                  workspace,
                                  wtype,
                                  size_m=reshaped_x.shape[0],
                                  size_n=output_size_per_partition,
                                  size_k=input_size_per_partition,
                                  is_k_full=is_k_full,
                                  use_atomic_add=use_atomic_add,
                                  use_fp32_reduce=use_fp32_reduce,
                                  is_zp_float=False)

    if bias is not None:
        output.add_(bias)  # In-place add
    ...


def apply_awq_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_zp: torch.Tensor,
        g_idx: torch.Tensor,
        g_idx_sort_indices: torch.Tensor,
        workspace: torch.Tensor,
        quant_type: ScalarType,
        output_size_per_partition: int,
        input_size_per_partition: int,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (output_size_per_partition, )

    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
                                                  n=output_size_per_partition,
                                                  k=reshaped_x.size(1),
                                                  device=input.device,
                                                  dtype=input.dtype)

    output = ops.gptq_marlin_gemm(reshaped_x,
                                  None,
                                  weight,
                                  weight_scale,
                                  None,
                                  weight_zp,
                                  g_idx,
                                  g_idx_sort_indices,
                                  workspace,
                                  quant_type,
                                  size_m=reshaped_x.shape[0],
                                  size_n=output_size_per_partition,
                                  size_k=input_size_per_partition,
                                  use_atomic_add=use_atomic_add,
                                  use_fp32_reduce=use_fp32_reduce,
                                  is_zp_float=False)

    if bias is not None:
        output.add_(bias)  # In-place add
build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils_fp4.py
ADDED
@@ -0,0 +1,282 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import torch

import quantization as ops

from .marlin_utils import (
    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
    should_use_atomic_add_reduce)
from quantization.scalar_type import scalar_types

FP4_MARLIN_SUPPORTED_GROUP_SIZES = [16]


def is_fp4_marlin_supported():
    capability = torch.cuda.get_device_capability()
    capability = capability[0] * 10 + capability[1]
    return capability >= 80


def fp4_marlin_process_scales(marlin_scales):
    if not (marlin_scales >= 0).all():
        logger.warning_once(
            "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
            "negative scales. Accuracy will likely be degraded. This is "
            "because it changes the scales from FP8-S1E4M3 to a special "
            "FP8-S0E5M3 format to speedup the dequantization.")

    # convert to half first, we will convert to fp8 later
    marlin_scales = marlin_scales.to(torch.half)

    # 8 is the number of scales used by one thread
    marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
    marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
        marlin_scales.size(0) * 2, -1)

    # fit the layout of fp8 dequantization
    marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
        marlin_scales.size(0), -1)

    # We assume that weight_scale (FP8-S1E4M3) is always greater
    # than or equal to 0. So we can convert
    # weight_scale * (2 ** 7) to a special FP8-S0E5M3 format.
    # After multiplying by 2 ** 7, the top bit of FP8-S0E5M3 would always be 1
    # when weight_scale > 0. This allows us to have an exponent bias
    # closer to zero after dequantization.

    marlin_scales = (marlin_scales * (2**7)).view(torch.int16) << 1
    marlin_scales = marlin_scales.view(torch.float8_e4m3fn)
    marlin_scales = marlin_scales[:, 1::2].contiguous()

    return marlin_scales


def fp4_marlin_process_global_scale(global_scale):
    assert global_scale.dtype in [torch.half, torch.bfloat16]
    fp4_exponent = 2
    if global_scale.dtype == torch.half:
        target_exponent = 5
    elif global_scale.dtype == torch.bfloat16:
        target_exponent = 8
    # exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
    # exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
    exponent_bias = 2**(target_exponent - 1) - 2**(fp4_exponent - 1)
    return global_scale * (2.0**(exponent_bias - 7))
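

# Illustrative check (not part of the shipped file): for fp16 the folding above
# amounts to multiplying the FP4 global scale by 2**(14 - 7) = 128 (and by
# 2**(126 - 7) for bf16), so the kernel can skip the exponent-bias adjustment
# at dequantization time.
def _fp4_global_scale_example() -> None:
    gs = torch.tensor([1.0], dtype=torch.half)
    assert torch.equal(fp4_marlin_process_global_scale(gs), gs * 2.0**7)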


def apply_fp4_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_scale_2: torch.Tensor,
        workspace: torch.Tensor,
        size_n: int,
        size_k: int,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    # For GPUs that lack FP4 hardware support, we can leverage the
    # Marlin kernel for fast weight-only FP4 quantization

    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (size_n, )

    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
                                                  n=size_n,
                                                  k=size_k,
                                                  device=input.device,
                                                  dtype=input.dtype)

    output = ops.gptq_marlin_gemm(a=reshaped_x,
                                  c=None,
                                  b_q_weight=weight,
                                  b_scales=weight_scale,
                                  global_scale=weight_scale_2,
                                  b_zeros=None,
                                  g_idx=None,
                                  perm=None,
                                  workspace=workspace,
                                  b_q_type=scalar_types.float4_e2m1f,
                                  size_m=reshaped_x.size(0),
                                  size_n=size_n,
                                  size_k=size_k,
                                  use_atomic_add=use_atomic_add,
                                  use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        output.add_(bias)  # In-place add

    return output.reshape(out_shape)


def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    part_size_n = layer.output_size_per_partition
    part_size_k = layer.input_size_per_partition
    param_dtype = layer.params_dtype

    assert layer.weight.shape == (part_size_n, part_size_k // 2)

    device = layer.weight.device

    # WORKSPACE
    layer.workspace = marlin_make_workspace_new(device)

    # WEIGHT
    # Repack weights to marlin format
    perm = torch.empty(0, dtype=torch.int, device=device)
    qweight = layer.weight.view(torch.int32).T.contiguous()

    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                            perm=perm,
                                            size_k=part_size_k,
                                            size_n=part_size_n,
                                            num_bits=4)
    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)

    # WEIGHT SCALES
    # Permute scales
    weight_scale = layer.weight_scale.T.to(param_dtype)
    weight_scale = marlin_permute_scales(s=weight_scale,
                                         size_k=part_size_k,
                                         size_n=part_size_n,
                                         group_size=16)
    weight_scale = fp4_marlin_process_scales(weight_scale)
    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)

    weight_scale_2 = layer.weight_scale_2.to(param_dtype)
    weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
    layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
                                              requires_grad=False)

    return
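

# Illustrative sketch (not part of the shipped file): the per-partition shape
# contract assumed by the helpers above. The concrete sizes are made up and the
# scale layout is inferred from the asserts and transposes in this file.
def _fp4_shape_contract_example() -> None:
    size_n, size_k = 256, 512
    packed_weight_shape = (size_n, size_k // 2)    # two FP4 values per byte
    weight_scale_shape = (size_n, size_k // 16)    # one FP8 scale per 16-group
    # apply_fp4_marlin_linear maps input[..., size_k] -> output[..., size_n]
    print(packed_weight_shape, weight_scale_shape)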


def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    e = layer.num_experts
    k = layer.hidden_size
    n = layer.intermediate_size_per_partition

    # WORKSPACE
    device = layer.w13_weight.device
    param_dtype = layer.params_dtype
    layer.workspace = marlin_make_workspace_new(device, 4)
    perm = torch.empty(0, dtype=torch.int, device=device)

    # WEIGHT
    # Repack weights to marlin format
    for name in ["w13_weight", "w2_weight"]:
        weight = getattr(layer, name)
        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        assert weight.shape == (e, size_n, size_k // 2)

        for i in range(e):
            qweight = weight[i].view(torch.int32).T.contiguous()

            marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                                    perm=perm,
                                                    size_k=size_k,
                                                    size_n=size_n,
                                                    num_bits=4)
            tensor_list.append(marlin_qweight)

        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        weight = torch.nn.Parameter(weight, requires_grad=False)

        setattr(layer, name, weight)

    # WEIGHT SCALES
    # Permute scales
    for name in ["w13", "w2"]:
        scales = getattr(layer, name + "_weight_scale").to(param_dtype)
        global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)

        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        for i in range(e):
            marlin_scales = marlin_permute_scales(s=scales[i].T,
                                                  size_k=size_k,
                                                  size_n=size_n,
                                                  group_size=16)
            marlin_scales = fp4_marlin_process_scales(marlin_scales)
            tensor_list.append(marlin_scales)

        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        scales = torch.nn.Parameter(scales, requires_grad=False)
        setattr(layer, name + "_weight_scale", scales)

        global_scale = fp4_marlin_process_global_scale(global_scale)
        global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
        setattr(layer, name + "_weight_scale_2", global_scale)


def rand_marlin_weight_fp4_like(weight, group_size):
    assert group_size > 0
    size_n, size_k = weight.shape
    device = weight.device

    scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 6
    global_scale = scales.max() / 448
    scales = (scales / global_scale).to(torch.float8_e4m3fn)

    fp4_weight = torch.randint(0,
                               256, (size_n, size_k // 2),
                               dtype=torch.uint8,
                               device=weight.device)
    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
                         ((fp4_weight & 0b01110000) >> 2))
    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)

    fp4_weight2 = fp4_weight << 4
    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
                         ((fp4_weight2 & 0b01110000) >> 2))
    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)

    weight_ref = torch.cat(
        [fp4_weight_part_2.unsqueeze(2),
         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
    weight_ref = weight_ref * global_scale.to(weight.dtype) * \
        scales.repeat_interleave(group_size, 1).to(weight.dtype)

    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
        perm=torch.empty(0, dtype=torch.int, device=device),
        size_k=size_k,
        size_n=size_n,
        num_bits=4,
    )

    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
                                          size_k=size_k,
                                          size_n=size_n,
                                          group_size=group_size)
    marlin_scales = fp4_marlin_process_scales(marlin_scales)

    global_scale = fp4_marlin_process_global_scale(global_scale)

    return weight_ref.T, marlin_qweight, marlin_scales, global_scale
build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils_fp8.py
CHANGED
@@ -1,10 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 from typing import Optional
 
 import torch
 
 import quantization as ops
 
-from .marlin_utils import marlin_make_workspace, marlin_permute_scales
+from .marlin_utils import USE_FP32_REDUCE_DEFAULT, marlin_make_workspace, marlin_permute_scales
 
 
 def is_fp8_marlin_supported():
@@ -13,88 +16,107 @@ def is_fp8_marlin_supported():
     return capability >= 80
 
 
+def fp8_fused_exponent_bias_into_scales(scales):
+    fp8_exponent = 4
+    if scales.dtype == torch.half:
+        target_exponent = 5
+    elif scales.dtype == torch.bfloat16:
+        target_exponent = 8
+    # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
+    # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
+    exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
+    s = torch.ones_like(scales) * 2
+    s = s**exponent_bias
+    return scales * s
+
+
 def apply_fp8_marlin_linear(
-    ) -> torch.Tensor:
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        workspace: torch.Tensor,
+        size_n: int,
+        size_k: int,
+        bias: Optional[torch.Tensor],
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     # For GPUs that lack FP8 hardware support, we can leverage the
     # Marlin kernel for fast weight-only FP8 quantization
 
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (size_n,)
+    out_shape = input.shape[:-1] + (size_n, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=size_n,
+                                                  k=size_k,
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(a=reshaped_x,
+                                  c=None,
+                                  b_q_weight=weight,
+                                  b_scales=weight_scale,
+                                  global_scale=None,
+                                  b_zeros=None,
+                                  g_idx=None,
+                                  perm=None,
+                                  workspace=workspace,
+                                  b_q_type=scalar_types.float8_e4m3fn,
+                                  size_m=reshaped_x.size(0),
+                                  size_n=size_n,
+                                  size_k=size_k,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce)
 
     if bias is not None:
         output.add_(bias)  # In-place add
 
     return output.reshape(out_shape)
 
-    layer: torch.nn.Module, strategy: str = "tensor"
-) -> None:
-    part_size_n = layer.output_size_per_partition
-    part_size_k = layer.input_size_per_partition
-
-    device = layer.weight.device
-
-    # WORKSPACE
-    layer.workspace = marlin_make_workspace(part_size_n, device)
-
-    # WEIGHT
-    # Repack weights to marlin format
-    marlin_qweight = ops.gptq_marlin_repack(
-        b_q_weight=pack_fp8_to_int32(layer.weight),
-        perm=torch.empty(0, dtype=torch.int, device=device),
-        size_k=part_size_k,
-        size_n=part_size_n,
-        num_bits=8,
-    )
-    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
-
-    # WEIGHT SCALES
-    scales = layer.weight_scale.to(layer.orig_dtype)
-    # Permute scales
-    marlin_scales = marlin_permute_scales(
-        s=scales, size_k=part_size_k, size_n=part_size_n, group_size=-1
-    )
-    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
-
-
-def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
+def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
+                      size_k_first: bool = True) -> torch.Tensor:
     """
     Repack FP8 weights to gptq format (packed int32 elements)
     """
     assert fp8_tensor.dtype == torch.float8_e4m3fn
-    assert fp8_tensor.
-    packed = (
-        byte_tensor[:, 0].to(torch.int32)
-        | (byte_tensor[:, 1].to(torch.int32) << 8)
-        | (byte_tensor[:, 2].to(torch.int32) << 16)
-        | (byte_tensor[:, 3].to(torch.int32) << 24)
-    )
-    return
+    assert fp8_tensor.ndim == 2
+
+    fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
+    fp8_tensor = fp8_tensor.contiguous()
+    # fp8_tensor is contiguous and has shape (N, K) now
+    # with `.view(torch.int32)`, it becomes (N, K // 4)
+    int32_tensor = fp8_tensor.view(torch.int32)
+    return int32_tensor.T.contiguous() if size_k_first else int32_tensor
+
+
+def marlin_quant_fp8_torch(weight, group_size):
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    if group_size != -1:
+        scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(group_size, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+    else:
+        scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(size_k, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+
+    packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=packed_weight,
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=8,
+    )
+
+    marlin_scales = marlin_permute_scales(s=scales.T,
+                                          size_k=size_k,
+                                          size_n=size_n,
+                                          group_size=group_size)
+
+    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
+
+    return weight_ref.T, marlin_qweight, marlin_scales