danieldk HF Staff committed 7 days ago

Commit

82ffd1f

1 Parent(s): dfa7d18

Build (x86_64)

Browse files

Files changed (50) hide show

build/torch26-cxx11-cu118-x86_64-linux/quantization/__init__.py +9 -0
build/torch26-cxx11-cu118-x86_64-linux/quantization/_ops.py +3 -3
build/torch26-cxx11-cu118-x86_64-linux/quantization/{_quantization_e8730d8_dirty.abi3.so → _quantization_dfa7d18.abi3.so} +2 -2
build/torch26-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py +3 -1
build/torch26-cxx11-cu118-x86_64-linux/quantization/platforms.py +35 -0
build/torch26-cxx11-cu124-x86_64-linux/quantization/__init__.py +9 -0
build/torch26-cxx11-cu124-x86_64-linux/quantization/_ops.py +3 -3
build/torch26-cxx11-cu124-x86_64-linux/quantization/{_quantization_e8730d8_dirty.abi3.so → _quantization_dfa7d18.abi3.so} +2 -2
build/torch26-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py +3 -1
build/torch26-cxx11-cu124-x86_64-linux/quantization/platforms.py +35 -0
build/torch26-cxx11-cu126-x86_64-linux/quantization/__init__.py +9 -0
build/torch26-cxx11-cu126-x86_64-linux/quantization/_ops.py +3 -3
build/torch26-cxx11-cu126-x86_64-linux/quantization/{_quantization_e8730d8_dirty.abi3.so → _quantization_dfa7d18.abi3.so} +2 -2
build/torch26-cxx11-cu126-x86_64-linux/quantization/compressed_tensors.py +3 -1
build/torch26-cxx11-cu126-x86_64-linux/quantization/platforms.py +35 -0
build/torch26-cxx98-cu118-x86_64-linux/quantization/__init__.py +9 -0
build/torch26-cxx98-cu118-x86_64-linux/quantization/_ops.py +3 -3
build/torch26-cxx98-cu118-x86_64-linux/quantization/{_quantization_e8730d8_dirty.abi3.so → _quantization_dfa7d18.abi3.so} +2 -2
build/torch26-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py +3 -1
build/torch26-cxx98-cu118-x86_64-linux/quantization/platforms.py +35 -0
build/torch26-cxx98-cu124-x86_64-linux/quantization/__init__.py +9 -0
build/torch26-cxx98-cu124-x86_64-linux/quantization/_ops.py +3 -3
build/torch26-cxx98-cu124-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so +3 -0
build/torch26-cxx98-cu124-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so +0 -3
build/torch26-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py +3 -1
build/torch26-cxx98-cu124-x86_64-linux/quantization/platforms.py +35 -0
build/torch26-cxx98-cu126-x86_64-linux/quantization/__init__.py +9 -0
build/torch26-cxx98-cu126-x86_64-linux/quantization/_ops.py +3 -3
build/torch26-cxx98-cu126-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so +3 -0
build/torch26-cxx98-cu126-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so +0 -3
build/torch26-cxx98-cu126-x86_64-linux/quantization/compressed_tensors.py +3 -1
build/torch26-cxx98-cu126-x86_64-linux/quantization/platforms.py +35 -0
build/torch27-cxx11-cu118-x86_64-linux/quantization/__init__.py +9 -0
build/torch27-cxx11-cu118-x86_64-linux/quantization/_ops.py +3 -3
build/torch27-cxx11-cu118-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so +3 -0
build/torch27-cxx11-cu118-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so +0 -3
build/torch27-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py +3 -1
build/torch27-cxx11-cu118-x86_64-linux/quantization/platforms.py +35 -0
build/torch27-cxx11-cu126-x86_64-linux/quantization/__init__.py +9 -0
build/torch27-cxx11-cu126-x86_64-linux/quantization/_ops.py +3 -3
build/torch27-cxx11-cu126-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so +3 -0
build/torch27-cxx11-cu126-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so +0 -3
build/torch27-cxx11-cu126-x86_64-linux/quantization/compressed_tensors.py +3 -1
build/torch27-cxx11-cu126-x86_64-linux/quantization/platforms.py +35 -0
build/torch27-cxx11-cu128-x86_64-linux/quantization/__init__.py +9 -0
build/torch27-cxx11-cu128-x86_64-linux/quantization/_ops.py +3 -3
build/torch27-cxx11-cu128-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so +3 -0
build/torch27-cxx11-cu128-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so +0 -3
build/torch27-cxx11-cu128-x86_64-linux/quantization/compressed_tensors.py +3 -1
build/torch27-cxx11-cu128-x86_64-linux/quantization/platforms.py +35 -0

build/torch26-cxx11-cu118-x86_64-linux/quantization/__init__.py CHANGED Viewed

@@ -19,6 +19,11 @@ from .scalar_type import (
 )
 from ._ops import ops
 __all__ = [
     "ScalarType",
@@ -32,7 +37,11 @@ __all__ = [
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
     "ops",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

 )
 from ._ops import ops
+from .utils import marlin_utils
+from .utils import marlin_utils_fp4
+from .utils import marlin_utils_fp8
+from .utils import quant_utils
 __all__ = [
     "ScalarType",
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
+    "marlin_utils",
+    "marlin_utils_fp4",
+    "marlin_utils_fp8",
     "ops",
+    "quant_utils",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

build/torch26-cxx11-cu118-x86_64-linux/quantization/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_e8730d8_dirty
-ops = torch.ops._quantization_e8730d8_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_e8730d8_dirty::{op_name}"

 import torch
+from . import _quantization_dfa7d18
+ops = torch.ops._quantization_dfa7d18
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_quantization_dfa7d18::{op_name}"

build/torch26-cxx11-cu118-x86_64-linux/quantization/{_quantization_e8730d8_dirty.abi3.so → _quantization_dfa7d18.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:57e3f40d3bd58464cc005538c0d5376d64d7b6051f819f34c024f5b1940afb0f
-size 155760312

 version https://git-lfs.github.com/spec/v1
+oid sha256:05b3dbcc1c3200458ec526bc95169a8b286704dbbfe93b1b5bb580d490be4f3d
+size 155751904

build/torch26-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Optional, Tuple
 import torch
 from ._ops import ops
 # fp8
 def scaled_fp8_quant(

+from typing import Optional, Union
 import torch
 from ._ops import ops
+from .platforms import current_platform
 # fp8
 def scaled_fp8_quant(

build/torch26-cxx11-cu118-x86_64-linux/quantization/platforms.py CHANGED Viewed

@@ -27,6 +27,29 @@ class DeviceCapability(NamedTuple):
 class Platform(ABC):
     simple_compile_backend: str = "inductor"
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
@@ -51,6 +74,18 @@ class CudaPlatform(Platform):
 class RocmPlatform(Platform):
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

 class Platform(ABC):
     simple_compile_backend: str = "inductor"
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
 class RocmPlatform(Platform):
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

build/torch26-cxx11-cu124-x86_64-linux/quantization/__init__.py CHANGED Viewed

@@ -19,6 +19,11 @@ from .scalar_type import (
 )
 from ._ops import ops
 __all__ = [
     "ScalarType",
@@ -32,7 +37,11 @@ __all__ = [
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
     "ops",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

 )
 from ._ops import ops
+from .utils import marlin_utils
+from .utils import marlin_utils_fp4
+from .utils import marlin_utils_fp8
+from .utils import quant_utils
 __all__ = [
     "ScalarType",
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
+    "marlin_utils",
+    "marlin_utils_fp4",
+    "marlin_utils_fp8",
     "ops",
+    "quant_utils",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

build/torch26-cxx11-cu124-x86_64-linux/quantization/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_e8730d8_dirty
-ops = torch.ops._quantization_e8730d8_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_e8730d8_dirty::{op_name}"

 import torch
+from . import _quantization_dfa7d18
+ops = torch.ops._quantization_dfa7d18
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_quantization_dfa7d18::{op_name}"

build/torch26-cxx11-cu124-x86_64-linux/quantization/{_quantization_e8730d8_dirty.abi3.so → _quantization_dfa7d18.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b9f064f096a8814f4e1441a58dd10979e111c6dce94fbf11a381a0463150577a
-size 159574104

 version https://git-lfs.github.com/spec/v1
+oid sha256:b5dc49e9b5709f18d3e12ab2d76e37743c31cb2602d219e80173a9c5c0ba1acd
+size 159574040

build/torch26-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Optional, Tuple
 import torch
 from ._ops import ops
 # fp8
 def scaled_fp8_quant(

+from typing import Optional, Union
 import torch
 from ._ops import ops
+from .platforms import current_platform
 # fp8
 def scaled_fp8_quant(

build/torch26-cxx11-cu124-x86_64-linux/quantization/platforms.py CHANGED Viewed

@@ -27,6 +27,29 @@ class DeviceCapability(NamedTuple):
 class Platform(ABC):
     simple_compile_backend: str = "inductor"
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
@@ -51,6 +74,18 @@ class CudaPlatform(Platform):
 class RocmPlatform(Platform):
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

 class Platform(ABC):
     simple_compile_backend: str = "inductor"
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
 class RocmPlatform(Platform):
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

build/torch26-cxx11-cu126-x86_64-linux/quantization/__init__.py CHANGED Viewed

@@ -19,6 +19,11 @@ from .scalar_type import (
 )
 from ._ops import ops
 __all__ = [
     "ScalarType",
@@ -32,7 +37,11 @@ __all__ = [
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
     "ops",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

 )
 from ._ops import ops
+from .utils import marlin_utils
+from .utils import marlin_utils_fp4
+from .utils import marlin_utils_fp8
+from .utils import quant_utils
 __all__ = [
     "ScalarType",
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
+    "marlin_utils",
+    "marlin_utils_fp4",
+    "marlin_utils_fp8",
     "ops",
+    "quant_utils",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

build/torch26-cxx11-cu126-x86_64-linux/quantization/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_e8730d8_dirty
-ops = torch.ops._quantization_e8730d8_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_e8730d8_dirty::{op_name}"

 import torch
+from . import _quantization_dfa7d18
+ops = torch.ops._quantization_dfa7d18
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_quantization_dfa7d18::{op_name}"

build/torch26-cxx11-cu126-x86_64-linux/quantization/{_quantization_e8730d8_dirty.abi3.so → _quantization_dfa7d18.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1eb61783122b0f53ed36db827c8ea9cf1da094f22ca0433c575d18993565531f
-size 160276600

 version https://git-lfs.github.com/spec/v1
+oid sha256:af7fad3054f0981d175aa7dcabf9dbe3c556ba0dcee7f20a2c104abd17dce7a5
+size 160280624

build/torch26-cxx11-cu126-x86_64-linux/quantization/compressed_tensors.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Optional, Tuple
 import torch
 from ._ops import ops
 # fp8
 def scaled_fp8_quant(

+from typing import Optional, Union
 import torch
 from ._ops import ops
+from .platforms import current_platform
 # fp8
 def scaled_fp8_quant(

build/torch26-cxx11-cu126-x86_64-linux/quantization/platforms.py CHANGED Viewed

@@ -27,6 +27,29 @@ class DeviceCapability(NamedTuple):
 class Platform(ABC):
     simple_compile_backend: str = "inductor"
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
@@ -51,6 +74,18 @@ class CudaPlatform(Platform):
 class RocmPlatform(Platform):
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

 class Platform(ABC):
     simple_compile_backend: str = "inductor"
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
 class RocmPlatform(Platform):
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

build/torch26-cxx98-cu118-x86_64-linux/quantization/__init__.py CHANGED Viewed

@@ -19,6 +19,11 @@ from .scalar_type import (
 )
 from ._ops import ops
 __all__ = [
     "ScalarType",
@@ -32,7 +37,11 @@ __all__ = [
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
     "ops",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

 )
 from ._ops import ops
+from .utils import marlin_utils
+from .utils import marlin_utils_fp4
+from .utils import marlin_utils_fp8
+from .utils import quant_utils
 __all__ = [
     "ScalarType",
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
+    "marlin_utils",
+    "marlin_utils_fp4",
+    "marlin_utils_fp8",
     "ops",
+    "quant_utils",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

build/torch26-cxx98-cu118-x86_64-linux/quantization/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_e8730d8_dirty
-ops = torch.ops._quantization_e8730d8_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_e8730d8_dirty::{op_name}"

 import torch
+from . import _quantization_dfa7d18
+ops = torch.ops._quantization_dfa7d18
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_quantization_dfa7d18::{op_name}"

build/torch26-cxx98-cu118-x86_64-linux/quantization/{_quantization_e8730d8_dirty.abi3.so → _quantization_dfa7d18.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e6c2a3029d72467b8bf2fbfcf8e999683e58ab0a1c0eb4bc5fda1d92cfcc179d
-size 155740048

 version https://git-lfs.github.com/spec/v1
+oid sha256:9736b3b73f06d4fd9881fd417dbe72aa7e5e4cbc2845ca247b9427b2b7f2b858
+size 155739832

build/torch26-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Optional, Tuple
 import torch
 from ._ops import ops
 # fp8
 def scaled_fp8_quant(

+from typing import Optional, Union
 import torch
 from ._ops import ops
+from .platforms import current_platform
 # fp8
 def scaled_fp8_quant(

build/torch26-cxx98-cu118-x86_64-linux/quantization/platforms.py CHANGED Viewed

@@ -27,6 +27,29 @@ class DeviceCapability(NamedTuple):
 class Platform(ABC):
     simple_compile_backend: str = "inductor"
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
@@ -51,6 +74,18 @@ class CudaPlatform(Platform):
 class RocmPlatform(Platform):
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

 class Platform(ABC):
     simple_compile_backend: str = "inductor"
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
 class RocmPlatform(Platform):
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

build/torch26-cxx98-cu124-x86_64-linux/quantization/__init__.py CHANGED Viewed

@@ -19,6 +19,11 @@ from .scalar_type import (
 )
 from ._ops import ops
 __all__ = [
     "ScalarType",
@@ -32,7 +37,11 @@ __all__ = [
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
     "ops",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

 )
 from ._ops import ops
+from .utils import marlin_utils
+from .utils import marlin_utils_fp4
+from .utils import marlin_utils_fp8
+from .utils import quant_utils
 __all__ = [
     "ScalarType",
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
+    "marlin_utils",
+    "marlin_utils_fp4",
+    "marlin_utils_fp8",
     "ops",
+    "quant_utils",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

build/torch26-cxx98-cu124-x86_64-linux/quantization/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_e8730d8_dirty
-ops = torch.ops._quantization_e8730d8_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_e8730d8_dirty::{op_name}"

 import torch
+from . import _quantization_dfa7d18
+ops = torch.ops._quantization_dfa7d18
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_quantization_dfa7d18::{op_name}"

build/torch26-cxx98-cu124-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fa6583683394285f5d1c65f808a967b2db197831a097c638400b06a544187ba
+size 159570240

build/torch26-cxx98-cu124-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2932ba43dd1ae4848b3077dada99be0088023a56e7b36bac9e863a1977249088
-size 159578496

build/torch26-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Optional, Tuple
 import torch
 from ._ops import ops
 # fp8
 def scaled_fp8_quant(

+from typing import Optional, Union
 import torch
 from ._ops import ops
+from .platforms import current_platform
 # fp8
 def scaled_fp8_quant(

build/torch26-cxx98-cu124-x86_64-linux/quantization/platforms.py CHANGED Viewed

@@ -27,6 +27,29 @@ class DeviceCapability(NamedTuple):
 class Platform(ABC):
     simple_compile_backend: str = "inductor"
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
@@ -51,6 +74,18 @@ class CudaPlatform(Platform):
 class RocmPlatform(Platform):
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

 class Platform(ABC):
     simple_compile_backend: str = "inductor"
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
 class RocmPlatform(Platform):
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

build/torch26-cxx98-cu126-x86_64-linux/quantization/__init__.py CHANGED Viewed

@@ -19,6 +19,11 @@ from .scalar_type import (
 )
 from ._ops import ops
 __all__ = [
     "ScalarType",
@@ -32,7 +37,11 @@ __all__ = [
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
     "ops",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

 )
 from ._ops import ops
+from .utils import marlin_utils
+from .utils import marlin_utils_fp4
+from .utils import marlin_utils_fp8
+from .utils import quant_utils
 __all__ = [
     "ScalarType",
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
+    "marlin_utils",
+    "marlin_utils_fp4",
+    "marlin_utils_fp8",
     "ops",
+    "quant_utils",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

build/torch26-cxx98-cu126-x86_64-linux/quantization/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_e8730d8_dirty
-ops = torch.ops._quantization_e8730d8_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_e8730d8_dirty::{op_name}"

 import torch
+from . import _quantization_dfa7d18
+ops = torch.ops._quantization_dfa7d18
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_quantization_dfa7d18::{op_name}"

build/torch26-cxx98-cu126-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:027e39213c07a0d90a7cbd3ea7f7e7415d9a4d561e2d774ab6212512e0452007
+size 160278472

build/torch26-cxx98-cu126-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1dcf28c2d636d90cd8af8bc7b44a3b7d5f5a1a599e7e1c03b06f3800d40f5a60
-size 160274448

build/torch26-cxx98-cu126-x86_64-linux/quantization/compressed_tensors.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Optional, Tuple
 import torch
 from ._ops import ops
 # fp8
 def scaled_fp8_quant(

+from typing import Optional, Union
 import torch
 from ._ops import ops
+from .platforms import current_platform
 # fp8
 def scaled_fp8_quant(

build/torch26-cxx98-cu126-x86_64-linux/quantization/platforms.py CHANGED Viewed

@@ -27,6 +27,29 @@ class DeviceCapability(NamedTuple):
 class Platform(ABC):
     simple_compile_backend: str = "inductor"
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
@@ -51,6 +74,18 @@ class CudaPlatform(Platform):
 class RocmPlatform(Platform):
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

 class Platform(ABC):
     simple_compile_backend: str = "inductor"
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
 class RocmPlatform(Platform):
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

build/torch27-cxx11-cu118-x86_64-linux/quantization/__init__.py CHANGED Viewed

@@ -19,6 +19,11 @@ from .scalar_type import (
 )
 from ._ops import ops
 __all__ = [
     "ScalarType",
@@ -32,7 +37,11 @@ __all__ = [
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
     "ops",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

 )
 from ._ops import ops
+from .utils import marlin_utils
+from .utils import marlin_utils_fp4
+from .utils import marlin_utils_fp8
+from .utils import quant_utils
 __all__ = [
     "ScalarType",
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
+    "marlin_utils",
+    "marlin_utils_fp4",
+    "marlin_utils_fp8",
     "ops",
+    "quant_utils",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

build/torch27-cxx11-cu118-x86_64-linux/quantization/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_e8730d8_dirty
-ops = torch.ops._quantization_e8730d8_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_e8730d8_dirty::{op_name}"

 import torch
+from . import _quantization_dfa7d18
+ops = torch.ops._quantization_dfa7d18
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_quantization_dfa7d18::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18dc876a3fd8d78af10311486db850cfa1905b6d5cc308a72f44bc0704bc91e6
+size 155752576

build/torch27-cxx11-cu118-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dfeeef0e0e812038c52f838b994c631faa236af0d360246951dfc3e07ab0a461
-size 155756888

build/torch27-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Optional, Tuple
 import torch
 from ._ops import ops
 # fp8
 def scaled_fp8_quant(

+from typing import Optional, Union
 import torch
 from ._ops import ops
+from .platforms import current_platform
 # fp8
 def scaled_fp8_quant(

build/torch27-cxx11-cu118-x86_64-linux/quantization/platforms.py CHANGED Viewed

@@ -27,6 +27,29 @@ class DeviceCapability(NamedTuple):
 class Platform(ABC):
     simple_compile_backend: str = "inductor"
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
@@ -51,6 +74,18 @@ class CudaPlatform(Platform):
 class RocmPlatform(Platform):
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

 class Platform(ABC):
     simple_compile_backend: str = "inductor"
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
 class RocmPlatform(Platform):
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

build/torch27-cxx11-cu126-x86_64-linux/quantization/__init__.py CHANGED Viewed

@@ -19,6 +19,11 @@ from .scalar_type import (
 )
 from ._ops import ops
 __all__ = [
     "ScalarType",
@@ -32,7 +37,11 @@ __all__ = [
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
     "ops",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

 )
 from ._ops import ops
+from .utils import marlin_utils
+from .utils import marlin_utils_fp4
+from .utils import marlin_utils_fp8
+from .utils import quant_utils
 __all__ = [
     "ScalarType",
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
+    "marlin_utils",
+    "marlin_utils_fp4",
+    "marlin_utils_fp8",
     "ops",
+    "quant_utils",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

build/torch27-cxx11-cu126-x86_64-linux/quantization/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_e8730d8_dirty
-ops = torch.ops._quantization_e8730d8_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_e8730d8_dirty::{op_name}"

 import torch
+from . import _quantization_dfa7d18
+ops = torch.ops._quantization_dfa7d18
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_quantization_dfa7d18::{op_name}"

build/torch27-cxx11-cu126-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e49acf1fe6df71b16edbf8cafc8ba41dbbda45e569b20b867bd8404a8f34db9
+size 160284752

build/torch27-cxx11-cu126-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2d6c023d7381396997b58ff6bdaa002db2ab94a0c0eb17d09512a1a9f8e888d2
-size 160280720

build/torch27-cxx11-cu126-x86_64-linux/quantization/compressed_tensors.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Optional, Tuple
 import torch
 from ._ops import ops
 # fp8
 def scaled_fp8_quant(

+from typing import Optional, Union
 import torch
 from ._ops import ops
+from .platforms import current_platform
 # fp8
 def scaled_fp8_quant(

build/torch27-cxx11-cu126-x86_64-linux/quantization/platforms.py CHANGED Viewed

@@ -27,6 +27,29 @@ class DeviceCapability(NamedTuple):
 class Platform(ABC):
     simple_compile_backend: str = "inductor"
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
@@ -51,6 +74,18 @@ class CudaPlatform(Platform):
 class RocmPlatform(Platform):
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

 class Platform(ABC):
     simple_compile_backend: str = "inductor"
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
 class RocmPlatform(Platform):
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

build/torch27-cxx11-cu128-x86_64-linux/quantization/__init__.py CHANGED Viewed

@@ -19,6 +19,11 @@ from .scalar_type import (
 )
 from ._ops import ops
 __all__ = [
     "ScalarType",
@@ -32,7 +37,11 @@ __all__ = [
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
     "ops",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

 )
 from ._ops import ops
+from .utils import marlin_utils
+from .utils import marlin_utils_fp4
+from .utils import marlin_utils_fp8
+from .utils import quant_utils
 __all__ = [
     "ScalarType",
     "gptq_marlin_repack",
     "marlin_gemm",
     "marlin_qqq_gemm",
+    "marlin_utils",
+    "marlin_utils_fp4",
+    "marlin_utils_fp8",
     "ops",
+    "quant_utils",
     "scalar_types",
     "scaled_fp8_quant",
     "scaled_int8_quant",

build/torch27-cxx11-cu128-x86_64-linux/quantization/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_e8730d8_dirty
-ops = torch.ops._quantization_e8730d8_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_e8730d8_dirty::{op_name}"

 import torch
+from . import _quantization_dfa7d18
+ops = torch.ops._quantization_dfa7d18
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_quantization_dfa7d18::{op_name}"

build/torch27-cxx11-cu128-x86_64-linux/quantization/_quantization_dfa7d18.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c5b228ee9c669189c71da56a54be02d116cb733e17139b02344423fb768a4db
+size 297102992

build/torch27-cxx11-cu128-x86_64-linux/quantization/_quantization_e8730d8_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:718b7895b3e802aee133dcdbdbfd4aafa1dfed30a7a2b08547d97ec738b29c6e
-size 297107160

build/torch27-cxx11-cu128-x86_64-linux/quantization/compressed_tensors.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Optional, Tuple
 import torch
 from ._ops import ops
 # fp8
 def scaled_fp8_quant(

+from typing import Optional, Union
 import torch
 from ._ops import ops
+from .platforms import current_platform
 # fp8
 def scaled_fp8_quant(

build/torch27-cxx11-cu128-x86_64-linux/quantization/platforms.py CHANGED Viewed

@@ -27,6 +27,29 @@ class DeviceCapability(NamedTuple):
 class Platform(ABC):
     simple_compile_backend: str = "inductor"
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
@@ -51,6 +74,18 @@ class CudaPlatform(Platform):
 class RocmPlatform(Platform):
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:

 class Platform(ABC):
     simple_compile_backend: str = "inductor"
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
     @classmethod
     @abstractmethod
     def get_device_name(cls, device_id: int = 0) -> str: ...
 class RocmPlatform(Platform):
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: