quantization / cutlass_w8a8 /scaled_mm_c3x_sm90.cu

Sync to vLLM 20250627

8aa00a3 about 1 month ago

1.45 kB

	#include "c3x/scaled_mm_helper.hpp"
	#include "c3x/scaled_mm_kernels.hpp"

	/*
	This file defines quantized GEMM operations using the CUTLASS 3.x API, for
	NVIDIA GPUs with sm90a (Hopper).
	*/

	// Host-side entry point for the sm90 (Hopper) scaled matrix multiply.
	//
	// This function contains no logic of its own: it forwards all six tensor
	// arguments, unchanged and in the same order, to dispatch_scaled_mm and
	// additionally hands it the three sm90 kernel launchers it may pick from:
	//   - vllm::cutlass_scaled_mm_sm90_fp8
	//   - vllm::cutlass_scaled_mm_sm90_int8
	//   - vllm::cutlass_scaled_mm_blockwise_sm90_fp8
	//
	// Parameters:
	//   c        - mutable tensor; presumably receives the result (it is the
	//              only non-const argument) -- confirm in dispatch_scaled_mm.
	//   a, b     - read-only operand tensors.
	//   a_scales - read-only scale tensor associated with `a`.
	//   b_scales - read-only scale tensor associated with `b`.
	//   bias     - optional read-only bias tensor; may be std::nullopt.
	//
	// NOTE(review): how the launcher is selected, and any shape/dtype/device
	// validation, happens inside dispatch_scaled_mm, which is not defined in
	// this file. No checks are performed here.
	void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
	torch::Tensor const& b,
	torch::Tensor const& a_scales,
	torch::Tensor const& b_scales,
	std::optional<torch::Tensor> const& bias) {
	// Pure pass-through: the launchers are supplied as callables so the shared
	// dispatcher can invoke whichever one it decides on.
	dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias,
	vllm::cutlass_scaled_mm_sm90_fp8,
	vllm::cutlass_scaled_mm_sm90_int8,
	vllm::cutlass_scaled_mm_blockwise_sm90_fp8);
	}

	// Host-side entry point for the sm90 (Hopper) scaled matrix multiply with
	// an "azp" correction term (presumably "asymmetric zero point" -- confirm
	// against the kernel implementation).
	//
	// Verifies that both scale tensors are float32, then forwards every
	// argument, unchanged and in the same order, to
	// vllm::cutlass_scaled_mm_azp_sm90_int8.
	//
	// Parameters:
	//   out      - mutable tensor; presumably receives the result (it is the
	//              only non-const argument) -- confirm in the int8 launcher.
	//   a, b     - read-only operand tensors.
	//   a_scales - read-only scale tensor associated with `a`; must be float32.
	//   b_scales - read-only scale tensor associated with `b`; must be float32.
	//   azp_adj  - read-only adjustment tensor, always required.
	//   azp      - optional read-only tensor; may be std::nullopt.
	//   bias     - optional read-only bias tensor; may be std::nullopt.
	//
	// Raises (via TORCH_CHECK) if a_scales or b_scales is not float32. Every
	// other validation is left to the callee, which is not defined in this file.
	void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
	                                torch::Tensor const& b,
	                                torch::Tensor const& a_scales,
	                                torch::Tensor const& b_scales,
	                                torch::Tensor const& azp_adj,
	                                std::optional<torch::Tensor> const& azp,
	                                std::optional<torch::Tensor> const& bias) {
	  // Name the offending argument and report the dtype actually received, so a
	  // failure is actionable without a debugger. Check order (a_scales first) is
	  // kept so the first reported violation is the same as before.
	  TORCH_CHECK(a_scales.dtype() == torch::kFloat32,
	              "cutlass_scaled_mm_azp_sm90: a_scales must be float32, got ",
	              a_scales.dtype());
	  TORCH_CHECK(b_scales.dtype() == torch::kFloat32,
	              "cutlass_scaled_mm_azp_sm90: b_scales must be float32, got ",
	              b_scales.dtype());

	  vllm::cutlass_scaled_mm_azp_sm90_int8(out, a, b, a_scales, b_scales, azp_adj,
	                                        azp, bias);
	}