/***************************************************************************************************
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief CUTLASS Library handle.
*/
#include <iostream>
#include <stdexcept>
#include <cstdint>
#include "cutlass/library/handle.h"
#include "cutlass/library/singleton.h"
#include "cutlass/library/util.h"
namespace cutlass {
namespace library {
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Constructor
Handle::Handle(
cudaStream_t stream,
size_t workspace_size
):
provider_(Provider::kCUTLASS),
stream_(stream),
workspace_(nullptr),
workspace_size_(0),
scalar_pointer_mode_(ScalarPointerMode::kHost),
last_operation_(nullptr) {
int device_idx = -1;
cudaError_t error = cudaGetDevice(&device_idx);
if (error != cudaSuccess) {
throw std::runtime_error("cudaGetDevice() failed");
}
error = cudaGetDeviceProperties(&device_, device_idx);
if (error != cudaSuccess) {
throw std::runtime_error("cudaGetDeviceProperties() failed");
}
set_workspace_size(workspace_size);
Singleton::get();
}
/// Destructor
Handle::~Handle() {
if (workspace_) {
cudaFree(workspace_);
workspace_ = nullptr;
workspace_size_ = 0;
}
}
/// Move constructor
Handle::Handle(Handle && handle) {
provider_ = handle.provider_;
device_ = handle.device_;
workspace_size_ = handle.workspace_size_;
workspace_ = handle.workspace_;
stream_ = handle.stream_;
scalar_pointer_mode_ = handle.scalar_pointer_mode_;
last_operation_ = handle.last_operation_;
handle.workspace_ = nullptr;
handle.workspace_size_ = 0;
}
/// Move assignment operator
Handle & Handle::operator=(Handle && handle) {
if (this != &handle) {
if (workspace_) {
cudaFree(workspace_);
}
provider_ = handle.provider_;
device_ = handle.device_;
workspace_size_ = handle.workspace_size_;
workspace_ = handle.workspace_;
stream_ = handle.stream_;
scalar_pointer_mode_ = handle.scalar_pointer_mode_;
last_operation_ = handle.last_operation_;
handle.workspace_ = nullptr;
handle.workspace_size_ = 0;
}
return *this;
}
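/// Returns the device compute capability encoded as major * 10 + minor
/// (e.g., 80 for SM 8.0, 86 for SM 8.6)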
int Handle::compute_capability() const {
return device_.major * 10 + device_.minor;
}
/// Sets the current CUDA stream
void Handle::set_stream(cudaStream_t stream) {
stream_ = stream;
}
/// Gets the current CUDA stream
cudaStream_t Handle::get_stream() const {
return stream_;
}
/// Gets the current provider
Provider Handle::get_provider() const {
return provider_;
}
/// Sets the provider of operations
void Handle::set_provider(Provider provider) {
provider_ = provider;
}
/// Gets the device workspace size
size_t Handle::get_workspace_size() const {
return workspace_size_;
}
/// Gets a pointer to the device workspace allocation in Global Memory
void *Handle::get_workspace() const {
return workspace_;
}
/// Sets the size of device workspace, invalidating previous calls to get_workspace()
void Handle::set_workspace_size(size_t bytes) {
if (bytes != workspace_size_) {
if (workspace_) {
cudaFree(workspace_);
}
workspace_ = nullptr;
workspace_size_ = bytes;
if (workspace_size_) {
cudaError_t error = cudaMalloc((void **)&workspace_, workspace_size_);
if (error != cudaSuccess) {
throw std::runtime_error("Failed to allocate workspace");
}
}
}
if (workspace_) {
cudaError_t error = cudaMemset(workspace_, 0, workspace_size_);
if (error != cudaSuccess) {
throw std::runtime_error("Failed to clear workspace");
}
}
}
/// Gets the scalar pointer mode
ScalarPointerMode Handle::get_scalar_pointer_mode() const {
return scalar_pointer_mode_;
}
/// Sets the scalar pointer mode
void Handle::set_scalar_pointer_mode(ScalarPointerMode mode) {
scalar_pointer_mode_ = mode;
}
/// Gets the last operation
Operation const *Handle::get_last_operation() const {
return last_operation_;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Returns the maximum alignment requirement among a GEMM operation's A, B, and C operands
static int maximum_alignment_requirement(GemmDescription const &desc) {
return std::max(
std::max(desc.A.alignment, desc.B.alignment), desc.C.alignment);
}
/// Returns the largest alignment (in units of elements) the problem satisfies, starting from a
/// given upper limit.
static int gemm_problem_alignment(
int M,
int N,
int K,
NumericTypeID element_A,
void const *ptr_A,
int64_t lda,
int64_t batch_stride_A,
NumericTypeID element_B,
void const *ptr_B,
int64_t ldb,
int64_t batch_stride_B,
NumericTypeID element_C,
void const * ptr_C,
int64_t ldc,
int64_t batch_stride_C,
void const * ptr_D,
int64_t ldd,
int64_t batch_stride_D,
int max_alignment_in_bytes = 16
) {
void const *pointers[] = {
ptr_A, ptr_B, ptr_C, ptr_D
};
int64_t extents[] = {
M, N, K, lda, ldb, ldc, ldd, batch_stride_A, batch_stride_B, batch_stride_C, batch_stride_D
};
NumericTypeID elements[] = {
element_A, element_B, element_C
};
for (; max_alignment_in_bytes > 0; max_alignment_in_bytes /= 2) {
bool satisfied = true;
// Can pointers satisfy this?
for (void const *ptr : pointers) {
std::uintptr_t int_ptr = reinterpret_cast<std::uintptr_t>(ptr);
if (int_ptr % max_alignment_in_bytes) {
satisfied = false;
break;
}
}
if (!satisfied) {
continue;
}
// Compute the maximum alignment based on element data types
int max_element_alignment = 0;
for (NumericTypeID type_id : elements) {
int element_alignment = max_alignment_in_bytes * 8 / library::sizeof_bits(type_id);
max_element_alignment = std::max(max_element_alignment, element_alignment);
}
// Can the problem size and leading dimensions satisfy this?
for (int64_t extent : extents) {
if (extent % max_element_alignment) {
satisfied = false;
break;
}
}
if (!satisfied) {
continue;
}
// Yes
return max_element_alignment;
}
// No alignment satisfies this problem
return 0;
}
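// Worked example for gemm_problem_alignment() above (illustrative): with
// max_alignment_in_bytes == 16 and half-precision operands (sizeof_bits == 16),
// the first candidate element alignment is 16 * 8 / 16 == 8 elements. If every
// pointer is 16-byte aligned and M, N, K and all leading dimensions are
// divisible by 8, the function returns 8. If, say, lda were odd, the candidate
// halves until the 2-byte level (1 element), which always succeeds and yields 1.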
/// Find the best kernel in descending order of preference.
static Operation const * find_gemm_operation(
GemmOperationFunctionalMap::const_iterator operators_it,
GemmPreferenceKey const preference_key) {
auto cc_it = operators_it->second.upper_bound(preference_key);
if (cc_it == operators_it->second.begin()) {
return nullptr;
}
Operation const *operation = nullptr;
// Search in descending order of compute capability
do {
--cc_it;
// Search tile sizes in order, for now.
for (auto const * op : cc_it->second) {
GemmDescription const &desc = static_cast<GemmDescription const &>(op->description());
int min_cc = desc.tile_description.minimum_compute_capability;
int max_cc = desc.tile_description.maximum_compute_capability;
int op_alignment = maximum_alignment_requirement(desc);
if ((min_cc <= preference_key.compute_capability) &&
(preference_key.compute_capability <= max_cc) &&
(op_alignment <= preference_key.alignment)) {
operation = op;
break;
}
}
} while (!operation && cc_it != operators_it->second.begin());
return operation;
}
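// Illustrative walk-through of find_gemm_operation() above: with
// preference_key.compute_capability == 80, upper_bound() positions cc_it just
// past the last entry whose key compares <= the preference key; the do-while
// then steps backward through entries keyed at 80, 75, 70, ... until it finds
// an operation whose [min_cc, max_cc] range contains 80 and whose alignment
// requirement does not exceed the problem's alignment.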
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Executes a GEMM computation: D <= alpha * A*B + beta * C
Status Handle::gemm(
int M, /// GEMM M dimension
int N, /// GEMM N dimension
int K, /// GEMM K dimension
NumericTypeID element_compute, /// Data type of internal accumulation
NumericTypeID element_scalar, /// Data type of alpha/beta scalars
void const *alpha, /// Pointer to alpha scalar
NumericTypeID element_A, /// Data type of A matrix elements
LayoutTypeID layout_A, /// Layout of A matrix
ComplexTransform transform_A, /// Complex transformation applied to A matrix - ignored for real-valued matrices
void const * ptr_A, /// Pointer to A matrix in Global Memory
int64_t lda, /// Leading dimension of A matrix
NumericTypeID element_B, /// Data type of B matrix elements
LayoutTypeID layout_B, /// Layout of B matrix
ComplexTransform transform_B, /// Complex transformation applied to B matrix - ignored for real-valued matrices
void const * ptr_B, /// Pointer to B matrix in Global Memory
int64_t ldb, /// Leading dimension of B matrix
void const * beta, /// Pointer to beta scalar
NumericTypeID element_C, /// Data type of C and D matrices
void const * ptr_C, /// Pointer to C matrix
int64_t ldc, /// Leading dimension of C matrix
void * ptr_D, /// Pointer to D matrix
int64_t ldd /// Leading dimension of D matrix
) {
//
// Find the operation
//
GemmFunctionalKey key(
provider_,
GemmKind::kGemm,
element_compute,
element_scalar,
element_A,
layout_A,
transform_A,
element_B,
layout_B,
transform_B,
element_C, // C/D are same type and col major default
LayoutTypeID::kColumnMajor,
element_C,
LayoutTypeID::kColumnMajor
);
auto operators_it = Singleton::get().operation_table.gemm_operations.find(key);
if (operators_it == Singleton::get().operation_table.gemm_operations.end()) {
return cutlass::Status::kErrorNotSupported;
}
if (operators_it->second.empty()) {
return cutlass::Status::kErrorNotSupported;
}
//
// Compute the largest alignment restriction the kernel can satisfy.
//
// Maximum alignment expectation among all kernels (in units of bytes)
int const kMaximumAlignmentSize = 16;
int alignment = gemm_problem_alignment(
M, N, K,
element_A, ptr_A, lda, 0,
element_B, ptr_B, ldb, 0,
element_C, ptr_C, ldc, 0,
ptr_D, ldd, 0, kMaximumAlignmentSize
);
//
// Find the best kernel in descending order of preference.
//
GemmPreferenceKey preference_key(compute_capability(), alignment);
Operation const *operation = find_gemm_operation(operators_it, preference_key);
if (!operation) {
return cutlass::Status::kErrorNotSupported;
}
last_operation_ = operation;
//
// Configure operation
//
GemmConfiguration configuration{
{M, N, K},
lda,
ldb,
ldc,
ldd,
1 // split-K slices (single partition)
};
// Query host workspace size
uint64_t host_workspace_size_needed = operation->get_host_workspace_size(&configuration);
if (uint64_t(kHostWorkspaceSize) < host_workspace_size_needed) {
return cutlass::Status::kErrorNotSupported;
}
char host_workspace[kHostWorkspaceSize];
// Query device workspace size
uint64_t device_workspace_size_needed = operation->get_device_workspace_size(&configuration);
if (uint64_t(workspace_size_) < device_workspace_size_needed) {
return cutlass::Status::kErrorNotSupported;
}
// Initialize host and device workspaces
Status status = operation->initialize(
&configuration,
host_workspace,
workspace_,
stream_);
if (status != cutlass::Status::kSuccess) {
return status;
}
// Run the operator
GemmArguments arguments{
ptr_A,
ptr_B,
ptr_C,
ptr_D,
alpha,
beta,
scalar_pointer_mode_
};
return operation->run(&arguments, host_workspace, workspace_, stream_);
}
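// Usage sketch for Handle::gemm() above (illustrative only; assumes
// column-major single-precision matrices d_A (MxK), d_B (KxN), d_C and d_D
// (MxN) already resident in device global memory):
//
//   Handle handle;
//   float alpha = 1.0f;
//   float beta = 0.0f;
//   Status status = handle.gemm(
//     M, N, K,
//     NumericTypeID::kF32, // accumulation type
//     NumericTypeID::kF32, // alpha/beta type
//     &alpha,
//     NumericTypeID::kF32, LayoutTypeID::kColumnMajor, ComplexTransform::kNone, d_A, M,
//     NumericTypeID::kF32, LayoutTypeID::kColumnMajor, ComplexTransform::kNone, d_B, K,
//     &beta,
//     NumericTypeID::kF32, d_C, M,
//     d_D, M);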
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Executes a GEMM computation: D <= alpha * A*B + beta * C.
//
// Supports batched-strided, batched-array, split-K serial, and split-K parallel modes.
//
Status Handle::gemm_universal(
GemmUniversalMode mode, /// indicates the mode in which the kUniversal GEMM is launched
int M, /// GEMM M dimension
int N, /// GEMM N dimension
int K, /// GEMM K dimension
NumericTypeID element_compute, /// Data type of internal accumulation
NumericTypeID element_scalar, /// Data type of alpha/beta scalars
void const *alpha, /// Pointer to alpha scalar
NumericTypeID element_A, /// Data type of A matrix elements
LayoutTypeID layout_A, /// Layout of A matrix
ComplexTransform transform_A, /// Complex transformation applied to A matrix - ignored for real-valued matrices
void const * ptr_A, /// Pointer to A matrix in Global Memory
int64_t lda, /// Leading dimension of A matrix
NumericTypeID element_B, /// Data type of B matrix elements
LayoutTypeID layout_B, /// Layout of B matrix
ComplexTransform transform_B, /// Complex transformation applied to B matrix - ignored for real-valued matrices
void const * ptr_B, /// Pointer to B matrix in Global Memory
int64_t ldb, /// Leading dimension of B matrix
void const * beta, /// Pointer to beta scalar
NumericTypeID element_C, /// Data type of C matrix
LayoutTypeID layout_C, /// Layout of C matrix
void const * ptr_C, /// Pointer to C matrix
int64_t ldc, /// Leading dimension of C matrix
NumericTypeID element_D, /// Data type of D matrix
LayoutTypeID layout_D, /// Layout of D matrix
void * ptr_D, /// Pointer to D matrix
int64_t ldd, /// Leading dimension of D matrix
int batch_count, /// Batch count or number of split-K slices
int64_t batch_stride_A, /// Batch stride of A operand
int64_t batch_stride_B, /// Batch stride of B operand
int64_t batch_stride_C, /// Batch stride of C operand
int64_t batch_stride_D /// Batch stride of D operand
) {
//
// Find the operation
//
GemmFunctionalKey key(
provider_,
GemmKind::kUniversal,
element_compute,
element_scalar,
element_A,
layout_A,
transform_A,
element_B,
layout_B,
transform_B,
element_C,
layout_C,
element_D,
layout_D
);
auto operators_it = Singleton::get().operation_table.gemm_operations.find(key);
if (operators_it == Singleton::get().operation_table.gemm_operations.end()) {
return cutlass::Status::kErrorNotSupported;
}
if (operators_it->second.empty()) {
return cutlass::Status::kErrorNotSupported;
}
//
// Compute the largest alignment restriction the kernel can satisfy.
//
// Maximum alignment expectation among all kernels (in units of bytes)
int const kMaximumAlignmentSize = 16;
void const *ptr_A_check = ptr_A;
void const *ptr_B_check = ptr_B;
void const *ptr_C_check = ptr_C;
void * ptr_D_check = ptr_D;
// Ignore alignment of pointers to pointers. We can't check this from the host,
// as each batch index has its own pointer in device memory.
if (mode == GemmUniversalMode::kArray) {
ptr_A_check = nullptr;
ptr_B_check = nullptr;
ptr_C_check = nullptr;
ptr_D_check = nullptr;
}
int alignment = gemm_problem_alignment(
M, N, K,
element_A, ptr_A_check, lda, 0,
element_B, ptr_B_check, ldb, 0,
element_C, ptr_C_check, ldc, 0,
ptr_D_check, ldd, 0, kMaximumAlignmentSize
);
//
// Find the best kernel in descending order of preference.
//
GemmPreferenceKey preference_key(compute_capability(), alignment);
Operation const *operation = find_gemm_operation(operators_it, preference_key);
if (!operation) {
return cutlass::Status::kErrorNotSupported;
}
last_operation_ = operation;
//
// Configure operation
//
GemmUniversalConfiguration configuration{
mode,
{M, N, K},
batch_count,
lda,
ldb,
ldc,
ldd
};
// Query host workspace size
uint64_t host_workspace_size_needed = operation->get_host_workspace_size(&configuration);
if (uint64_t(kHostWorkspaceSize) < host_workspace_size_needed) {
return cutlass::Status::kErrorNotSupported;
}
char host_workspace[kHostWorkspaceSize];
GemmUniversalArguments arguments{
{M, N, K},
batch_count,
ptr_A,
ptr_B,
ptr_C,
ptr_D,
alpha,
beta,
scalar_pointer_mode_,
lda,
ldb,
ldc,
ldd,
batch_stride_A,
batch_stride_B,
batch_stride_C,
batch_stride_D
};
// Query device workspace size
uint64_t device_workspace_size_needed = operation->get_device_workspace_size(&configuration, &arguments);
if (uint64_t(workspace_size_) < device_workspace_size_needed) {
return cutlass::Status::kErrorNotSupported;
}
// Initialize host and device workspaces
Status status = operation->initialize(
&configuration,
host_workspace,
workspace_,
stream_);
if (status != cutlass::Status::kSuccess) {
return status;
}
// Run the operator
return operation->run(&arguments, host_workspace, workspace_, stream_);
}
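// Usage sketch (illustrative): a strided-batched launch differs from the basic
// gemm() above mainly in the mode and batch parameters, e.g.
// mode == GemmUniversalMode::kBatched, batch_count == B, and, for column-major
// operands, batch_stride_A == lda * K, batch_stride_B == ldb * N,
// batch_stride_C == ldc * N, and batch_stride_D == ldd * N.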
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Planar complex GEMM
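//
// Planar complex operands store real and imaginary parts in separate planes,
// so each operand below takes a (real, imag) pointer pair with independent
// leading dimensions and batch strides.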
Status Handle::gemm_planar_complex(
int M, /// GEMM M dimension
int N, /// GEMM N dimension
int K, /// GEMM K dimension
NumericTypeID element_compute, /// Data type of internal accumulation
NumericTypeID element_scalar, /// Data type of alpha/beta scalars
void const *alpha, /// Pointer to alpha scalar
NumericTypeID element_A, /// Data type of A matrix elements
LayoutTypeID layout_A, /// Layout of A matrix
ComplexTransform transform_A, /// Complex transformation applied to A matrix
void const * ptr_A_real, /// Pointer to real part of A matrix
void const * ptr_A_imag, /// Pointer to imaginary part of A matrix
int64_t lda_real, /// Leading dimension of real part of A matrix
int64_t lda_imag, /// Leading dimension of imaginary part of A matrix
NumericTypeID element_B, /// Data type of B matrix elements
LayoutTypeID layout_B, /// Layout of B matrix
ComplexTransform transform_B, /// Complex transformation applied to B matrix
void const * ptr_B_real, /// Pointer to real part of B matrix
void const * ptr_B_imag, /// Pointer to imaginary part of B matrix
int64_t ldb_real, /// Leading dimension of real part of B matrix
int64_t ldb_imag, /// Leading dimension of imaginary part of B matrix
void const * beta, /// Pointer to beta scalar
NumericTypeID element_C, /// Data type of C and D matrix
void const * ptr_C_real, /// Pointer to real part of C matrix
void const * ptr_C_imag, /// Pointer to imaginary part of C matrix
int64_t ldc_real, /// Leading dimension of real part of C matrix
int64_t ldc_imag, /// Leading dimension of imaginary part of C matrix
void * ptr_D_real, /// Pointer to real part of D matrix
void * ptr_D_imag, /// Pointer to imaginary part of D matrix
int64_t ldd_real, /// Leading dimension of real part of D matrix
int64_t ldd_imag, /// Leading dimension of imaginary part of D matrix
int batch_count, /// Number of batched GEMMs to execute
int64_t batch_stride_A_real,
int64_t batch_stride_A_imag,
int64_t batch_stride_B_real,
int64_t batch_stride_B_imag,
int64_t batch_stride_C_real,
int64_t batch_stride_C_imag,
int64_t batch_stride_D_real,
int64_t batch_stride_D_imag
) {
//
// Find the operation
//
GemmFunctionalKey key(
provider_,
GemmKind::kPlanarComplex,
element_compute,
element_scalar,
element_A,
layout_A,
transform_A,
element_B,
layout_B,
transform_B,
element_C, // C/D are same type
LayoutTypeID::kColumnMajor,
element_C,
LayoutTypeID::kColumnMajor
);
auto operators_it = Singleton::get().operation_table.gemm_operations.find(key);
if (operators_it == Singleton::get().operation_table.gemm_operations.end()) {
return cutlass::Status::kErrorNotSupported;
}
if (operators_it->second.empty()) {
return cutlass::Status::kErrorNotSupported;
}
//
// Compute the largest alignment restriction the kernel can satisfy.
//
// Maximum alignment expectation among all kernels (in units of bytes)
int const kMaximumAlignmentSize = 16;
int alignment = std::max(
gemm_problem_alignment(
M, N, K,
element_A, ptr_A_real, lda_real, batch_stride_A_real,
element_B, ptr_B_real, ldb_real, batch_stride_B_real,
element_C, ptr_C_real, ldc_real, batch_stride_C_real,
ptr_D_real, ldd_real, batch_stride_D_real, kMaximumAlignmentSize
),
gemm_problem_alignment(
M, N, K,
element_A, ptr_A_imag, lda_imag, batch_stride_A_imag,
element_B, ptr_B_imag, ldb_imag, batch_stride_B_imag,
element_C, ptr_C_imag, ldc_imag, batch_stride_C_imag,
ptr_D_imag, ldd_imag, batch_stride_D_imag, kMaximumAlignmentSize
)
);
//
// Find the best kernel in descending order of preference.
//
GemmPreferenceKey preference_key(compute_capability(), alignment);
Operation const *operation = find_gemm_operation(operators_it, preference_key);
if (!operation) {
return cutlass::Status::kErrorNotSupported;
}
last_operation_ = operation;
//
// Configure operation
//
GemmPlanarComplexConfiguration configuration{
GemmUniversalMode::kBatched,
{M, N, K},
batch_count,
lda_real,
lda_imag,
ldb_real,
ldb_imag,
ldc_real,
ldc_imag,
ldd_real,
ldd_imag
};
// Query host workspace size
uint64_t host_workspace_size_needed = operation->get_host_workspace_size(&configuration);
if (uint64_t(kHostWorkspaceSize) < host_workspace_size_needed) {
return cutlass::Status::kErrorNotSupported;
}
char host_workspace[kHostWorkspaceSize];
// Query device workspace size
uint64_t device_workspace_size_needed = operation->get_device_workspace_size(&configuration);
if (uint64_t(workspace_size_) < device_workspace_size_needed) {
return cutlass::Status::kErrorNotSupported;
}
// Initialize host and device workspaces
Status status = operation->initialize(
&configuration,
host_workspace,
workspace_,
stream_);
if (status != cutlass::Status::kSuccess) {
return status;
}
// Run the operator
GemmPlanarComplexArguments arguments{
ptr_A_real,
ptr_A_imag,
ptr_B_real,
ptr_B_imag,
ptr_C_real,
ptr_C_imag,
ptr_D_real,
ptr_D_imag,
alpha,
beta,
scalar_pointer_mode_,
batch_stride_A_real,
batch_stride_A_imag,
batch_stride_B_real,
batch_stride_B_imag,
batch_stride_C_real,
batch_stride_C_imag,
batch_stride_D_real,
batch_stride_D_imag
};
return operation->run(&arguments, host_workspace, workspace_, stream_);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Planar complex batched GEMM loading pointers from arrays in global memory
Status Handle::gemm_planar_complex_array(
int expected_M, /// Expected GEMM M dimension (used for sizing CUDA grid)
int expected_N, /// Expected GEMM N dimension (used for sizing CUDA grid)
int expected_K, /// Expected GEMM K dimension
int batch_count, /// Number of independent GEMM computations to execute
int const *M, /// Array containing the GEMM M dimension for each batch index
int const *N, /// Array containing the GEMM N dimension for each batch index
int const *K, /// Array containing the GEMM K dimension for each batch index
NumericTypeID element_compute, /// Data type of internal accumulation
NumericTypeID element_scalar, /// Data type of alpha/beta scalars
void const *alpha, /// Pointer to alpha scalar
NumericTypeID element_A, /// Data type of A matrix elements
LayoutTypeID layout_A, /// Layout of A matrix
ComplexTransform transform_A, /// Complex transformation applied to A matrix
void const * const * ptr_A_real, /// Pointer to array containing pointers to real part of A matrices
void const * const * ptr_A_imag, /// Pointer to array containing pointers to imaginary part of A matrices
int64_t lda_real, /// Leading dimension of real part of A matrix
int64_t lda_imag, /// Leading dimension of imaginary part of A matrix
NumericTypeID element_B, /// Data type of B matrix elements
LayoutTypeID layout_B, /// Layout of B matrix
ComplexTransform transform_B, /// Complex transformation applied to B matrix
void const * const * ptr_B_real, /// Pointer to array containing pointers to real part of B matrices
void const * const * ptr_B_imag, /// Pointer to array containing pointers to imaginary part of B matrices
int64_t ldb_real, /// Leading dimension of real part of B matrix
int64_t ldb_imag, /// Leading dimension of imaginary part of B matrix
void const * beta, /// Pointer to beta scalar
NumericTypeID element_C, /// Data type of C and D matrix
void const * const * ptr_C_real, /// Pointer to array containing pointers to real part of C matrices
void const * const * ptr_C_imag, /// Pointer to array containing pointers to imaginary part of C matrices
int64_t ldc_real, /// Leading dimension of real part of C matrix
int64_t ldc_imag, /// Leading dimension of imaginary part of C matrix
void * const * ptr_D_real, /// Pointer to array containing pointers to real part of D matrices
void * const * ptr_D_imag, /// Pointer to array containing pointers to imaginary part of D matrices
int64_t ldd_real, /// Leading dimension of real part of D matrix
int64_t ldd_imag /// Leading dimension of imaginary part of D matrix
) {
//
// Find the operation
//
GemmFunctionalKey key(
provider_,
GemmKind::kPlanarComplexArray,
element_compute,
element_scalar,
element_A,
layout_A,
transform_A,
element_B,
layout_B,
transform_B,
element_C, // C/D are same type
LayoutTypeID::kColumnMajor,
element_C,
LayoutTypeID::kColumnMajor
);
auto operators_it = Singleton::get().operation_table.gemm_operations.find(key);
if (operators_it == Singleton::get().operation_table.gemm_operations.end()) {
return cutlass::Status::kErrorNotSupported;
}
if (operators_it->second.empty()) {
return cutlass::Status::kErrorNotSupported;
}
//
// Compute the largest alignment restriction the kernel can satisfy.
//
// Maximum alignment expectation among all kernels (in units of bytes)
int const kMaximumAlignmentSize = 16;
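// Pointers are omitted (nullptr) from the alignment computation: each batch
// index has its own pointer in device memory, which cannot be checked from the
// host.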
int alignment = std::max(
gemm_problem_alignment(
expected_M, expected_N, expected_K,
element_A, nullptr, lda_real, 0,
element_B, nullptr, ldb_real, 0,
element_C, nullptr, ldc_real, 0,
nullptr, ldd_real, 0, kMaximumAlignmentSize
),
gemm_problem_alignment(
expected_M, expected_N, expected_K,
element_A, nullptr, lda_imag, 0,
element_B, nullptr, ldb_imag, 0,
element_C, nullptr, ldc_imag, 0,
nullptr, ldd_imag, 0, kMaximumAlignmentSize
)
);
//
// Find the best kernel in descending order of preference.
//
GemmPreferenceKey preference_key(compute_capability(), alignment);
Operation const *operation = find_gemm_operation(operators_it, preference_key);
if (!operation) {
return cutlass::Status::kErrorNotSupported;
}
last_operation_ = operation;
//
// Configure operation
//
GemmPlanarComplexArrayConfiguration configuration{
{expected_M, expected_N, expected_K},
batch_count,
lda_real,
lda_imag,
ldb_real,
ldb_imag,
ldc_real,
ldc_imag,
ldd_real,
ldd_imag
};
// Query host workspace size
uint64_t host_workspace_size_needed = operation->get_host_workspace_size(&configuration);
if (uint64_t(kHostWorkspaceSize) < host_workspace_size_needed) {
return cutlass::Status::kErrorNotSupported;
}
char host_workspace[kHostWorkspaceSize];
// Query device workspace size
uint64_t device_workspace_size_needed = operation->get_device_workspace_size(&configuration);
if (uint64_t(workspace_size_) < device_workspace_size_needed) {
return cutlass::Status::kErrorNotSupported;
}
// Initialize host and device workspaces
Status status = operation->initialize(
&configuration,
host_workspace,
workspace_,
stream_);
if (status != cutlass::Status::kSuccess) {
return status;
}
// Run the operator
GemmPlanarComplexArrayArguments arguments{
M, N, K,
ptr_A_real,
ptr_A_imag,
ptr_B_real,
ptr_B_imag,
ptr_C_real,
ptr_C_imag,
ptr_D_real,
ptr_D_imag,
alpha,
beta,
scalar_pointer_mode_
};
return operation->run(&arguments, host_workspace, workspace_, stream_);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Finds conv operation instances with Conv::ElementC = Reduction::ElementWorkspace
Operation const* find_conv_operation_for_parallel_reduction(Operation const *operation) {
ConvDescription const &conv_desc =
static_cast<ConvDescription const &>(operation->description());
// if the current conv operation's accumulator and output data types match, return the operation
if (conv_desc.tile_description.math_instruction.element_accumulator == conv_desc.C.element) {
return operation;
}
// find conv operation to match conv output and reduction workspace data type
ConvFunctionalKey key(
library::Provider::kCUTLASS,
conv_desc.conv_kind,
conv_desc.A.element,
conv_desc.A.layout,
conv_desc.B.element,
conv_desc.B.layout,
conv_desc.tile_description.math_instruction.element_accumulator,
conv_desc.C.layout,
conv_desc.tile_description.math_instruction.element_accumulator,
conv_desc.element_epilogue);
// conv operation table for conv2d or conv3d
auto const &conv_operations = (conv_desc.kind == OperationKind::kConv2d) ?
Singleton::get().operation_table.conv2d_operations :
Singleton::get().operation_table.conv3d_operations;
// find ConvFunctionalKey in convolution operation table
auto operators_it = conv_operations.find(key);
if (operators_it == conv_operations.end()) {
return nullptr;
}
if (operators_it->second.empty()) {
return nullptr;
}
// conv operation for same compute capability and iterator algorithm
ConvPreferenceKey preference_key(
conv_desc.tile_description.minimum_compute_capability,
conv_desc.iterator_algorithm);
auto it = operators_it->second.find(preference_key);
if (it == operators_it->second.end()) {
return nullptr;
}
// return the matching conv operation (same tile sizes and instruction)
for (auto op : it->second) {
if (op->description().tile_description == operation->description().tile_description) {
return op;
}
}
return nullptr;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Finds gemm operation instances with Gemm::ElementC = Reduction::ElementWorkspace
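//
// In parallel split-K, partial products are accumulated in a workspace with the
// accumulator data type, so the matching kernel must produce output in that
// accumulator type before the reduction kernel computes the final result.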
Operation const* find_gemm_operation_for_parallel_reduction(Operation const *operation) {
GemmDescription const &gemm_desc =
static_cast<GemmDescription const &>(operation->description());
// if the current gemm operation's accumulator and output data types match, return the operation
if (gemm_desc.tile_description.math_instruction.element_accumulator == gemm_desc.D.element) {
return operation;
}
// find gemm operation to match gemm output and reduction workspace data type
GemmFunctionalKey key(
library::Provider::kCUTLASS,
gemm_desc.gemm_kind,
gemm_desc.tile_description.math_instruction.element_accumulator,
gemm_desc.element_epilogue,
gemm_desc.A.element,
gemm_desc.A.layout,
gemm_desc.transform_A,
gemm_desc.B.element,
gemm_desc.B.layout,
gemm_desc.transform_B,
gemm_desc.tile_description.math_instruction.element_accumulator, // C/D are same type
LayoutTypeID::kColumnMajor,
gemm_desc.tile_description.math_instruction.element_accumulator,
LayoutTypeID::kColumnMajor);
// gemm operation table
auto const &gemm_operations = Singleton::get().operation_table.gemm_operations;
// find GemmFunctionalKey in gemm operation table
auto operators_it = gemm_operations.find(key);
if (operators_it == gemm_operations.end()) {
return nullptr;
}
if (operators_it->second.empty()) {
return nullptr;
}
// gemm operation for same compute capability and max operand alignment
int alignment = std::max(
gemm_desc.A.alignment,
gemm_desc.B.alignment);
GemmPreferenceKey preference_key(
gemm_desc.tile_description.minimum_compute_capability,
alignment);
auto it = operators_it->second.find(preference_key);
if (it == operators_it->second.end()) {
return nullptr;
}
// return the matching gemm operation (same tile shape, stages, warp count, and instruction)
for (auto op : it->second) {
if (op->description().tile_description == operation->description().tile_description) {
return op;
}
}
// return nullptr if no matching gemm operation found for parallel split-k reduction
return nullptr;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////