/***************************************************************************************************
 * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Defines additional layout functions used in the Permute GEMM example to simplify
      computing reference permutations of 4-D/5-D tensors when the source data is column-major.

    Layout names list the dimensions from slowest-varying to fastest-varying in memory;
    e.g. in TensorCWHN the N coordinate has unit stride and C has the largest stride.
*/

#pragma once

#include <cassert>

#include "cutlass/cutlass.h"
#include "cutlass/coord.h"
#include "cutlass/tensor_coord.h"
#include "cutlass/layout/pitch_linear.h"
namespace cutlass {
namespace layout {

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Mapping function for 4-D CWHN tensors.
class TensorCWHN {
public:
  /// Logical rank of tensor
  static int const kRank = 4;

  /// Rank of stride vector
  static int const kStrideRank = 3;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate (n, h, w, c)
  using TensorCoord = Tensor4DCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank>;

private:
  //
  // Data members
  //

  /// Stride data member - [n, hn, whn]
  Stride stride_;

public:
  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorCWHN(Stride const &stride = Stride(0)): stride_(stride) { }
  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorCWHN(
    typename Stride::Index stride_h,    ///< number of elements between adjacent H coordinates
    typename Stride::Index stride_w,    ///< number of elements between adjacent W coordinates
    typename Stride::Index stride_c     ///< number of elements between adjacent C coordinates
  ):
    stride_(make_Coord(stride_h, stride_w, stride_c)) { }

  /// Constructor
  // Once convolutions implement 64b strides, this ctor can be deleted
  CUTLASS_HOST_DEVICE
  TensorCWHN(Coord<kStrideRank, LongIndex> const &stride):
    stride_(make_Coord(
      static_cast<typename Stride::Index>(stride[0]),
      static_cast<typename Stride::Index>(stride[1]),
      static_cast<typename Stride::Index>(stride[2]))
    ) { }

  /// Helper returns a layout to a tightly packed CWHN tensor.
  CUTLASS_HOST_DEVICE
  static TensorCWHN packed(TensorCoord const &extent) {
    return TensorCWHN(
      make_Coord(
        extent.n(),
        extent.h() * extent.n(),
        extent.w() * extent.h() * extent.n()
      )
    );
  }
  /// Returns the offset of a coordinate (n, h, w, c) in linear memory.
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    // Cast strides to LongIndex before multiplying to avoid 32-bit overflow on large tensors
    return coord.n() +
      LongIndex(stride_[0]) * coord.h() +
      LongIndex(stride_[1]) * coord.w() +
      LongIndex(stride_[2]) * coord.c();
  }

  /// Returns the offset of a pitch-linear coordinate in linear memory.
  CUTLASS_HOST_DEVICE
  LongIndex operator()(PitchLinearCoord coord) const {
    return coord.contiguous() + LongIndex(stride_[2]) * coord.strided();
  }
  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns a mutable reference to the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }
  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    // The capacity computation is only meaningful when each extent fits within the
    // corresponding stride; otherwise the layout is inconsistent and the result
    // cannot be relied upon. These checks could be moved to debug-only builds.
    if ((extent.n() > stride_[0])
      || (extent.h() * stride_[0] > stride_[1])
      || (extent.w() * stride_[1] > stride_[2])) {
      assert(0);
    }
    return extent.c() * stride_[2];
  }
};
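
// A minimal usage sketch (illustrative values only, not part of the original example):
//
//   // Packed CWHN layout for extent (n=2, h=3, w=4, c=5): strides are [2, 6, 24].
//   TensorCWHN layout = TensorCWHN::packed(Tensor4DCoord(2, 3, 4, 5));
//   // Offset of coordinate (n=1, h=2, w=3, c=4): 1 + 2*2 + 6*3 + 24*4 = 119.
//   int64_t offset = layout(Tensor4DCoord(1, 2, 3, 4));
//   // Storage needed: capacity = c * stride[2] = 5 * 24 = 120 elements.
//   int64_t size = layout.capacity(Tensor4DCoord(2, 3, 4, 5));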
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Mapping function for 4-D NHCW tensors.
class TensorNHCW {
public:
  /// Logical rank of tensor
  static int const kRank = 4;

  /// Rank of stride vector
  static int const kStrideRank = 3;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate (n, h, w, c)
  using TensorCoord = Tensor4DCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank>;

private:
  //
  // Data members
  //

  /// Stride data member - [w, cw, hcw]
  Stride stride_;

public:
  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorNHCW(Stride const &stride = Stride(0)): stride_(stride) { }
  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorNHCW(
    typename Stride::Index stride_c,    ///< number of elements between adjacent C coordinates
    typename Stride::Index stride_h,    ///< number of elements between adjacent H coordinates
    typename Stride::Index stride_n     ///< number of elements between adjacent N coordinates
  ):
    stride_(make_Coord(stride_c, stride_h, stride_n)) { }

  /// Constructor
  // Once convolutions implement 64b strides, this ctor can be deleted
  CUTLASS_HOST_DEVICE
  TensorNHCW(Coord<kStrideRank, LongIndex> const &stride):
    stride_(make_Coord(
      static_cast<typename Stride::Index>(stride[0]),
      static_cast<typename Stride::Index>(stride[1]),
      static_cast<typename Stride::Index>(stride[2]))
    ) { }

  /// Helper returns a layout to a tightly packed NHCW tensor.
  CUTLASS_HOST_DEVICE
  static TensorNHCW packed(TensorCoord const &extent) {
    return TensorNHCW(
      make_Coord(
        extent.w(),
        extent.c() * extent.w(),
        extent.h() * extent.c() * extent.w()
      )
    );
  }
  /// Returns the offset of a coordinate (n, h, w, c) in linear memory.
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    // Cast strides to LongIndex before multiplying to avoid 32-bit overflow on large tensors
    return coord.w() +
      LongIndex(stride_[0]) * coord.c() +
      LongIndex(stride_[1]) * coord.h() +
      LongIndex(stride_[2]) * coord.n();
  }

  /// Returns the offset of a pitch-linear coordinate in linear memory.
  CUTLASS_HOST_DEVICE
  LongIndex operator()(PitchLinearCoord coord) const {
    return coord.contiguous() + LongIndex(stride_[2]) * coord.strided();
  }
  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns a mutable reference to the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }
  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    // The capacity computation is only meaningful when each extent fits within the
    // corresponding stride; otherwise the layout is inconsistent and the result
    // cannot be relied upon. These checks could be moved to debug-only builds.
    if ((extent.w() > stride_[0])
      || (extent.c() * stride_[0] > stride_[1])
      || (extent.h() * stride_[1] > stride_[2])) {
      assert(0);
    }
    return extent.n() * stride_[2];
  }
};
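
// A minimal usage sketch (illustrative values only, not part of the original example):
//
//   // Packed NHCW layout for extent (n=2, h=3, w=4, c=5): strides are [4, 20, 60].
//   TensorNHCW layout = TensorNHCW::packed(Tensor4DCoord(2, 3, 4, 5));
//   // Offset of coordinate (n=1, h=2, w=3, c=4): 3 + 4*4 + 20*2 + 60*1 = 119.
//   int64_t offset = layout(Tensor4DCoord(1, 2, 3, 4));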
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Mapping function for 4-D NCWH tensors.
class TensorNCWH {
public:
  /// Logical rank of tensor
  static int const kRank = 4;

  /// Rank of stride vector
  static int const kStrideRank = 3;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate (n, h, w, c)
  using TensorCoord = Tensor4DCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank>;

private:
  //
  // Data members
  //

  /// Stride data member - [h, wh, cwh]
  Stride stride_;

public:
  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorNCWH(Stride const &stride = Stride(0)): stride_(stride) { }
  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorNCWH(
    typename Stride::Index stride_w,    ///< number of elements between adjacent W coordinates
    typename Stride::Index stride_c,    ///< number of elements between adjacent C coordinates
    typename Stride::Index stride_n     ///< number of elements between adjacent N coordinates
  ):
    stride_(make_Coord(stride_w, stride_c, stride_n)) { }

  /// Constructor
  // Once convolutions implement 64b strides, this ctor can be deleted
  CUTLASS_HOST_DEVICE
  TensorNCWH(Coord<kStrideRank, LongIndex> const &stride):
    stride_(make_Coord(
      static_cast<typename Stride::Index>(stride[0]),
      static_cast<typename Stride::Index>(stride[1]),
      static_cast<typename Stride::Index>(stride[2]))
    ) { }

  /// Helper returns a layout to a tightly packed NCWH tensor.
  CUTLASS_HOST_DEVICE
  static TensorNCWH packed(TensorCoord const &extent) {
    return TensorNCWH(
      make_Coord(
        extent.h(),
        extent.w() * extent.h(),
        extent.c() * extent.w() * extent.h()
      )
    );
  }
  /// Returns the offset of a coordinate (n, h, w, c) in linear memory.
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    // Cast strides to LongIndex before multiplying to avoid 32-bit overflow on large tensors
    return coord.h() +
      LongIndex(stride_[0]) * coord.w() +
      LongIndex(stride_[1]) * coord.c() +
      LongIndex(stride_[2]) * coord.n();
  }

  /// Returns the offset of a pitch-linear coordinate in linear memory.
  CUTLASS_HOST_DEVICE
  LongIndex operator()(PitchLinearCoord coord) const {
    return coord.contiguous() + LongIndex(stride_[2]) * coord.strided();
  }
  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns a mutable reference to the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }
  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    // The capacity computation is only meaningful when each extent fits within the
    // corresponding stride; otherwise the layout is inconsistent and the result
    // cannot be relied upon. These checks could be moved to debug-only builds.
    if ((extent.h() > stride_[0])
      || (extent.w() * stride_[0] > stride_[1])
      || (extent.c() * stride_[1] > stride_[2])) {
      assert(0);
    }
    return extent.n() * stride_[2];
  }
};
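
// A minimal usage sketch (illustrative values only, not part of the original example):
//
//   // Packed NCWH layout for extent (n=2, h=3, w=4, c=5): strides are [3, 12, 60].
//   TensorNCWH layout = TensorNCWH::packed(Tensor4DCoord(2, 3, 4, 5));
//   // Offset of coordinate (n=1, h=2, w=3, c=4): 2 + 3*3 + 12*4 + 60*1 = 119.
//   int64_t offset = layout(Tensor4DCoord(1, 2, 3, 4));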
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Mapping function for 5-D CWHDN tensors.
class TensorCWHDN {
public:
  /// Logical rank of tensor
  static int const kRank = 5;

  /// Rank of stride vector
  static int const kStrideRank = 4;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate (n, d, h, w, c)
  using TensorCoord = Tensor5DCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank>;

private:
  //
  // Data members
  //

  /// Stride data member - [n, dn, hdn, whdn]
  Stride stride_;

public:
  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorCWHDN(Stride const &stride = Stride(0)): stride_(stride) { }
  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorCWHDN(
    typename Stride::Index n,       ///< number of elements between adjacent D coordinates
    typename Stride::Index dn,      ///< number of elements between adjacent H coordinates
    typename Stride::Index hdn,     ///< number of elements between adjacent W coordinates
    typename Stride::Index whdn    ///< number of elements between adjacent C coordinates
  ):
    stride_(make_Coord(n, dn, hdn, whdn)) { }

  /// Constructor
  // Once convolutions implement 64b strides, this ctor can be deleted
  CUTLASS_HOST_DEVICE
  TensorCWHDN(Coord<kStrideRank, LongIndex> const &stride):
    stride_(make_Coord(
      static_cast<typename Stride::Index>(stride[0]),
      static_cast<typename Stride::Index>(stride[1]),
      static_cast<typename Stride::Index>(stride[2]),
      static_cast<typename Stride::Index>(stride[3]))
    ) { }
  /// Helper returns a layout to a tightly packed CWHDN tensor.
  CUTLASS_HOST_DEVICE
  static TensorCWHDN packed(TensorCoord const &extent) {
    return TensorCWHDN(
      make_Coord(
        extent.n(),
        extent.d() * extent.n(),
        extent.h() * extent.d() * extent.n(),
        extent.w() * extent.h() * extent.d() * extent.n()
      )
    );
  }
  /// Returns the offset of a coordinate (n, d, h, w, c) in linear memory.
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    // Cast strides to LongIndex before multiplying to avoid 32-bit overflow on large tensors
    return coord.n() +
      LongIndex(stride_[0]) * coord.d() +
      LongIndex(stride_[1]) * coord.h() +
      LongIndex(stride_[2]) * coord.w() +
      LongIndex(stride_[3]) * coord.c();
  }

  /// Returns the offset of a pitch-linear coordinate in linear memory.
  CUTLASS_HOST_DEVICE
  LongIndex operator()(PitchLinearCoord coord) const {
    return coord.contiguous() + LongIndex(stride_[3]) * coord.strided();
  }
  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns a mutable reference to the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }
  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    // The capacity computation is only meaningful when each extent fits within the
    // corresponding stride; otherwise the layout is inconsistent and the result
    // cannot be relied upon. These checks could be moved to debug-only builds.
    if ((extent.n() > stride_[0])
      || (extent.d() * stride_[0] > stride_[1])
      || (extent.h() * stride_[1] > stride_[2])
      || (extent.w() * stride_[2] > stride_[3])) {
      assert(0);
    }
    return extent.c() * stride_[3];
  }
};
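
// A minimal usage sketch (illustrative values only, not part of the original example):
//
//   // Packed CWHDN layout for extent (n=2, d=3, h=4, w=5, c=6): strides are [2, 6, 24, 120].
//   TensorCWHDN layout = TensorCWHDN::packed(Tensor5DCoord(2, 3, 4, 5, 6));
//   // Offset of coordinate (n=1, d=2, h=3, w=4, c=5): 1 + 2*2 + 6*3 + 24*4 + 120*5 = 719.
//   int64_t offset = layout(Tensor5DCoord(1, 2, 3, 4, 5));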
/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace layout
} // namespace cutlass