#pragma once

#include <ATen/cuda/ApplyGridUtils.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/core/TensorBase.h>
#include <ATen/ceil_div.h>
#include <ATen/cuda/Atomic.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <c10/macros/Macros.h>
#include <ATen/native/Copy.h>

#include <math.h>

//
// This file contains pointwise operation functions and kernels that
// work on both contiguous and non-contiguous tensor arguments of
// arbitrary dimensionality (up to MAX_TENSORINFO_DIMS) without
// copying or temporary storage.
//

/*
  NOTE [ CUDA_tensor_applyN helpers ]

  The following CUDA_tensor_applyN (where N currently can be 1, 2, 3, or 4)
  functions apply a pointwise operator to N tensor(s).

  The calling convention is

  1. The template arguments should be, sequentially,
    - First N typename args specify the scalar types of each of the N tensors.
    - (Optional) `int step` arg specifies the number of elements processed
      at the same time. Default is 1.
    - A usually omitted (i.e., inferred) typename arg specifies the type of the
      function/functor applied on `N * step` values in each iteration of each
      CUDA thread.
  2. The arguments should be, sequentially,
    - N tensors
    - op: a function/functor that processes `N * step` values at the same time.
      - If `step == 1`, it must have signature
        `void(*)(scalar1_t&, scalar2_t&, ..., scalarN_t&)`, where
        `scalar*_t`s are the first N typename template args, and the inputs
        are the `N` values from the `N` tensors retrieved at a common index.
      - Otherwise, it must have signature
          void(*)(int n, scalar1_t&, scalar1_t&, ..., scalar1_t&,  // repeat `step` times
                         scalar2_t&, scalar2_t&, ..., scalar2_t&,  // repeat `step` times
                         ...,
                         scalarN_t&, scalarN_t&, ..., scalarN_t&)  // repeat `step` times
        Different from the `step == 1` case, it processes `N * step` values taken
        from `step` common indices. Moreover, the first input `n` represents the
        number of valid indices (it always satisfies `0 < n <= step`). It will
        almost always equal `step`, but at the boundary there may be fewer than
        `step` elements left, in which case `n` is smaller.

        E.g., if `step == 4` and `N == 2`, `op` could be

          [](int n, scalar1_t &u1, scalar1_t &u2, scalar1_t &u3, scalar1_t &u4,
                    scalar2_t &v1, scalar2_t &v2, scalar2_t &v3, scalar2_t &v4) {
            // Only process u1, ..., un and v1, ..., vn.
            // So if `n == 3`, `u4` and `v4` need not be considered.
          }

      In both cases, the references can actually be const, but at least one of
      them should be non-const in order to write the output.
    - (Optional, but recommended) N TensorArgType args that specify for each
      tensor whether `op` reads AND writes (i.e., TensorArgType::ReadWrite),
      or only reads (i.e., TensorArgType::ReadOnly).
      Default is TensorArgType::ReadWrite for the first tensor, and
                 TensorArgType::ReadOnly for the rest.

  E.g.,

  to compute a = b^2 for a and b of the same dtype, we can call

  CUDA_tensor_apply2<scalar, scalar>(
    a, b,
    [] __device__ (scalar &a_val, const scalar &b_val) { a_val = b_val * b_val; }
  );

  to work on 2 values at the same time, we can call

  CUDA_tensor_apply2<scalar1, scalar2, 2>(
    a, b,
    [] __device__ (int n, scalar1 &a_val1, scalar1 &a_val2,
                          const scalar2 &b_val1, const scalar2 &b_val2) {
      // call special vectorized op here, or just do elementwise and enjoy unrolling...
      // if n == 1, only process a_val1 and b_val1
    }
  );
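
  and, as a purely illustrative sketch, to mark both tensors as read-write
  (required whenever `op` writes to both of them), the optional TensorArgType
  args can be passed explicitly:

  CUDA_tensor_apply2<scalar, scalar>(
    a, b,
    [] __device__ (scalar &a_val, scalar &b_val) {
      scalar tmp = a_val;  // swap the two values element-wise
      a_val = b_val;
      b_val = tmp;
    },
    TensorArgType::ReadWrite, TensorArgType::ReadWrite
  );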
*/

namespace at::cuda {

// TODO: combine with TensorArg?  So far that's been for debugging, and this is functional...
enum class TensorArgType { ReadWrite, ReadOnly };

namespace {

// Rearrange dimensions for pointwise operations so that strides are in
// decreasing order as much as possible, so that kernels have better memory
// access patterns.
//
// For example, consider a binary operation on two "transposed" 2-dim tensors:
//    sizes:          256 512
//    aInfo->strides:   1 256
//    bInfo->strides:   1 256
//
// Given this, each concurrent memory access inside kernelPointwiseApply2() is
// exactly 256 elements apart, resulting in poor performance.
//
// This function exchanges dimensions so that memory access is contiguous:
//    sizes:          512 256
//    aInfo->strides: 256   1
//    bInfo->strides: 256   1
//
// (Actually, it becomes even better because now collapseDims() can turn each
// input into one contiguous array.)
//
// In general, given M (<=4) TensorInfo's with N dimensions, we can view each
// strides[i] (0 <= i < N) as an M-tuple.  Given each pair i < j, we exchange
// strides[i] and strides[j] if
//    (1) strides[i][k] < strides[j][k] for some k (0 <= k < M)
//        (exchanging them will benefit input #k), and
//    (2) strides[i][k] <= strides[j][k] for all k
//        (exchanging them will not make any input worse).
template <typename T1, typename IndexType,
          typename T2 = void, typename T3 = void, typename T4 = void>
inline void rearrangeDims(detail::TensorInfo<T1, IndexType>* aInfo,
                          detail::TensorInfo<T2, IndexType>* bInfo = nullptr,
                          detail::TensorInfo<T3, IndexType>* cInfo = nullptr,
                          detail::TensorInfo<T4, IndexType>* dInfo = nullptr) {
  int numInfos = 1;
  int dims = aInfo->dims;
  IndexType *sizes[4] = { aInfo->sizes, };
  IndexType *strides[4] = { aInfo->strides, };

  if (bInfo != nullptr) {
    ++numInfos;
    if (bInfo->dims != dims) return;
    sizes[1] = bInfo->sizes;
    strides[1] = bInfo->strides;
  }

  if (cInfo != nullptr) {
    ++numInfos;
    if (cInfo->dims != dims) return;
    sizes[2] = cInfo->sizes;
    strides[2] = cInfo->strides;
  }

  if (dInfo != nullptr) {
    ++numInfos;
    if (dInfo->dims != dims) return;
    sizes[3] = dInfo->sizes;
    strides[3] = dInfo->strides;
  }

  // Bail out if sizes do not match: we are relying on the "deprecated pointwise
  // behavior" among tensors of different shapes but the same number of elements.
  for (int i = 1; i < numInfos; ++i) {
    for (int j = 0; j < dims; ++j) {
      if (sizes[i][j] != sizes[0][j]) return;
    }
  }

  for (int i = 0; i < dims - 1; ++i) {
    // No need to consider dimensions of size 1.
    if (sizes[0][i] == 1) continue;

    for (int j = i + 1; j < dims; ++j) {
      if (sizes[0][j] == 1) continue;

      // Compare the relative sizes of strides between dim #i and dim #j.
      bool hasIncreasingStrides = false;
      bool hasDecreasingStrides = false;

      for (int k = 0; k < numInfos; k++) {
        IndexType stride_i = strides[k][i];
        IndexType stride_j = strides[k][j];
        if (stride_i < stride_j) {
          hasIncreasingStrides = true;
        } else if (stride_i > stride_j) {
          hasDecreasingStrides = true;
        }
      }

      if (hasIncreasingStrides && !hasDecreasingStrides) {
        for (int k = 0; k < numInfos; k++) {
          IndexType size = sizes[k][i];
          sizes[k][i] = sizes[k][j];
          sizes[k][j] = size;

          IndexType stride = strides[k][i];
          strides[k][i] = strides[k][j];
          strides[k][j] = stride;
        }
      }
    }
  }
}

// The `remaining_steps` argument is used to support Op that operates on
// multiple elements at the same time. Generally, the strategy of ApplyOpN is to
//  1. Initialize `remaining_steps = step`, where `step` is the template arg of
//     CUDA_tensor_applyN helpers. The input arg `n` to `apply()` represents the
//     number of elements in bound for this call. It will almost always equal
//     `step` except at boundaries.
//  2. If `remaining_steps > 0` convert the current linearIndex to offset (if in
//     bound), and recursively call `ApplyOpN` with `remaining_steps - 1`.
//  3. At `remaining_steps = 0`,
//       if `step = 1`, call `op(tensor1_val, tensor2_val, ...)`;
//       if `step > 1`, call `op(n, tensor1_val1, tensor1_val2, ..., tensor1_valstep,
//                                  tensor2_val1, tensor2_val2, ..., tensor2_valstep,
//                                       ...
//                                  tensorN_val1, tensorN_val2, ..., tensorN_valstep);`
//
// See NOTE [ CUDA_tensor_applyN helpers ] above for what Op may look like.
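//
// As a rough sketch of the expansion (assuming `step == 2` and a single
// tensor), the recursion unrolls approximately as:
//
//   ApplyOp1<Op, scalar, IndexType, ADims, 2>::apply(a, op, n, i)
//     -> ApplyOp1<..., 1>::apply(a, op, n, i + 1, offset_i)
//     -> ApplyOp1<..., 0>::apply(a, op, n, i + 2, offset_i, offset_{i+1})
//     -> op(n, a.data[offset_i], a.data[offset_{i+1}])
//
// Each level converts one linear index to an offset (or 0 if it is beyond the
// first `n` valid indices) and appends it to the parameter pack, until the
// base case finally invokes `op` on all gathered values.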

template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          int remaining_steps,
          typename... Offsets>
struct ApplyOp1 {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op, int n,
                  IndexType linearIndex, Offsets... aOffsets) {
  // Convert `linearIndex` into an offset of `a`
  const IndexType aOffset = sizeof...(Offsets) < n ?
    detail::IndexToOffset<scalar, IndexType, ADims>::get(linearIndex, a) : 0;

  ApplyOp1<Op, scalar, IndexType, ADims, remaining_steps - 1, const IndexType, Offsets...>::apply(
    a, op, n, linearIndex + 1, aOffsets..., aOffset
  );
}
};

// Specialize the `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`).
// We don't need to pass in how many elements need to be processed in this case.
template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          typename Offset>
struct ApplyOp1<Op, scalar, IndexType, ADims, 0, Offset> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op,
                  int n, IndexType linearIndex, Offset offset) {
  op(a.data[offset]);
}
};

template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          typename... Offsets>
struct ApplyOp1<Op, scalar, IndexType, ADims, 0, Offsets...> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op, int n,
                 IndexType linearIndex, Offsets... offsets) {
  op(n, a.data[offsets]...);
}
};

template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          int step>
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void kernelPointwiseApply1(detail::TensorInfo<scalar, IndexType> a,
                                      IndexType totalElements, const Op op) {
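  // Grid-stride loop: each thread starts at its own chunk of `step` elements
  // and advances by the total number of threads times `step`, so any grid
  // size covers all `totalElements`.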
  for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x * step) {
    ApplyOp1<Op, scalar, IndexType, ADims, step>::apply(
      a, op, ::min(step, static_cast<int>(totalElements - linearIndex)), linearIndex);
  }
}


template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims,
          int BDims,
          int remaining_steps,
          typename... Offsets>
struct ApplyOp2 {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar1, IndexType> &a,
                  detail::TensorInfo<scalar2, IndexType> &b,
                  const Op &op, int64_t n, IndexType linearIndex,
                  Offsets... aOffsets, Offsets... bOffsets) {
  // Convert `linearIndex` into an offset of `a`
  const IndexType aOffset = static_cast<int64_t>(sizeof...(Offsets)) < n ?
    detail::IndexToOffset<scalar1, IndexType, ADims>::get(linearIndex, a) : 0;

  // Convert `linearIndex` into an offset of `b`
  const IndexType bOffset = static_cast<int64_t>(sizeof...(Offsets)) < n ?
    detail::IndexToOffset<scalar2, IndexType, BDims>::get(linearIndex, b) : 0;

  ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, remaining_steps - 1, const IndexType, Offsets...>::apply(
    a, b, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset
  );
}
};

// Specialize the `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`).
// We don't need to pass in how many elements need to be processed in this case.
template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims,
          int BDims,
          typename Offset>
struct ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, 0, Offset> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar1, IndexType> &a,
                  detail::TensorInfo<scalar2, IndexType> &b,
                  const Op &op, int /*n*/, IndexType /*linearIndex*/,
                  Offset aOffset, Offset bOffset) {
  op(a.data[aOffset], b.data[bOffset]);
}
};

template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims,
          int BDims,
          typename... Offsets>
struct ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, 0, Offsets...> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar1, IndexType> &a,
                  detail::TensorInfo<scalar2, IndexType> &b,
                  const Op &op, int n, IndexType linearIndex,
                  Offsets... aOffsets, Offsets... bOffsets) {
  op(n, a.data[aOffsets]..., b.data[bOffsets]...);
}
};

template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims, int BDims,
          int step,
          int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
          int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm)
#endif
__global__ void
kernelPointwiseApply2(detail::TensorInfo<scalar1, IndexType> a,
                      detail::TensorInfo<scalar2, IndexType> b,
                      IndexType totalElements,
                      const Op op) {
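  // Same grid-stride loop as in kernelPointwiseApply1, processing up to
  // `step` elements of both tensors per thread per iteration.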
  for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x * step) {
    ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, step>::apply(
      a, b, op, ::min(step, static_cast<int>(totalElements - linearIndex)),
      linearIndex);
  }
}

} // anonymous namespace

template <typename scalar1, typename scalar2, int step, typename Op,
          int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
          int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(at::TensorBase a,
                               at::TensorBase b,
                               const Op op,
                               TensorArgType aType = TensorArgType::ReadWrite,
                               TensorArgType bType = TensorArgType::ReadOnly) {
  TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(),
              "CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got "
              "tensors with type ", a.device().type(), " and ", b.device().type());
  int64_t totalElements = a.numel();

  if (totalElements != b.numel()) {
    return false;
  }

  if (a.dim() > MAX_TENSORINFO_DIMS ||
      b.dim() > MAX_TENSORINFO_DIMS) {
    return false;
  }

  if (a.numel() == 0) {
    // Empty tensor; do nothing
    return true;
  }
  const dim3 block = getApplyBlock(max_threads_per_block);

  dim3 grid;
  auto curDevice = current_device();
  if (curDevice == -1) return false;
  if (!getApplyGrid<step>(totalElements, grid, curDevice, max_threads_per_block)) {
    return false;
  }

  /*
  Expands readable/writable tensors whose indices may be "overlapped."
  This ensures that each element of the tensor is operated on once and only
  once.
  */
  TensorBase oldA;
  TensorBase oldB;

  if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) {
    // Must perform in contiguous space
    oldA = std::exchange(a, a.contiguous());
  }
  if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) {
    // Must perform in contiguous space
    oldB = std::exchange(b, b.contiguous());
  }

  // It is possible that the tensor dimensions can be collapsed,
  // and thus we can reduce the actual code complexity of the copy by
  // exploiting this knowledge statically, since the div/mod is the
  // most expensive part of the operation, more so than memory accesses.
  // For instance, when copying a non-contiguous to a contiguous tensor
  // (or vice versa), the contiguous tensor can be collapsed to one
  // dimension, and the loop to translate the linear index to the array
  // index can be similarly collapsed. That is what this unrolling is for.
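  //
  // For example, when both tensors collapse to a single contiguous dimension,
  // aInfo.dims and bInfo.dims are both 1, and HANDLE_A_CASE below instantiates
  // kernelPointwiseApply2 with statically known ADims == 1 and BDims == 1;
  // dims of 2 get their own specialization, and anything larger falls back to
  // the generic -1 (dynamic dims) case.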

#define HANDLE_CASE(TYPE, A, B)                                        \
  kernelPointwiseApply2<Op,                                            \
                        scalar1,                                       \
                        scalar2,                                       \
                        TYPE, A, B, step,                              \
                        max_threads_per_block,                         \
                        min_blocks_per_sm>                             \
   <<<grid, block, 0, at::cuda::getCurrentCUDAStream(curDevice)>>>(    \
       aInfo, bInfo, static_cast<TYPE>(totalElements), op);            \
  C10_CUDA_KERNEL_LAUNCH_CHECK();

#define HANDLE_B_CASE(TYPE, A, B) {         \
  switch (B) {                              \
    case 1:                                 \
      HANDLE_CASE(TYPE, A, 1);              \
      break;                                \
    case 2:                                 \
      HANDLE_CASE(TYPE, A, 2);              \
      break;                                \
    default:                                \
      HANDLE_CASE(TYPE, A, -1);             \
      break;                                \
  }                                         \
}

#define HANDLE_A_CASE(TYPE, A, B) {         \
  switch (A) {                              \
    case 1:                                 \
      HANDLE_B_CASE(TYPE, 1, B);            \
      break;                                \
    case 2:                                 \
      HANDLE_B_CASE(TYPE, 2, B);            \
      break;                                \
    default:                                \
      HANDLE_B_CASE(TYPE, -1, B);           \
      break;                                \
  }                                         \
}

  if (detail::canUse32BitIndexMath(a) &&
      detail::canUse32BitIndexMath(b)) {
    detail::TensorInfo<scalar1, unsigned int> aInfo =
      detail::getTensorInfo<scalar1, unsigned int>(a);

    detail::TensorInfo<scalar2, unsigned int> bInfo =
      detail::getTensorInfo<scalar2, unsigned int>(b);
    rearrangeDims(&aInfo, &bInfo);
    aInfo.collapseDims();
    bInfo.collapseDims();

    HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims);
  } else {
    detail::TensorInfo<scalar1, uint64_t> aInfo =
      detail::getTensorInfo<scalar1, uint64_t>(a);

    detail::TensorInfo<scalar2, uint64_t> bInfo =
      detail::getTensorInfo<scalar2, uint64_t>(b);
    rearrangeDims(&aInfo, &bInfo);
    aInfo.collapseDims();
    bInfo.collapseDims();

    /*
    Only instantiate the all-1D special case and the fallback all-nD case for
    large (64-bit indexed) tensors, to reduce compilation time.
    */
    if (aInfo.dims == 1 && bInfo.dims == 1) {
      HANDLE_CASE(uint64_t, 1, 1);
    } else {
      HANDLE_CASE(uint64_t, -1, -1);
    }
  }
#undef HANDLE_CASE
#undef HANDLE_B_CASE
#undef HANDLE_A_CASE

  if (oldA.defined()) {
    at::native::copy_ignoring_overlaps(oldA, a);
  }

  if (oldB.defined()) {
    at::native::copy_ignoring_overlaps(oldB, b);
  }

  return true;
}

/* Provides default step = 1 to CUDA_tensor_apply2. */
template <typename scalar1, typename scalar2, typename Op,
          int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
          int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(const at::TensorBase &a,
                               const at::TensorBase &b,
                               const Op op,
                               TensorArgType aType = TensorArgType::ReadWrite,
                               TensorArgType bType = TensorArgType::ReadOnly) {
  return CUDA_tensor_apply2<scalar1, scalar2, 1, Op,
                            max_threads_per_block, min_blocks_per_sm>(a, b, op, aType, bType);
}

} // namespace at::cuda