#include <torch/library.h>

#include "registration.h"
#include "torch_binding.h"

// Operator table for this extension: each entry declares an operator schema
// with `ops.def` and then attaches its C++ implementation with `ops.impl`.
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Symmetric w8a8 GEMM backed by CUTLASS. Quantization may be per-tensor or
  // per-row/column, and a bias may optionally be supplied (`Tensor? bias`).
  // `out` is annotated `Tensor!` (mutable) and the op itself returns nothing.
  // The implementation is registered for the CUDA dispatch key only.
  ops.def(
      "cutlass_scaled_mm(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
      "                  Tensor b_scales, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);

  // Asymmetric w8a8 GEMM backed by CUTLASS, again with per-tensor or
  // per-row/column quantization. Compared with the symmetric op it takes an
  // additional required `azp_adj` tensor and an optional `azp` tensor; `bias`
  // stays optional. Registered for the CUDA dispatch key only.
  ops.def(
      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
      "                  Tensor b_scales, Tensor azp_adj,"
      "                  Tensor? azp, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp);

  // Capability query: reports whether the CUTLASS scaled_mm path is supported
  // for CUDA devices of the given capability. Its only argument is an int, and
  // unlike the two GEMM ops it is registered without a dispatch key.
  ops.def(
      "cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
  ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
}

// Invokes the project's REGISTER_EXTENSION macro for this extension's name.
// NOTE(review): the macro is not defined in this file (presumably it comes
// from "registration.h") and is expected to emit the module entry point that
// makes the extension loadable — confirm against its definition.
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)