#pragma once

#include <ATen/cuda/ATenCUDAGeneral.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/util/Exception.h>

#include <cuda_runtime_api.h>

#include <cstdint>
#include <utility>

namespace at { namespace cuda {

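// CUDAEvent is a movable, but not copyable, RAII-style wrapper around a
// cudaEvent_t. The underlying CUDA event is created lazily: by default it is
// allocated the first time the event is recorded (or when an IPC handle is
// requested), and it is destroyed on its creating device when the wrapper is
// destroyed. Unless different flags are passed in, events are created with
// cudaEventDisableTiming, so elapsed_time() requires constructing the events
// with timing-enabled flags (e.g. cudaEventDefault).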
struct TORCH_CUDA_CPP_API CUDAEvent {
  // Constructors
  // The default flags are cudaEventDisableTiming (see below).
  CUDAEvent() {}
  CUDAEvent(unsigned int flags) : flags_{flags} {}

  // Reconstructs an event on the given device from an IPC event handle.
  CUDAEvent(
      DeviceIndex device_index, const cudaIpcEventHandle_t* handle) {
    device_index_ = device_index;
    CUDAGuard guard(device_index_);

    AT_CUDA_CHECK(cudaIpcOpenEventHandle(&event_, *handle));
    is_created_ = true;
  }

  // Note: event destruction is done on the creating device to avoid creating
  // a CUDA context on other devices.
  ~CUDAEvent() {
    try {
      if (is_created_) {
        CUDAGuard guard(device_index_);
        const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
        if (C10_UNLIKELY(interp)) {
          (*interp)->trace_gpu_event_deletion(reinterpret_cast<uintptr_t>(event_));
        }
        cudaEventDestroy(event_);
      }
    } catch (...) { /* destructors must not throw */ }
  }

  CUDAEvent(const CUDAEvent&) = delete;
  CUDAEvent& operator=(const CUDAEvent&) = delete;

  CUDAEvent(CUDAEvent&& other) { moveHelper(std::move(other)); }
  CUDAEvent& operator=(CUDAEvent&& other) {
    moveHelper(std::move(other));
    return *this;
  }

  operator cudaEvent_t() const { return event(); }

  // Less-than comparison so events can be stored in ordered containers.
  friend bool operator<(const CUDAEvent& left, const CUDAEvent& right) {
    return left.event_ < right.event_;
  }

  optional<at::Device> device() const {
    if (is_created_) {
      return at::Device(at::kCUDA, device_index_);
    } else {
      return {};
    }
  }

  bool isCreated() const { return is_created_; }
  DeviceIndex device_index() const { return device_index_; }
  cudaEvent_t event() const { return event_; }

  // Note: cudaEventQuery can be safely called from any device.
  bool query() const {
    if (!is_created_) {
      return true;
    }

    cudaError_t err = cudaEventQuery(event_);
    if (err == cudaSuccess) {
      return true;
    } else if (err != cudaErrorNotReady) {
      C10_CUDA_CHECK(err);
    } else {
      // The event is not ready yet; ignore and clear the sticky error state.
      (void)cudaGetLastError();
    }

    return false;
  }

  // Records the event on the current stream.
  void record() { record(getCurrentCUDAStream()); }

  // Records the event only the first time this is called.
  void recordOnce(const CUDAStream& stream) {
    if (!was_recorded_) record(stream);
  }

  // Note: cudaEventRecord must be called on the same device as the event,
  // so the event is (lazily) created on the recording stream's device.
  void record(const CUDAStream& stream) {
    if (!is_created_) {
      createEvent(stream.device_index());
    }

    TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_,
        " does not match recording stream's device ", stream.device_index(), ".");
    CUDAGuard guard(device_index_);
    AT_CUDA_CHECK(cudaEventRecord(event_, stream));
    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
    if (C10_UNLIKELY(interp)) {
      (*interp)->trace_gpu_event_record(
          reinterpret_cast<uintptr_t>(event_),
          reinterpret_cast<uintptr_t>(stream.stream())
      );
    }
    was_recorded_ = true;
  }

  // Makes all future work submitted to the given stream wait for this event.
  // Note: cudaStreamWaitEvent must be called on the same device as the stream;
  // this is a no-op if the event has never been created (nothing was recorded).
  void block(const CUDAStream& stream) {
    if (is_created_) {
      CUDAGuard guard(stream.device_index());
      AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0));
      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
      if (C10_UNLIKELY(interp)) {
        (*interp)->trace_gpu_event_wait(
            reinterpret_cast<uintptr_t>(event_),
            reinterpret_cast<uintptr_t>(stream.stream())
        );
      }
    }
  }

  // Note: cudaEventElapsedTime can be safely called from any device.
  float elapsed_time(const CUDAEvent& other) const {
    TORCH_CHECK(is_created_ && other.isCreated(),
        "Both events must be recorded before calculating elapsed time.");
    float time_ms = 0;
    // Raises cudaErrorNotReady if either event is recorded but not yet completed.
    AT_CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_, other.event_));
    return time_ms;
  }

  // Note: cudaEventSynchronize can be safely called from any device.
  void synchronize() const {
    if (is_created_) {
      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
      if (C10_UNLIKELY(interp)) {
        (*interp)->trace_gpu_event_synchronization(reinterpret_cast<uintptr_t>(event_));
      }
      AT_CUDA_CHECK(cudaEventSynchronize(event_));
    }
  }

  // Note: cudaIpcGetEventHandle must be called on the same device as the event.
  void ipc_handle(cudaIpcEventHandle_t * handle) {
    if (!is_created_) {
      // This CUDAEvent was constructed from flags only; the underlying event_
      // has not been created yet, so create it on the current device.
      createEvent(getCurrentCUDAStream().device_index());
    }
    CUDAGuard guard(device_index_);
    AT_CUDA_CHECK(cudaIpcGetEventHandle(handle, event_));
  }
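
  // A rough sketch of the intended IPC flow (process boundaries and transport
  // of the handle are elided and up to the caller; the device index and flag
  // combination below are illustrative assumptions; CUDA requires
  // cudaEventInterprocess together with cudaEventDisableTiming for
  // IPC-shared events):
  //
  //   // Exporting process: create, record, and export the event.
  //   at::cuda::CUDAEvent event(cudaEventDisableTiming | cudaEventInterprocess);
  //   event.record(at::cuda::getCurrentCUDAStream());
  //   cudaIpcEventHandle_t handle;
  //   event.ipc_handle(&handle);  // send `handle` to the other process
  //
  //   // Importing process: reconstruct the event and wait on it.
  //   at::cuda::CUDAEvent imported(/*device_index=*/0, &handle);
  //   imported.block(at::cuda::getCurrentCUDAStream());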

private:
  unsigned int flags_ = cudaEventDisableTiming;
  bool is_created_ = false;
  bool was_recorded_ = false;
  DeviceIndex device_index_ = -1;
  cudaEvent_t event_{};

  void createEvent(DeviceIndex device_index) {
    device_index_ = device_index;
    CUDAGuard guard(device_index_);
    AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
    if (C10_UNLIKELY(interp)) {
      (*interp)->trace_gpu_event_creation(reinterpret_cast<uintptr_t>(event_));
    }
    is_created_ = true;
  }

  void moveHelper(CUDAEvent&& other) {
    std::swap(flags_, other.flags_);
    std::swap(is_created_, other.is_created_);
    std::swap(was_recorded_, other.was_recorded_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
  }
};
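
// A minimal usage sketch (not part of this header), assuming a built
// PyTorch/ATen environment; `launch_kernel_on` stands in for arbitrary work
// enqueued on a stream and is hypothetical. Timing requires events created
// without cudaEventDisableTiming, hence the explicit cudaEventDefault flag.
//
//   at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
//   at::cuda::CUDAEvent start(cudaEventDefault);
//   at::cuda::CUDAEvent stop(cudaEventDefault);
//
//   start.record(stream);
//   launch_kernel_on(stream);          // hypothetical work on `stream`
//   stop.record(stream);
//
//   stop.synchronize();                // wait for the recorded work to finish
//   float ms = start.elapsed_time(stop);
//
//   // Cross-stream ordering: make `other` wait for `stop` without blocking
//   // the host.
//   at::cuda::CUDAStream other = at::cuda::getStreamFromPool();
//   stop.block(other);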

} // namespace cuda
} // namespace at