#pragma once

#include <cuda_runtime_api.h>

#include <ATen/cuda/ATenCUDAGeneral.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/Exception.h>

#include <cstdint>
#include <utility>

namespace at { namespace cuda {

/*
 * CUDAEvents are movable, not copyable, wrappers around CUDA's events.
 *
 * A CUDAEvent is constructed lazily when first recorded, unless it is
 * reconstructed from a cudaIpcEventHandle_t. The event has a device, and this
 * device is acquired from the first recording stream. However, if reconstructed
 * from a handle, the device should be explicitly specified; or if ipc_handle()
 * is called before the event is ever recorded, it will use the current device.
 * Later streams that record the event must match this device.
 */
struct TORCH_CUDA_CPP_API CUDAEvent {
  // Constructors
  // Default value for `flags` is specified below - it's cudaEventDisableTiming
  CUDAEvent() {}
  CUDAEvent(unsigned int flags) : flags_{flags} {}

  CUDAEvent(
      DeviceIndex device_index, const cudaIpcEventHandle_t* handle) {
    device_index_ = device_index;
    CUDAGuard guard(device_index_);

    AT_CUDA_CHECK(cudaIpcOpenEventHandle(&event_, *handle));
    is_created_ = true;
  }

  // Note: event destruction done on creating device to avoid creating a
  // CUDA context on other devices.
  ~CUDAEvent() {
    try {
      if (is_created_) {
        CUDAGuard guard(device_index_);
        const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
        if (C10_UNLIKELY(interp)) {
          (*interp)->trace_gpu_event_deletion(reinterpret_cast<uintptr_t>(event_));
        }
        cudaEventDestroy(event_);
      }
    } catch (...) { /* No throw */ }
  }

  CUDAEvent(const CUDAEvent&) = delete;
  CUDAEvent& operator=(const CUDAEvent&) = delete;

  CUDAEvent(CUDAEvent&& other) { moveHelper(std::move(other)); }
  CUDAEvent& operator=(CUDAEvent&& other) {
    moveHelper(std::move(other));
    return *this;
  }

  operator cudaEvent_t() const { return event(); }

  // Less than operator (to allow use in sets)
  friend bool operator<(const CUDAEvent& left, const CUDAEvent& right) {
    return left.event_ < right.event_;
  }

  optional<at::Device> device() const {
    if (is_created_) {
      return at::Device(at::kCUDA, device_index_);
    } else {
      return {};
    }
  }

  bool isCreated() const { return is_created_; }
  DeviceIndex device_index() const { return device_index_; }
  cudaEvent_t event() const { return event_; }

  // Note: cudaEventQuery can be safely called from any device
  bool query() const {
    if (!is_created_) {
      return true;
    }

    cudaError_t err = cudaEventQuery(event_);
    if (err == cudaSuccess) {
      return true;
    } else if (err != cudaErrorNotReady) {
      C10_CUDA_CHECK(err);
    } else {
      // ignore and clear the error if not ready
      cudaGetLastError();
    }

    return false;
  }

  void record() { record(getCurrentCUDAStream()); }

  void recordOnce(const CUDAStream& stream) {
    if (!was_recorded_) record(stream);
  }

  // Note: cudaEventRecord must be called on the same device as the event.
  void record(const CUDAStream& stream) {
    if (!is_created_) {
      createEvent(stream.device_index());
    }

    TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_,
        " does not match recording stream's device ", stream.device_index(), ".");
    CUDAGuard guard(device_index_);
    AT_CUDA_CHECK(cudaEventRecord(event_, stream));
    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
    if (C10_UNLIKELY(interp)) {
      (*interp)->trace_gpu_event_record(
          reinterpret_cast<uintptr_t>(event_),
          reinterpret_cast<uintptr_t>(stream.stream())
      );
    }
    was_recorded_ = true;
  }

  // Note: cudaStreamWaitEvent must be called on the same device as the stream.
  // The event has no actual GPU resources associated with it.
  void block(const CUDAStream& stream) {
    if (is_created_) {
      CUDAGuard guard(stream.device_index());
      AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0));
      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
      if (C10_UNLIKELY(interp)) {
        (*interp)->trace_gpu_event_wait(
            reinterpret_cast<uintptr_t>(event_),
            reinterpret_cast<uintptr_t>(stream.stream())
        );
      }
    }
  }

  // Note: cudaEventElapsedTime can be safely called from any device
  float elapsed_time(const CUDAEvent& other) const {
    TORCH_CHECK(is_created_ && other.isCreated(),
        "Both events must be recorded before calculating elapsed time.");
    float time_ms = 0;
    // raise cudaErrorNotReady if either event is recorded but not yet completed
    AT_CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_, other.event_));
    return time_ms;
  }

  // Note: cudaEventSynchronize can be safely called from any device
  void synchronize() const {
    if (is_created_) {
      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
      if (C10_UNLIKELY(interp)) {
        (*interp)->trace_gpu_event_synchronization(reinterpret_cast<uintptr_t>(event_));
      }
      AT_CUDA_CHECK(cudaEventSynchronize(event_));
    }
  }

  // Note: cudaIpcGetEventHandle must be called on the same device as the event
  void ipc_handle(cudaIpcEventHandle_t * handle) {
    if (!is_created_) {
      // this CUDAEvent object was initially constructed from flags but event_
      // is not created yet.
      createEvent(getCurrentCUDAStream().device_index());
    }
    CUDAGuard guard(device_index_);
    AT_CUDA_CHECK(cudaIpcGetEventHandle(handle, event_));
  }

private:
  unsigned int flags_ = cudaEventDisableTiming;
  bool is_created_ = false;
  bool was_recorded_ = false;
  DeviceIndex device_index_ = -1;
  cudaEvent_t event_{};

  void createEvent(DeviceIndex device_index) {
    device_index_ = device_index;
    CUDAGuard guard(device_index_);
    AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
    if (C10_UNLIKELY(interp)) {
      (*interp)->trace_gpu_event_creation(reinterpret_cast<uintptr_t>(event_));
    }
    is_created_ = true;
  }

  void moveHelper(CUDAEvent&& other) {
    std::swap(flags_, other.flags_);
    std::swap(is_created_, other.is_created_);
    std::swap(was_recorded_, other.was_recorded_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
  }
};

} // namespace cuda
} // namespace at
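// Usage sketch (illustrative only, not part of this header): record the event
// on a producer stream, make a consumer stream wait on it, and optionally time
// a region. The stream setup below is a hypothetical example; note that
// elapsed_time() requires events constructed without cudaEventDisableTiming
// (e.g. with cudaEventDefault), since the default flag disables timing.
//
//   at::cuda::CUDAEvent start{cudaEventDefault};
//   at::cuda::CUDAEvent stop{cudaEventDefault};
//   auto producer = at::cuda::getStreamFromPool();
//   auto consumer = at::cuda::getStreamFromPool();
//
//   start.record(producer);        // lazily creates the event on producer's device
//   // ... enqueue kernels on `producer` ...
//   stop.record(producer);
//   stop.block(consumer);          // `consumer` waits on the GPU; the host is not blocked
//   stop.synchronize();            // host-side wait until the event completes
//   float ms = start.elapsed_time(stop);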