// Copyright (C) 2015 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. #ifndef DLIB_GPU_DaTA_CPP_ #define DLIB_GPU_DaTA_CPP_ // Only things that require CUDA are declared in this cpp file. Everything else is in the // gpu_data.h header so that it can operate as "header-only" code when using just the CPU. #ifdef DLIB_USE_CUDA #include "gpu_data.h" #include <iostream> #include "cuda_utils.h" #include <cstring> #include <cuda.h> namespace dlib { // ---------------------------------------------------------------------------------------- void memcpy ( gpu_data& dest, const gpu_data& src ) { DLIB_CASSERT(dest.size() == src.size()); if (src.size() == 0 || &dest == &src) return; memcpy(dest,0, src, 0, src.size()); } void memcpy ( gpu_data& dest, size_t dest_offset, const gpu_data& src, size_t src_offset, size_t num ) { DLIB_CASSERT(dest_offset + num <= dest.size()); DLIB_CASSERT(src_offset + num <= src.size()); if (num == 0) return; // if there is aliasing if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num) { // if they perfectly alias each other then there is nothing to do if (dest_offset == src_offset) return; else std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num); } else { // if we write to the entire thing then we can use device_write_only() if (dest_offset == 0 && num == dest.size()) { // copy the memory efficiently based on which copy is current in each object. if (src.device_ready()) CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice)); else CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice)); } else { // copy the memory efficiently based on which copy is current in each object. if (dest.device_ready() && src.device_ready()) CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice)); else if (!dest.device_ready() && src.device_ready()) CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost)); else if (dest.device_ready() && !src.device_ready()) CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice)); else CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost)); } } } // ---------------------------------------------------------------------------------------- void synchronize_stream(cudaStream_t stream) { #if !defined CUDA_VERSION #error CUDA_VERSION not defined #elif CUDA_VERSION >= 9020 && CUDA_VERSION < 11000 // We will stop using this alternative version with cuda V11, hopefully the bug in // cudaStreamSynchronize is fixed by then. // // This should be pretty much the same as cudaStreamSynchronize, which for some // reason makes training freeze in some cases. // (see https://github.com/davisking/dlib/issues/1513) while (true) { cudaError_t err = cudaStreamQuery(stream); switch (err) { case cudaSuccess: return; // now we are synchronized case cudaErrorNotReady: break; // continue waiting default: CHECK_CUDA(err); // unexpected error: throw } } #else // CUDA_VERSION CHECK_CUDA(cudaStreamSynchronize(stream)); #endif // CUDA_VERSION } void gpu_data:: wait_for_transfer_to_finish() const { if (have_active_transfer) { synchronize_stream((cudaStream_t)cuda_stream.get()); have_active_transfer = false; // Check for errors. These calls to cudaGetLastError() are what help us find // out if our kernel launches have been failing. CHECK_CUDA(cudaGetLastError()); } } void gpu_data:: copy_to_device() const { // We want transfers to the device to always be concurrent with any device // computation. So we use our non-default stream to do the transfer. async_copy_to_device(); wait_for_transfer_to_finish(); } void gpu_data:: copy_to_host() const { if (!host_current) { wait_for_transfer_to_finish(); CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost)); host_current = true; // At this point we know our RAM block isn't in use because cudaMemcpy() // implicitly syncs with the device. device_in_use = false; // Check for errors. These calls to cudaGetLastError() are what help us find // out if our kernel launches have been failing. CHECK_CUDA(cudaGetLastError()); } } void gpu_data:: async_copy_to_device() const { if (!device_current) { if (device_in_use) { // Wait for any possible CUDA kernels that might be using our memory block to // complete before we overwrite the memory. synchronize_stream(0); device_in_use = false; } CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get())); have_active_transfer = true; device_current = true; } } void gpu_data:: set_size( size_t new_size ) { if (new_size == 0) { if (device_in_use) { // Wait for any possible CUDA kernels that might be using our memory block to // complete before we free the memory. synchronize_stream(0); device_in_use = false; } wait_for_transfer_to_finish(); data_size = 0; host_current = true; device_current = true; device_in_use = false; data_host.reset(); data_device.reset(); } else if (new_size != data_size) { if (device_in_use) { // Wait for any possible CUDA kernels that might be using our memory block to // complete before we free the memory. synchronize_stream(0); device_in_use = false; } wait_for_transfer_to_finish(); data_size = new_size; host_current = true; device_current = true; device_in_use = false; try { CHECK_CUDA(cudaGetDevice(&the_device_id)); // free memory blocks before we allocate new ones. data_host.reset(); data_device.reset(); void* data; CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float))); // Note that we don't throw exceptions since the free calls are invariably // called in destructors. They also shouldn't fail anyway unless someone // is resetting the GPU card in the middle of their program. data_host.reset((float*)data, [](float* ptr){ auto err = cudaFreeHost(ptr); if(err!=cudaSuccess) std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl; }); CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float))); data_device.reset((float*)data, [](float* ptr){ auto err = cudaFree(ptr); if(err!=cudaSuccess) std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl; }); if (!cuda_stream) { cudaStream_t cstream; CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking)); cuda_stream.reset(cstream, [](void* ptr){ auto err = cudaStreamDestroy((cudaStream_t)ptr); if(err!=cudaSuccess) std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl; }); } } catch(...) { set_size(0); throw; } } } // ---------------------------------------------------------------------------------------- } #endif // DLIB_USE_CUDA #endif // DLIB_GPU_DaTA_CPP_