|
|
|
|
|
#ifndef DLIB_GPU_DaTA_CPP_
#define DLIB_GPU_DaTA_CPP_

// Only things that require CUDA are defined in this .cpp file; the rest of gpu_data is
// declared in gpu_data.h.

#ifdef DLIB_USE_CUDA

#include "gpu_data.h"
#include <iostream>
#include "cuda_utils.h"
#include <cstring>
#include <algorithm>
#include <cuda.h>
|
|
|
namespace dlib
{
|
|
|
|
|
|
|
    void memcpy (
        gpu_data& dest,
        const gpu_data& src
    )
    {
        DLIB_CASSERT(dest.size() == src.size());
        if (src.size() == 0 || &dest == &src)
            return;

        memcpy(dest, 0, src, 0, src.size());
    }
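
    // Usage sketch for the overload above (the variable names a and b are for
    // illustration only):
    //
    //     dlib::gpu_data a, b;
    //     a.set_size(128);
    //     b.set_size(128);
    //     dlib::memcpy(b, a);   // copies all 128 floats from a into b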
|
|
|
    void memcpy (
        gpu_data& dest,
        size_t dest_offset,
        const gpu_data& src,
        size_t src_offset,
        size_t num
    )
    {
        DLIB_CASSERT(dest_offset + num <= dest.size());
        DLIB_CASSERT(src_offset + num <= src.size());
        if (num == 0)
            return;

        // Check whether the source and destination ranges overlap within the same buffer.
        if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset, src_offset) + num)
        {
            // If the two ranges are identical there is nothing to do since the data isn't
            // moving anywhere.  Otherwise do the overlapping copy on the host.
            if (dest_offset == src_offset)
                return;
            else
                std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
        }
        else
        {
            // If we write to the entire destination then we can use device_write_only(),
            // which avoids first transferring dest's current contents to the device.
            if (dest_offset == 0 && num == dest.size())
            {
                // Copy from whichever side of src currently holds valid data.
                if (src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
                else
                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
            }
            else
            {
                // Copy the memory in the most efficient direction based on which copy is
                // current in each object.
                if (dest.device_ready() && src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
                else if (!dest.device_ready() && src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost));
                else if (dest.device_ready() && !src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
                else
                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost));
            }
        }
    }
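
    // Usage sketch for the offset overload above (the names src and dst are for
    // illustration only):
    //
    //     dlib::gpu_data src, dst;
    //     src.set_size(100);
    //     dst.set_size(50);
    //     dlib::memcpy(dst, 0, src, 25, 50);   // copy src[25..74] into dst[0..49]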
|
|
|
|
|
    void synchronize_stream(cudaStream_t stream)
    {
#if !defined CUDA_VERSION
#error CUDA_VERSION not defined
#elif CUDA_VERSION >= 9020 && CUDA_VERSION < 11000
        // For these CUDA releases, synchronize by polling cudaStreamQuery() rather than
        // calling cudaStreamSynchronize(), which has been observed to hang on some setups.
        while (true)
        {
            cudaError_t err = cudaStreamQuery(stream);
            switch (err)
            {
                case cudaSuccess: return;       // the stream has finished all queued work
                case cudaErrorNotReady: break;  // keep polling
                default: CHECK_CUDA(err);       // any other error gets reported
            }
        }
#else // CUDA_VERSION < 9020 or CUDA_VERSION >= 11000
        CHECK_CUDA(cudaStreamSynchronize(stream));
#endif
    }
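
    // Note: the gpu_data methods below call synchronize_stream(0), i.e. they wait on the
    // default CUDA stream.  This is how they make sure any kernels that might still be
    // reading or writing a buffer have finished before that buffer is overwritten or freed.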
|
|
|
    void gpu_data::
    wait_for_transfer_to_finish() const
    {
        if (have_active_transfer)
        {
            synchronize_stream((cudaStream_t)cuda_stream.get());
            have_active_transfer = false;
            // Check for errors.  These calls to cudaGetLastError() are what help us find
            // out if our kernel launches have been failing.
            CHECK_CUDA(cudaGetLastError());
        }
    }
|
|
|
    void gpu_data::
    copy_to_device() const
    {
        // We want transfers to the device to be able to overlap with any device
        // computation, so the transfer itself runs on our non-default stream.
        async_copy_to_device();
        wait_for_transfer_to_finish();
    }
|
|
|
    void gpu_data::
    copy_to_host() const
    {
        if (!host_current)
        {
            wait_for_transfer_to_finish();
            CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
            host_current = true;
            // At this point the device can't still be using the buffer because
            // cudaMemcpy() implicitly synchronizes with the device.
            device_in_use = false;
            // Check for errors.  These calls to cudaGetLastError() are what help us find
            // out if our kernel launches have been failing.
            CHECK_CUDA(cudaGetLastError());
        }
    }
|
|
|
    void gpu_data::
    async_copy_to_device() const
    {
        if (!device_current)
        {
            if (device_in_use)
            {
                // Wait for any CUDA kernels that might be using our memory block to
                // complete before we overwrite it.
                synchronize_stream(0);
                device_in_use = false;
            }
            CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
            have_active_transfer = true;
            device_current = true;
        }
    }
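
    // Notes on the synchronization state used above (inferred from this file):
    //   host_current / device_current - which copy of the buffer is up to date.
    //   have_active_transfer          - an asynchronous host -> device copy is in flight
    //                                   on cuda_stream and must be waited on before the
    //                                   host buffer is reused or freed.
    //   device_in_use                 - the device may still be working with the buffer
    //                                   (this flag is set outside this file, presumably
    //                                   where kernels are launched), so we synchronize
    //                                   before overwriting or freeing it.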
|
|
|
    void gpu_data::
    set_size(
        size_t new_size
    )
    {
        if (new_size == 0)
        {
            if (device_in_use)
            {
                // Wait for any CUDA kernels that might be using our memory block to
                // complete before we free it.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = 0;
            host_current = true;
            device_current = true;
            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
        else if (new_size != data_size)
        {
            if (device_in_use)
            {
                // Wait for any CUDA kernels that might be using our memory block to
                // complete before we free it.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = new_size;
            host_current = true;
            device_current = true;
            device_in_use = false;
|
|
|
            try
            {
                CHECK_CUDA(cudaGetDevice(&the_device_id));

                // Free the old memory blocks before allocating new ones.
                data_host.reset();
                data_device.reset();

                void* data;
                CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
                // The deleters report failures to std::cerr rather than throwing because
                // they are invariably invoked from destructors.
                data_host.reset((float*)data, [](float* ptr){
                    auto err = cudaFreeHost(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
                data_device.reset((float*)data, [](float* ptr){
                    auto err = cudaFree(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                // Lazily create the non-default stream used for async host -> device copies.
                if (!cuda_stream)
                {
                    cudaStream_t cstream;
                    CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
                    cuda_stream.reset(cstream, [](void* ptr){
                        auto err = cudaStreamDestroy((cudaStream_t)ptr);
                        if(err!=cudaSuccess)
                            std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                    });
                }
            }
            catch(...)
            {
                // If any allocation failed, return the object to a well defined empty
                // state and rethrow.
                set_size(0);
                throw;
            }
        }
    }
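
    // Usage sketch for set_size() and the transfer routines above.  The accessors host()
    // and device() are assumed to be the ones declared in gpu_data.h; the variable name d
    // is for illustration only.
    //
    //     dlib::gpu_data d;
    //     d.set_size(256);          // allocates pinned host memory and device memory
    //     d.host()[0] = 3.14f;      // write on the CPU side (marks the host copy current)
    //     float* dev = d.device();  // copies host -> device if the device copy is stale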
|
|
|
|
|
}

#endif // DLIB_USE_CUDA

#endif // DLIB_GPU_DaTA_CPP_
|
|
|
|