#ifndef DLIB_DNN_CuDA_H_
#define DLIB_DNN_CuDA_H_

#include "tensor.h"
#include "cuda_data_ptr.h"   // for cuda_data_ptr and device_global_buffer(), used by the loss helpers below
#include "../geometry/rectangle.h"
#include "../dnn/misc.h"

namespace dlib
{
    namespace cuda
    {

        void set_device (
            int dev
        );

        int get_device (
        );

        int get_num_devices (
        );

        std::string get_device_name (
            int device
        );

        void set_current_device_blocking_sync(
        );

        bool can_access_peer (int device_id, int peer_device_id);
        bool can_access_peer (const tensor& device, const tensor& peer_device);

        void device_synchronize (int dev);
        void device_synchronize (const tensor& dev);
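
        /*!
            The functions above manage which CUDA device is current for the calling
            thread.  They are thin wrappers around the CUDA runtime (set_device()
            ultimately calls cudaSetDevice() and device_synchronize() calls
            cudaDeviceSynchronize(); set_current_device_blocking_sync() presumably
            switches the current device to blocking-synchronization scheduling).
            The tensor overloads are conveniences that forward tensor.device_id()
            to the int versions.
        !*/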

        class raii_set_device
        {
        public:
            raii_set_device() = delete;
            raii_set_device(const raii_set_device&) = delete;
            raii_set_device& operator=(const raii_set_device&) = delete;

            raii_set_device(int dev)
            {
                prev_dev = get_device();
                set_device(dev);
            }

            raii_set_device(const tensor& dev)
            {
                prev_dev = get_device();
                set_device(dev.device_id());
            }

            void operator() (int dev)
            {
                set_device(dev);
            }

            void operator() (const tensor& dev)
            {
                set_device(dev.device_id());
            }

            ~raii_set_device() noexcept(false)
            {
                set_device(prev_dev);
            }

        private:
            int prev_dev;
        };
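
        /*!
            RAII guard that switches the current CUDA device and restores the
            previous one on destruction.  The destructor is noexcept(false) because
            restoring the device can itself report a CUDA error.  A minimal usage
            sketch (device numbers are illustrative):

                {
                    raii_set_device guard(1);   // make device 1 current
                    // ... launch work on device 1 ...
                    guard(0);                   // switch again mid-scope if needed
                }   // previous device restored here
        !*/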

#ifdef DLIB_USE_CUDA

        class enable_peer_access
        {
        public:

            enable_peer_access() = delete;
            enable_peer_access(const enable_peer_access&) = delete;
            enable_peer_access& operator=(const enable_peer_access&) = delete;

            enable_peer_access(
                int device_id,
                int peer_device_id
            );

            enable_peer_access(
                const tensor& device,
                const tensor& peer_device
            ) : enable_peer_access(device.device_id(), peer_device.device_id())
            {}

            ~enable_peer_access() noexcept(false);

        private:

            bool call_disable;
            int device_id;
            int peer_device_id;
        };
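
        /*!
            RAII guard that enables CUDA peer-to-peer access from device_id to
            peer_device_id and disables it again on destruction (call_disable
            presumably records whether the constructor actually enabled anything,
            since access may already have been enabled).  Typical use would check
            can_access_peer() first:

                if (can_access_peer(0, 1))
                {
                    enable_peer_access guard(0, 1);
                    // ... device 0 may now access device 1's memory directly ...
                }
        !*/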

        void inverse_norms (
            resizable_tensor& invnorms,
            const tensor& data,
            const double eps
        );

        void dot_prods (
            resizable_tensor& out,
            const tensor& lhs,
            const tensor& rhs
        );

        void dot_prods (
            bool add_to,
            tensor& out,
            const tensor& lhs,
            const tensor& rhs
        );
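
        /*!
            Row-wise reductions, following dlib's tensor_tools conventions:
            inverse_norms() sets invnorms(i) == 1/sqrt(sum of squares of row i of
            data + eps), and dot_prods() sets out(i) == dot(row i of lhs, row i of
            rhs), where each tensor is viewed as a matrix with num_samples() rows.
            The add_to overload accumulates into out instead of overwriting it.
        !*/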

        void scale_columns (
            tensor& out,
            const tensor& m,
            const tensor& v
        );

        void scale_rows (
            tensor& out,
            const tensor& m,
            const tensor& v
        );

        void scale_rows2 (
            float beta,
            tensor& out,
            const tensor& m1,
            const tensor& m2,
            const tensor& v1,
            const tensor& v2
        );
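
        /*!
            Viewing each tensor as a matrix: scale_columns() performs
            out = m*diagm(mat(v)) (column j of m is scaled by v(j)) and scale_rows()
            performs out = diagm(mat(v))*m (row i of m is scaled by v(i)).
            scale_rows2() appears to fuse the update
                out = beta*out + scale_rows(m1 - scale_rows(m2,v1), v2)
            which is the form used by normalization-style gradient computations.
        !*/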

        void exp (
            tensor& dest,
            const tensor& src
        );

        void log (
            tensor& dest,
            const tensor& src
        );

        void log10 (
            tensor& dest,
            const tensor& src
        );
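
        /*!
            Element-wise math functions: dest = exp(src), log(src), or log10(src).
            dest must have the same number of elements as src.
        !*/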

        void set_tensor (
            tensor& t,
            float value
        );

        void scale_tensor (
            tensor& t,
            float value
        );
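
        /*!
            set_tensor() assigns value to every element of t, and scale_tensor()
            multiplies every element of t by value.
        !*/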

        void multiply (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        );

        void multiply_conv (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        );

        void multiply_zero_padded (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        );

        void scale_channels (
            bool add_to,
            tensor& dest,
            const tensor& src,
            const tensor& scales
        );

        void add (
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        );
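
        /*!
            Element-wise multiply/add kernels, as documented in dlib's tensor_tools:
            multiply() computes dest = src1*src2 element-wise (accumulating when
            add_to is true), with limited broadcasting or summing across the
            num_samples() dimension.  multiply_conv() is the variant where one
            argument holds a single scalar per channel, as used by convolutional
            bias/scale terms.  multiply_zero_padded() treats out-of-range elements
            of the smaller operand as zero.  scale_channels() computes
            dest(n,k,r,c) = src(n,k,r,c)*scales(n,k,1,1), and add() sums two
            tensors, broadcasting or zero-padding mismatched dimensions.
        !*/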

        void affine_transform(
            tensor& dest,
            const tensor& src,
            const float A,
            const float B
        );

        void affine_transform(
            tensor& dest,
            const tensor& src,
            const float A
        );

        void affine_transform(
            tensor& dest,
            const tensor& src1,
            const tensor& src2,
            const float A,
            const float B,
            const float C
        );

        void affine_transform(
            tensor& dest,
            const tensor& src1,
            const tensor& src2,
            const float A,
            const float B
        );

        void affine_transform(
            tensor& dest,
            const tensor& src1,
            const tensor& src2,
            const tensor& src3,
            const float A,
            const float B,
            const float C,
            const float D
        );

        void affine_transform_range(
            size_t begin,
            size_t end,
            tensor& dest,
            const tensor& src1,
            const tensor& src2,
            const tensor& src3,
            const float A,
            const float B,
            const float C
        );

        void affine_transform(
            const rectangle& rect,
            tensor& dest,
            const tensor& src1,
            const tensor& src2,
            const tensor& src3,
            float A,
            float B,
            float C
        );
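
        /*!
            Scalar affine transforms, following the tensor_tools conventions:
                - affine_transform(dest,src,A,B):                 dest = A*src + B
                - affine_transform(dest,src,A):                   dest = A*src
                - affine_transform(dest,src1,src2,A,B,C):         dest = A*src1 + B*src2 + C
                - affine_transform(dest,src1,src2,A,B):           dest = A*src1 + B*src2
                - affine_transform(dest,src1,src2,src3,A,B,C,D):  dest = A*src1 + B*src2 + C*src3 + D
            affine_transform_range() applies the three-source form only to elements
            with indices in [begin, end), and the rectangle overload applies the
            transform only inside the given sub-window of the tensors (viewed as
            matrices).
        !*/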

        void add_scaled(
            tensor& dest,
            const float scale,
            const tensor& src
        );

        void add_cv_to_all_columns(
            float beta,
            tensor& dest,
            float alpha,
            const tensor& src
        );
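
        /*!
            add_scaled() performs dest += scale*src.  add_cv_to_all_columns()
            presumably performs dest = beta*dest + alpha*src with src treated as a
            column vector that is added to every column of dest (viewed as a
            matrix).
        !*/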

        void affine_transform(
            tensor& dest,
            const tensor& src,
            const tensor& A,
            const tensor& B
        );

        void affine_transform_conv(
            tensor& dest,
            const tensor& src,
            const tensor& A,
            const tensor& B
        );
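
        /*!
            Tensor-valued affine transforms.  affine_transform(dest,src,A,B)
            computes dest = A*src + B element-wise, where A and B either match
            src's size or hold one coefficient per element of a single sample
            (broadcast across samples).  affine_transform_conv() broadcasts per
            channel: dest(n,k,r,c) = A(k)*src(n,k,r,c) + B(k), which is how
            bias/scale (e.g. fused batch normalization) terms are applied.
        !*/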

        void compute_adam_update (
            size_t begin,
            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
            const float t,
            const float learning_rate,
            const float weight_decay,
            const float momentum1,
            const float momentum2,
            const tensor& params,
            const tensor& params_grad
        );
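
        /*!
            Computes the ADAM update of Kingma and Ba for the elements with indices
            in [begin, end).  Roughly, with g = weight_decay*params + params_grad,
            it updates the moment estimates
                m = momentum1*m + (1-momentum1)*g
                v = momentum2*v + (1-momentum2)*g^2
            and writes into s the bias-corrected step to be added to the parameters
            (t is the iteration number used for the bias correction terms).  See
            dlib's tensor_tools documentation for the exact expression.
        !*/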

        void assign_bias_gradient (
            tensor& grad,
            const tensor& gradient_input
        );
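
        /*!
            Sums gradient_input over its sample dimension, producing in grad (which
            has num_samples()==1) the gradient of a bias tensor that was
            broadcast-added to every sample of the layer output.
        !*/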

        void layer_normalize (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            const tensor& beta
        );

        void layer_normalize_gradient (
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            tensor& src_grad,
            tensor& gamma_grad,
            tensor& beta_grad
        );
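
        /*!
            Layer normalization: each sample of src is normalized to zero mean and
            unit variance (means and invstds receive the per-sample statistics,
            with eps added to the variance for numerical stability), then scaled by
            gamma and shifted by beta, i.e. roughly
                dest = gamma * (src - mean) * invstd + beta.
            layer_normalize_gradient() back-propagates gradient_input through this
            transform into src_grad, gamma_grad, and beta_grad.
        !*/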

        void threshold (
            tensor& data,
            float thresh
        );
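
        /*!
            In-place threshold: sets each element of data to 1 if it is greater
            than thresh and to 0 otherwise.
        !*/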

        void dot (
            const tensor& a,
            const tensor& b,
            tensor& result,
            size_t idx
        );
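
        /*!
            Computes the dot product of a and b (which must have the same number of
            elements) and, following dlib's tensor_tools conventions, accumulates
            it into the idx-th element of result.
        !*/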

        void prelu (
            tensor& dest,
            const tensor& src,
            const tensor& param
        );

        void prelu_gradient (
            tensor& grad,
            const tensor& src,
            const tensor& gradient_input,
            const tensor& param,
            tensor& params_grad
        );
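
        /*!
            Parametric ReLU: dest(i) = src(i) if src(i) > 0, otherwise p*src(i),
            where p is the learnable value stored in param.  prelu_gradient()
            computes the corresponding input and parameter gradients into grad and
            params_grad.
        !*/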

        void leaky_relu (
            tensor& dest,
            const tensor& src,
            const float alpha
        );

        void leaky_relu_gradient (
            tensor& grad,
            const tensor& src,
            const tensor& gradient_input,
            const float alpha
        );
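
        /*!
            Leaky ReLU: dest(i) = src(i) if src(i) > 0, otherwise alpha*src(i);
            leaky_relu_gradient() back-propagates gradient_input accordingly.
        !*/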

        void mish (
            tensor& dest,
            const tensor& src
        );

        void mish_gradient (
            tensor& grad,
            const tensor& src,
            const tensor& gradient_input
        );
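
        /*!
            Mish activation: dest(i) = src(i)*tanh(log(1 + exp(src(i)))).
        !*/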

        void gelu (
            tensor& dest,
            const tensor& src
        );

        void gelu_gradient (
            tensor& grad,
            const tensor& src,
            const tensor& gradient_input
        );
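
        /*!
            GELU activation: dest(i) = src(i)/2 * (1 + erf(src(i)/sqrt(2))).
        !*/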

        void resize_bilinear (
            tensor& dest,
            long dest_row_stride,
            long dest_channel_stride,
            const tensor& src,
            long src_row_stride,
            long src_channel_stride
        );

        void resize_bilinear_gradient (
            tensor& grad,
            long grad_row_stride,
            long grad_channel_stride,
            const tensor& gradient_input,
            long gradient_input_row_stride,
            long gradient_input_channel_stride
        );

        inline void resize_bilinear (
            tensor& dest,
            const tensor& src
        ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); }

        inline void resize_bilinear_gradient (
            tensor& grad,
            const tensor& gradient_input
        ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); }
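
        /*!
            Bilinearly interpolates each channel of src onto dest.  The stride
            arguments let these functions operate on images embedded inside larger
            tensors; the inline overloads supply the packed row-major defaults
            (row stride == nc(), channel stride == nr()*nc()).
            resize_bilinear_gradient() routes gradient_input backward through the
            same interpolation, accumulating into grad.
        !*/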

        void copy_tensor(
            bool add_to,
            tensor& dest,
            size_t dest_k_offset,
            const tensor& src,
            size_t src_k_offset,
            size_t count_k
        );
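
        /*!
            Copies count_k channels from src (starting at channel src_k_offset)
            into dest (starting at channel dest_k_offset) for every sample, adding
            to the existing contents when add_to is true.  This is the primitive
            behind dlib's concat layers.
        !*/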

        class compute_loss_binary_log_per_pixel
        {
        public:

            compute_loss_binary_log_per_pixel(
            )
            {
            }

            template <
                typename const_label_iterator
                >
            void operator() (
                const_label_iterator truth,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            ) const
            {
                const auto image_size = subnetwork_output.nr()*subnetwork_output.nc();
                const size_t bytes_per_plane = image_size*sizeof(float);
                // Allocate a buffer to hold the truth images, plus one float at the
                // front for the scalar loss output.
                buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));

                cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
                buf = buf+sizeof(float);

                // Copy the truth data into the cuda buffer.
                for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
                {
                    const matrix<float>& t = *truth;
                    DLIB_ASSERT(t.nr() == subnetwork_output.nr());
                    DLIB_ASSERT(t.nc() == subnetwork_output.nc());
                    memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
                }

                auto truth_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size);

                do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss);
            }

        private:

            static void do_work(
                cuda_data_ptr<float> loss_work_buffer,
                cuda_data_ptr<const float> truth_buffer,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            );

            mutable cuda_data_void_ptr buf;
        };
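
        /*!
            compute_loss_binary_log_per_pixel above, and the three classes below,
            are the CUDA implementations of dlib's per-pixel loss computations.
            Each operator() stages the truth labels for a whole mini-batch into a
            device_global_buffer() allocation (the first float of which is reserved
            for the scalar loss), copies the labels over with the
            cuda_data_void_ptr memcpy overloads, and then calls do_work(), which
            launches the kernel that fills in gradient and loss.
        !*/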

        class compute_loss_multiclass_log_per_pixel
        {
        public:

            compute_loss_multiclass_log_per_pixel(
            )
            {
            }

            template <
                typename const_label_iterator
                >
            void operator() (
                const_label_iterator truth,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            ) const
            {
                const auto image_size = subnetwork_output.nr()*subnetwork_output.nc();
                const size_t bytes_per_plane = image_size*sizeof(uint16_t);
                // Allocate a buffer to hold the truth images, plus one float at the
                // front for the scalar loss output.
                buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));

                cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
                buf = buf+sizeof(float);

                // Copy the truth data into the cuda buffer.
                for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
                {
                    const matrix<uint16_t>& t = *truth;
                    DLIB_ASSERT(t.nr() == subnetwork_output.nr());
                    DLIB_ASSERT(t.nc() == subnetwork_output.nc());
                    memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
                }

                auto truth_buf = static_pointer_cast<const uint16_t>(buf, subnetwork_output.num_samples()*image_size);

                do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss);
            }

        private:

            static void do_work(
                cuda_data_ptr<float> loss_work_buffer,
                cuda_data_ptr<const uint16_t> truth_buffer,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            );

            mutable cuda_data_void_ptr buf;
        };

        class compute_loss_multiclass_log_per_pixel_weighted
        {
        public:

            compute_loss_multiclass_log_per_pixel_weighted(
            )
            {
            }

            template <
                typename const_label_iterator
                >
            void operator() (
                const_label_iterator truth,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            ) const
            {
                const auto image_size = subnetwork_output.nr()*subnetwork_output.nc();
                const size_t bytes_per_plane = image_size*sizeof(uint16_t);
                const size_t weight_bytes_per_plane = image_size*sizeof(float);
                matrix<uint16_t> labels(truth->nr(), truth->nc());
                matrix<float> weights(truth->nr(), truth->nc());

                // Allocate a buffer holding all the label planes followed by all the
                // weight planes, plus one float at the front for the scalar loss output.
                buf = device_global_buffer(subnetwork_output.num_samples()*(bytes_per_plane + weight_bytes_per_plane) + sizeof(float));

                cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
                buf = buf+sizeof(float);
                const auto weights_offset = subnetwork_output.num_samples() * bytes_per_plane;

                // Split each weighted label image into separate label and weight
                // planes and copy them into the cuda buffer.
                for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
                {
                    const matrix<weighted_label<uint16_t>>& t = *truth;
                    DLIB_ASSERT(t.nr() == subnetwork_output.nr());
                    DLIB_ASSERT(t.nc() == subnetwork_output.nc());
                    for (long r = 0; r < t.nr(); ++r)
                    {
                        for (long c = 0; c < t.nc(); ++c)
                        {
                            labels(r, c) = t(r, c).label;
                            weights(r, c) = t(r, c).weight;
                        }
                    }
                    memcpy(buf + i*bytes_per_plane, &labels(0,0), bytes_per_plane);
                    memcpy(buf + weights_offset + i*weight_bytes_per_plane, &weights(0, 0), weight_bytes_per_plane);
                }

                auto truth_buf = static_pointer_cast<const uint16_t>(buf, subnetwork_output.num_samples()*image_size);
                buf = buf+weights_offset;
                auto weights_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size);

                do_work(loss_buf, truth_buf, weights_buf, subnetwork_output, gradient, loss);
            }

        private:

            static void do_work(
                cuda_data_ptr<float> loss_work_buffer,
                cuda_data_ptr<const uint16_t> truth_buffer,
                cuda_data_ptr<const float> weights_buffer,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            );

            mutable cuda_data_void_ptr buf;
        };

        class compute_loss_mean_squared_per_channel_and_pixel
        {
        public:

            compute_loss_mean_squared_per_channel_and_pixel(
            )
            {
            }

            template <
                typename const_label_iterator
                >
            void operator() (
                const_label_iterator truth,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            ) const
            {
                const auto image_size = subnetwork_output.nr()*subnetwork_output.nc()*subnetwork_output.k();
                const size_t bytes_per_image = image_size*sizeof(float);
                // Allocate a buffer to hold the truth images, plus one float at the
                // front for the scalar loss output.
                buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_image + sizeof(float));

                cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
                buf = buf+sizeof(float);

                const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(float);

                // Copy the truth data (one plane per channel) into the cuda buffer.
                for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
                {
                    const auto& t = *truth;
                    DLIB_ASSERT(static_cast<long>(t.size()) == subnetwork_output.k());
                    for (size_t j = 0; j < t.size(); ++j)
                    {
                        DLIB_ASSERT(t[j].nr() == subnetwork_output.nr());
                        DLIB_ASSERT(t[j].nc() == subnetwork_output.nc());
                        memcpy(buf + i*bytes_per_image + j*bytes_per_plane, &t[j](0,0), bytes_per_plane);
                    }
                }

                auto truth_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size);

                do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss);
            }

        private:

            static void do_work(
                cuda_data_ptr<float> loss_work_buffer,
                cuda_data_ptr<const float> truth_buffer,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            );

            mutable cuda_data_void_ptr buf;
        };

#else // DLIB_USE_CUDA

        // When CUDA is disabled only the trivial device 0 exists, so these stubs
        // just validate their arguments and return fixed values.

        inline void set_device (
            int id
        )
        {
            DLIB_CASSERT(id == 0, "dlib::cuda::set_device(id) called with an invalid device id.");
        }

        inline int get_device (
        ){ return 0; }

        inline int get_num_devices (
        ) { return 1; }

        inline std::string get_device_name (
            int device
        )
        {
            DLIB_CASSERT(device == 0, "dlib::cuda::get_device_name(device) called with an invalid device id.");
            return "CUDA_DISABLED";
        }

        inline void set_current_device_blocking_sync(
        ) {}

        inline bool can_access_peer (int, int)
        { return false; }
        inline bool can_access_peer (const tensor&, const tensor&)
        { return false; }

        inline void device_synchronize (int) {}
        inline void device_synchronize (const tensor&) {}

        class enable_peer_access
        {
        public:
            enable_peer_access() = delete;
            enable_peer_access(const enable_peer_access&) = delete;
            enable_peer_access& operator=(const enable_peer_access&) = delete;
            enable_peer_access(int, int) {}
            enable_peer_access(const tensor&, const tensor&) {}
        };

#endif // DLIB_USE_CUDA

    }
}

#endif // DLIB_DNN_CuDA_H_