// Copyright (C) 2015 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. #ifndef DLIB_DNN_CuDA_H_ #define DLIB_DNN_CuDA_H_ #include "tensor.h" #include "../geometry/rectangle.h" #include "../dnn/misc.h" namespace dlib { namespace cuda { // ---------------------------------------------------------------------------------------- void set_device ( int dev ); int get_device ( ); int get_num_devices ( ); std::string get_device_name ( int device ); void set_current_device_blocking_sync( ); bool can_access_peer (int device_id, int peer_device_id); bool can_access_peer (const tensor& device, const tensor& peer_device); void device_synchronize (int dev); void device_synchronize (const tensor& dev); class raii_set_device { public: raii_set_device() = delete; raii_set_device(const raii_set_device&) = delete; raii_set_device& operator=(const raii_set_device&) = delete; raii_set_device(int dev) { prev_dev = get_device(); set_device(dev); } raii_set_device(const tensor& dev) { prev_dev = get_device(); set_device(dev.device_id()); } void operator() (int dev) { set_device(dev); } void operator() (const tensor& dev) { set_device(dev.device_id()); } ~raii_set_device() noexcept(false) { set_device(prev_dev); } private: int prev_dev; }; #ifdef DLIB_USE_CUDA class enable_peer_access { public: enable_peer_access() = delete; enable_peer_access(const enable_peer_access&) = delete; enable_peer_access& operator=(const enable_peer_access&) = delete; enable_peer_access( int device_id, int peer_device_id ); enable_peer_access( const tensor& device, const tensor& peer_device ) : enable_peer_access(device.device_id(), peer_device.device_id()) {} ~enable_peer_access() noexcept(false); private: bool call_disable; int device_id; int peer_device_id; }; // ----------------------------------------------------------------------------------- void inverse_norms ( resizable_tensor& invnorms, const tensor& data, const double eps ); void dot_prods ( resizable_tensor& out, const tensor& lhs, const tensor& rhs ); void dot_prods ( bool add_to, tensor& out, const tensor& lhs, const tensor& rhs ); void scale_columns ( tensor& out, const tensor& m, const tensor& v ); void scale_rows ( tensor& out, const tensor& m, const tensor& v ); void scale_rows2 ( float beta, tensor& out, const tensor& m1, const tensor& m2, const tensor& v1, const tensor& v2 ); void exp ( tensor& dest, const tensor& src ); void log ( tensor& dest, const tensor& src ); void log10 ( tensor& dest, const tensor& src ); // ------------------------------------------------------------------------------------ void set_tensor ( tensor& t, float value ); void scale_tensor ( tensor& t, float value ); // ------------------------------------------------------------------------------------ void multiply ( bool add_to, tensor& dest, const tensor& src1, const tensor& src2 ); void multiply_conv ( bool add_to, tensor& dest, const tensor& src1, const tensor& src2 ); void multiply_zero_padded ( bool add_to, tensor& dest, const tensor& src1, const tensor& src2 ); void scale_channels ( bool add_to, tensor& dest, const tensor& src, const tensor& scales ); void add ( tensor& dest, const tensor& src1, const tensor& src2 ); // ----------------------------------------------------------------------------------- void affine_transform( tensor& dest, const tensor& src, const float A, const float B ); void affine_transform( tensor& dest, const tensor& src, const float A ); void affine_transform( tensor& dest, const tensor& src1, const tensor& src2, const float A, const float B, const float C ); void affine_transform( tensor& dest, const tensor& src1, const tensor& src2, const float A, const float B ); void affine_transform( tensor& dest, const tensor& src1, const tensor& src2, const tensor& src3, const float A, const float B, const float C, const float D ); void affine_transform_range( size_t begin, size_t end, tensor& dest, const tensor& src1, const tensor& src2, const tensor& src3, const float A, const float B, const float C ); void affine_transform( const rectangle& rect, tensor& dest, const tensor& src1, const tensor& src2, const tensor& src3, float A, float B, float C ); // Note that this function isn't in the tt:: namespace because add_scaled() is // called by cuda::add() so we don't need a tt:: version of add_scaled(). void add_scaled( tensor& dest, const float scale, const tensor& src ); void add_cv_to_all_columns( float beta, tensor& dest, float alpha, const tensor& src ); // ----------------------------------------------------------------------------------- void affine_transform( tensor& dest, const tensor& src, const tensor& A, const tensor& B ); // ----------------------------------------------------------------------------------- void affine_transform_conv( tensor& dest, const tensor& src, const tensor& A, const tensor& B ); // ---------------------------------------------------------------------------------------- void compute_adam_update ( size_t begin, size_t end, tensor& s, tensor& m, tensor& v, const float t, const float learning_rate, const float weight_decay, const float momentum1, const float momentum2, const tensor& params, const tensor& params_grad ); // ----------------------------------------------------------------------------------- void assign_bias_gradient ( tensor& grad, const tensor& gradient_input ); // ----------------------------------------------------------------------------------- void layer_normalize ( const double eps, resizable_tensor& dest, resizable_tensor& means, resizable_tensor& invstds, const tensor& src, const tensor& gamma, const tensor& beta ); void layer_normalize_gradient ( const double eps, const tensor& gradient_input, const tensor& means, const tensor& invstds, const tensor& src, const tensor& gamma, tensor& src_grad, tensor& gamma_grad, tensor& beta_grad ); // ----------------------------------------------------------------------------------- void threshold ( tensor& data, float thresh ); // ---------------------------------------------------------------------------------------- void dot ( const tensor& a, const tensor& b, tensor& result, size_t idx ); // ---------------------------------------------------------------------------------------- void prelu ( tensor& dest, const tensor& src, const tensor& param ); void prelu_gradient ( tensor& grad, const tensor& src, const tensor& gradient_input, const tensor& param, tensor& params_grad ); // ---------------------------------------------------------------------------------------- void leaky_relu ( tensor& dest, const tensor& src, const float alpha ); void leaky_relu_gradient ( tensor& grad, const tensor& src, const tensor& gradient_input, const float alpha ); // ---------------------------------------------------------------------------------------- void mish ( tensor& dest, const tensor& src ); void mish_gradient ( tensor& grad, const tensor& src, const tensor& gradient_input ); // ---------------------------------------------------------------------------------------- void gelu ( tensor& dest, const tensor& src ); void gelu_gradient ( tensor& grad, const tensor& src, const tensor& gradient_input ); // ---------------------------------------------------------------------------------------- void resize_bilinear ( tensor& dest, long dest_row_stride, long dest_channel_stride, const tensor& src, long src_row_stride, long src_channel_stride ); void resize_bilinear_gradient ( tensor& grad, long grad_row_stride, long grad_channel_stride, const tensor& gradient_input, long gradient_input_row_stride, long gradient_input_channel_stride ); inline void resize_bilinear ( tensor& dest, const tensor& src ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); } inline void resize_bilinear_gradient ( tensor& grad, const tensor& gradient_input ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); } // ---------------------------------------------------------------------------------------- void copy_tensor( bool add_to, tensor& dest, size_t dest_k_offset, const tensor& src, size_t src_k_offset, size_t count_k ); // ---------------------------------------------------------------------------------------- class compute_loss_binary_log_per_pixel { /*! The point of this class is to compute the loss computed by loss_binary_log_per_pixel_, but to do so with CUDA. !*/ public: compute_loss_binary_log_per_pixel( ) { } template < typename const_label_iterator > void operator() ( const_label_iterator truth, const tensor& subnetwork_output, tensor& gradient, double& loss ) const { const auto image_size = subnetwork_output.nr()*subnetwork_output.nc(); const size_t bytes_per_plane = image_size*sizeof(float); // Allocate a cuda buffer to store all the truth images and also one float // for the scalar loss output. buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float)); cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1); buf = buf+sizeof(float); // copy the truth data into a cuda buffer. for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth) { const matrix<float>& t = *truth; DLIB_ASSERT(t.nr() == subnetwork_output.nr()); DLIB_ASSERT(t.nc() == subnetwork_output.nc()); memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane); } auto truth_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size); do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss); } private: static void do_work( cuda_data_ptr<float> loss_work_buffer, cuda_data_ptr<const float> truth_buffer, const tensor& subnetwork_output, tensor& gradient, double& loss ); mutable cuda_data_void_ptr buf; }; // ---------------------------------------------------------------------------------------- class compute_loss_multiclass_log_per_pixel { /*! The point of this class is to compute the loss computed by loss_multiclass_log_per_pixel_, but to do so with CUDA. !*/ public: compute_loss_multiclass_log_per_pixel( ) { } template < typename const_label_iterator > void operator() ( const_label_iterator truth, const tensor& subnetwork_output, tensor& gradient, double& loss ) const { const auto image_size = subnetwork_output.nr()*subnetwork_output.nc(); const size_t bytes_per_plane = image_size*sizeof(uint16_t); // Allocate a cuda buffer to store all the truth images and also one float // for the scalar loss output. buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float)); cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1); buf = buf+sizeof(float); // copy the truth data into a cuda buffer. for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth) { const matrix<uint16_t>& t = *truth; DLIB_ASSERT(t.nr() == subnetwork_output.nr()); DLIB_ASSERT(t.nc() == subnetwork_output.nc()); memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane); } auto truth_buf = static_pointer_cast<const uint16_t>(buf, subnetwork_output.num_samples()*image_size); do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss); } private: static void do_work( cuda_data_ptr<float> loss_work_buffer, cuda_data_ptr<const uint16_t> truth_buffer, const tensor& subnetwork_output, tensor& gradient, double& loss ); mutable cuda_data_void_ptr buf; }; // ---------------------------------------------------------------------------------------- class compute_loss_multiclass_log_per_pixel_weighted { /*! The point of this class is to compute the loss computed by loss_multiclass_log_per_pixel_weighted_, but to do so with CUDA. !*/ public: compute_loss_multiclass_log_per_pixel_weighted( ) { } template < typename const_label_iterator > void operator() ( const_label_iterator truth, const tensor& subnetwork_output, tensor& gradient, double& loss ) const { const auto image_size = subnetwork_output.nr()*subnetwork_output.nc(); const size_t bytes_per_plane = image_size*sizeof(uint16_t); const size_t weight_bytes_per_plane = image_size*sizeof(float); matrix<uint16_t> labels(truth->nr(), truth->nc()); matrix<float> weights(truth->nr(), truth->nc()); // Allocate a cuda buffer to store all the truth images and also one float // for the scalar loss output. buf = device_global_buffer(subnetwork_output.num_samples()*(bytes_per_plane + weight_bytes_per_plane) + sizeof(float)); cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1); buf = buf+sizeof(float); const auto weights_offset = subnetwork_output.num_samples() * bytes_per_plane; // copy the truth data into a cuda buffer. for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth) { const matrix<weighted_label<uint16_t>>& t = *truth; DLIB_ASSERT(t.nr() == subnetwork_output.nr()); DLIB_ASSERT(t.nc() == subnetwork_output.nc()); for (long r = 0; r < t.nr(); ++r) { for (long c = 0; c < t.nc(); ++c) { labels(r, c) = t(r, c).label; weights(r, c) = t(r, c).weight; } } memcpy(buf + i*bytes_per_plane, &labels(0,0), bytes_per_plane); memcpy(buf + weights_offset + i*weight_bytes_per_plane, &weights(0, 0), weight_bytes_per_plane); } auto truth_buf = static_pointer_cast<const uint16_t>(buf, subnetwork_output.num_samples()*image_size); buf = buf+weights_offset; auto weights_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size); do_work(loss_buf, truth_buf, weights_buf, subnetwork_output, gradient, loss); } private: static void do_work( cuda_data_ptr<float> loss_work_buffer, cuda_data_ptr<const uint16_t> truth_buffer, cuda_data_ptr<const float> weights_buffer, const tensor& subnetwork_output, tensor& gradient, double& loss ); mutable cuda_data_void_ptr buf; }; // ---------------------------------------------------------------------------------------- class compute_loss_mean_squared_per_channel_and_pixel { /*! The point of this class is to compute the loss computed by loss_mean_squared_per_channel_and_pixel_, but to do so with CUDA. !*/ public: compute_loss_mean_squared_per_channel_and_pixel( ) { } template < typename const_label_iterator > void operator() ( const_label_iterator truth, const tensor& subnetwork_output, tensor& gradient, double& loss ) const { const auto image_size = subnetwork_output.nr()*subnetwork_output.nc()*subnetwork_output.k(); const size_t bytes_per_image = image_size*sizeof(float); // Allocate a cuda buffer to store all the truth images and also one float // for the scalar loss output. buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_image + sizeof(float)); cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1); buf = buf+sizeof(float); const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(float); // copy the truth data into a cuda buffer. for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth) { const auto& t = *truth; DLIB_ASSERT(static_cast<long>(t.size()) == subnetwork_output.k()); for (size_t j = 0; j < t.size(); ++j) { DLIB_ASSERT(t[j].nr() == subnetwork_output.nr()); DLIB_ASSERT(t[j].nc() == subnetwork_output.nc()); memcpy(buf + i*bytes_per_image + j*bytes_per_plane, &t[j](0,0), bytes_per_plane); } } auto truth_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size); do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss); } private: static void do_work( cuda_data_ptr<float> loss_work_buffer, cuda_data_ptr<const float> truth_buffer, const tensor& subnetwork_output, tensor& gradient, double& loss ); mutable cuda_data_void_ptr buf; }; // ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------ #else // if DLIB_USE_CUDA NOT DEFINED inline void set_device ( int id ) { DLIB_CASSERT(id == 0, "dlib::cuda::set_device(id) called with an invalid device id."); } inline int get_device ( ){ return 0; } inline int get_num_devices ( ) { return 1; } inline std::string get_device_name ( int device ) { DLIB_CASSERT(device == 0, "dlib::cuda::set_device(id) called with an invalid device id."); return "CUDA_DISABLED"; } inline void set_current_device_blocking_sync( ) {} inline bool can_access_peer (int , int ) { return false; } inline bool can_access_peer (const tensor& , const tensor& ) { return false; } inline void device_synchronize (int ){} inline void device_synchronize (const tensor& ){} class enable_peer_access { public: enable_peer_access() = delete; enable_peer_access(const enable_peer_access&) = delete; enable_peer_access& operator=(const enable_peer_access&) = delete; enable_peer_access( int, int ){} enable_peer_access( const tensor&, const tensor& ) {} }; #endif // DLIB_USE_CUDA } } #endif // DLIB_DNN_CuDA_H_