|
|
|
|
|
#ifndef DLIB_DNN_CuSOLVER_CU_ |
|
#define DLIB_DNN_CuSOLVER_CU_ |
|
|
|
#ifdef DLIB_USE_CUDA |
|
|
|
#include "cusolver_dlibapi.h" |
|
#include <cublas_v2.h> |
|
#include <cusolverDn.h> |
|
#include "cuda_utils.h" |
|
|
|
|
|
|
|
// Maps a cuSOLVER status code to a short, human readable description.
//
// s: the status value returned by a cuSOLVER call.
// Returns a pointer to a string literal (never null, never owned by the
// caller).  Status codes without a dedicated message fall back to a generic
// description.
static const char* cusolver_get_error_string(cusolverStatus_t s)
{
    switch(s)
    {
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUDA Runtime API initialization failed.";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUDA Resources could not be allocated.";
        case CUSOLVER_STATUS_INVALID_VALUE:
            return "An unsupported value or parameter was passed to the function.";
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "The function requires a feature absent from the device architecture.";
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "The GPU program failed to execute.";
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "An internal cuSolver operation failed.";
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "The matrix type is not supported by this function.";
        default:
            // Anything not listed above still gets the original generic text.
            return "A call to cuSolver failed";
    }
}
|
|
|
|
|
// Evaluates the cuSOLVER call exactly once and throws dlib::cusolver_error if
// it did not return CUSOLVER_STATUS_SUCCESS.  The exception text contains the
// stringified call, the source location, the numeric status code and the
// description produced by cusolver_get_error_string().  The do/while(false)
// wrapper lets the macro be used as a single statement, e.g. as the body of
// an unbraced if.
#define CHECK_CUSOLVER(call)                                                       \
do{                                                                                \
    const cusolverStatus_t cusolver_status_ = (call);                              \
    if (cusolver_status_ != CUSOLVER_STATUS_SUCCESS)                               \
    {                                                                              \
        std::ostringstream cusolver_msg_;                                          \
        cusolver_msg_ << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". "; \
        cusolver_msg_ << "code: " << cusolver_status_ << ", reason: " << cusolver_get_error_string(cusolver_status_); \
        throw dlib::cusolver_error(cusolver_msg_.str());                           \
    }                                                                              \
}while(false)
|
|
|
|
|
|
|
|
|
namespace dlib |
|
{ |
|
namespace cuda |
|
{ |
|
|
|
|
|
|
|
class cusolver_context |
|
{ |
|
public: |
|
|
|
cusolver_context(const cusolver_context&) = delete; |
|
cusolver_context& operator=(const cusolver_context&) = delete; |
|
|
|
cusolver_context() |
|
{ |
|
handles.resize(16); |
|
} |
|
~cusolver_context() |
|
{ |
|
for (auto h : handles) |
|
{ |
|
if (h) |
|
cusolverDnDestroy(h); |
|
} |
|
} |
|
|
|
cusolverDnHandle_t get_handle ( |
|
) |
|
{ |
|
int new_device_id; |
|
CHECK_CUDA(cudaGetDevice(&new_device_id)); |
|
|
|
if (new_device_id >= (long)handles.size()) |
|
handles.resize(new_device_id+16); |
|
|
|
|
|
if (!handles[new_device_id]) |
|
CHECK_CUSOLVER(cusolverDnCreate(&handles[new_device_id])); |
|
|
|
|
|
return handles[new_device_id]; |
|
} |
|
|
|
private: |
|
|
|
std::vector<cusolverDnHandle_t> handles; |
|
}; |
|
|
|
// Returns the cuSOLVER handle for the calling thread and the CUDA device
// that is current at the time of the call.  Each thread gets its own
// cusolver_context, constructed on that thread's first call.
static cusolverDnHandle_t context()
{
    thread_local cusolver_context per_thread_handles;
    cusolverDnHandle_t current = per_thread_handles.get_handle();
    return current;
}
|
|
|
|
|
|
|
|
|
|
|
// Kernel: overwrites the nr*nr floats at m with an identity matrix.
//
// m : device buffer holding at least nr*nr floats.
// nr: number of rows (and columns) of the square matrix.
//
// Element (i,i) of an nr x nr matrix sits at flat index i*(nr+1), so the
// diagonal is exactly the set of indices divisible by nr+1.
// NOTE(review): grid_stride_range is defined elsewhere; presumably it hands
// each thread its share of [0, nr*nr) - confirm against cuda_utils.h.
__global__ void _cuda_set_to_identity_matrix(float* m, size_t nr)
{
    const size_t diagonal_step = nr+1;
    for (auto j : grid_stride_range(0, nr*nr))
        m[j] = (j%diagonal_step == 0) ? 1 : 0;
}
|
|
|
// Fills the tensor m, interpreted as a square matrix with num_samples() rows,
// with the identity matrix by launching _cuda_set_to_identity_matrix on its
// device buffer.
//
// Precondition (asserted): m.size() == m.num_samples()*m.num_samples().
void set_to_identity_matrix (
    tensor& m
)
{
    DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples());

    const auto side_length = m.num_samples();
    auto device_data = m.device();
    launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), device_data, side_length);
}
|
|
|
|
|
|
|
// The destructor body runs before the data members are destroyed, so this
// waits for any work recorded by operator() before the object's buffers go
// away.
inv::~inv()
{
    this->sync_if_needed();
}
|
|
|
|
|
|
|
void inv:: |
|
operator() ( |
|
const tensor& m_, |
|
resizable_tensor& out |
|
) |
|
{ |
|
DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it."); |
|
m = m_; |
|
|
|
out.copy_size(m); |
|
set_to_identity_matrix(out); |
|
|
|
const int nc = m.num_samples(); |
|
int Lwork; |
|
CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc , nc, m.device(), nc, &Lwork)); |
|
|
|
if (Lwork > (int)workspace.size()) |
|
{ |
|
sync_if_needed(); |
|
workspace = cuda_data_ptr<float>(Lwork); |
|
} |
|
if (nc > (int)Ipiv.size()) |
|
{ |
|
sync_if_needed(); |
|
Ipiv = cuda_data_ptr<int>(nc); |
|
} |
|
if (info.size() != 1) |
|
{ |
|
info = cuda_data_ptr<int>(1); |
|
} |
|
|
|
CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info)); |
|
CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info)); |
|
did_work_lately = true; |
|
} |
|
|
|
|
|
|
|
int inv:: |
|
get_last_status( |
|
) |
|
{ |
|
std::vector<int> linfo; |
|
memcpy(linfo, info); |
|
if (linfo.size() != 0) |
|
return linfo[0]; |
|
else |
|
return 0; |
|
} |
|
|
|
|
|
|
|
void inv:: |
|
sync_if_needed() |
|
{ |
|
if (did_work_lately) |
|
{ |
|
did_work_lately = false; |
|
|
|
|
|
cudaDeviceSynchronize(); |
|
} |
|
} |
|
|
|
|
|
|
|
} |
|
} |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
|
|
|