// Copyright (C) 2017  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_CU_
#define DLIB_DNN_CuSOLVER_CU_

#ifdef DLIB_USE_CUDA

#include "cusolver_dlibapi.h"
#include <cusolverDn.h>
#include "cuda_utils.h"

// ----------------------------------------------------------------------------------------

static const char* cusolver_get_error_string(cusolverStatus_t s)
{
    switch(s)
    {
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUDA Runtime API initialization failed.";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUDA Resources could not be allocated.";
        default:
            return "A call to cuSolver failed";
    }
}

// Check the return value of a call to the cuSolver runtime for an error condition.
#define CHECK_CUSOLVER(call)                                                      \
do{                                                                               \
    const cusolverStatus_t error = call;                                          \
    if (error != CUSOLVER_STATUS_SUCCESS)                                         \
    {                                                                             \
        std::ostringstream sout;                                                  \
        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
        sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\
        throw dlib::cusolver_error(sout.str());                                   \
    }                                                                             \
}while(false)

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

namespace dlib
{
    namespace cuda
    {

    // -----------------------------------------------------------------------------------

        class cusolver_context
        {
        public:
            // not copyable
            cusolver_context(const cusolver_context&) = delete;
            cusolver_context& operator=(const cusolver_context&) = delete;

            cusolver_context()
            {
                handles.resize(16);
            }
            ~cusolver_context()
            {
                for (auto h : handles)
                {
                    if (h)
                        cusolverDnDestroy(h);
                }
            }

            cusolverDnHandle_t get_handle (
            )
            {
                int new_device_id;
                CHECK_CUDA(cudaGetDevice(&new_device_id));
                // make room for more devices if needed
                if (new_device_id >= (long)handles.size())
                    handles.resize(new_device_id+16);

                // If we don't have a handle already for this device then make one
                if (!handles[new_device_id])
                    CHECK_CUSOLVER(cusolverDnCreate(&handles[new_device_id]));

                // Finally, return the handle for the current device
                return handles[new_device_id];
            }

        private:

            std::vector<cusolverDnHandle_t> handles;
        };

        // Return a cuSolver handle for the current thread and current CUDA device, creating
        // it the first time it is needed.
        static cusolverDnHandle_t context()
        {
            thread_local cusolver_context c;
            return c.get_handle();
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        __global__ void _cuda_set_to_identity_matrix(float* m, size_t nr)
        {
            for (auto j : grid_stride_range(0, nr*nr))
            {
                if (j%(nr+1) == 0)
                    m[j] = 1;
                else
                    m[j] = 0;
            }
        }

        void set_to_identity_matrix (
            tensor& m
        )
        {
            DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples());
            launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples());
        }

    // ------------------------------------------------------------------------------------

        inv::~inv()
        {
            sync_if_needed();
        }

    // ------------------------------------------------------------------------------------

        void inv::
        operator() (
            const tensor& m_,
            resizable_tensor& out
        )
        {
            DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it.");
            m = m_;

            out.copy_size(m);
            set_to_identity_matrix(out);

            const int nc = m.num_samples();
            int Lwork;
            CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc, nc, m.device(), nc, &Lwork));
            if (Lwork > (int)workspace.size())
            {
                sync_if_needed();
                workspace = cuda_data_ptr<float>(Lwork);
            }
            if (nc > (int)Ipiv.size())
            {
                sync_if_needed();
                Ipiv = cuda_data_ptr<int>(nc);
            }
            if (info.size() != 1)
            {
                info = cuda_data_ptr<int>(1);
            }

            // Compute the LU factorization of m (with partial pivoting) and then use it to
            // solve m*X = I.  The solution X, i.e. the inverse of m, is written into out.
            CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info));
            CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info));

            did_work_lately = true;
        }

    // ------------------------------------------------------------------------------------

        int inv::
        get_last_status(
        )
        {
            std::vector<int> linfo;
            memcpy(linfo, info);
            if (linfo.size() != 0)
                return linfo[0];
            else
                return 0;
        }

    // ------------------------------------------------------------------------------------

        void inv::
        sync_if_needed()
        {
            if (did_work_lately)
            {
                did_work_lately = false;
                // make sure we wait until any previous kernel launches have finished
                // before we do something like deallocate the GPU memory.
                cudaDeviceSynchronize();
            }
        }

    // ------------------------------------------------------------------------------------

    }
}

#endif // DLIB_USE_CUDA

#endif // DLIB_DNN_CuSOLVER_CU_
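
// ----------------------------------------------------------------------------------------
// Example usage sketch for the inv tool defined above.  This assumes the interface declared
// in cusolver_dlibapi.h matches the definitions in this file, i.e. operator()(const tensor&,
// resizable_tensor&) and get_last_status():
//
//     dlib::resizable_tensor A, A_inv;
//     A.set_size(3,3);      // must be square: A.size() == A.num_samples()*A.num_samples()
//     // ... fill A with data ...
//     dlib::cuda::inv inverter;
//     inverter(A, A_inv);   // A_inv = inverse of A, computed on the GPU via LU factorization
//     if (inverter.get_last_status() != 0)
//     {
//         // the factorization failed, e.g. because A was singular.
//     }
// ----------------------------------------------------------------------------------------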