// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNn_SOLVERS_H_
#define DLIB_DNn_SOLVERS_H_

#include "solvers_abstract.h"
#include "../cuda/tensor.h"
#include <iostream>
#include "layers.h"

namespace dlib
{
    // Solver that converts a layer's parameter gradient into an update step
    // using momentum and weight decay.  The step is accumulated in v as
    //     v = momentum*v - wd*lr*params - lr*params_grad
    // where lr and wd are the caller's learning rate and this solver's weight
    // decay, each scaled by the multipliers reported for the layer.  Layers
    // that expose separate bias multipliers (fc_ with bias, con_, cont_, bn_)
    // are routed through update_considering_bias() so the trailing bias block
    // of the parameter tensor can use its own learning rate and weight decay.
    class sgd
    {
    public:

        // Builds a solver with the given weight decay and momentum.
        explicit sgd(
            float weight_decay_,
            float momentum_ = 0.9
        ) : weight_decay(weight_decay_),
            momentum(momentum_)
        {
        }

        // Default settings: weight decay 0.0005 and momentum 0.9.
        sgd(
        ) : sgd(0.0005, 0.9)
        {
        }

        float get_momentum (
        ) const { return momentum; }

        float get_weight_decay (
        ) const { return weight_decay; }

        // Generic layers: every parameter is updated with the same learning
        // rate and weight decay.  Returns a reference to the internal step
        // tensor, which stays owned by this solver.
        template <typename layer_type>
        const tensor& operator() (
            const float learning_rate,
            const layer_type& l,
            const tensor& params_grad
        )
        {
            const tensor& params = l.get_layer_params();
            DLIB_CASSERT(params.size() != 0);
            prepare_velocity(params_grad);

            const double step  = learning_rate*get_learning_rate_multiplier(l);
            const double decay = weight_decay*get_weight_decay_multiplier(l);

            // v = momentum*mat(v) - decay*step*mat(params) - step*mat(params_grad)
            tt::affine_transform(v, v, params, params_grad, momentum, -decay*step, -step);
            return v;
        }

        // Fully connected layer with a bias: the last get_num_outputs()
        // parameters are handled as the bias block.
        template <unsigned long N>
        const tensor& operator() (
            const float learning_rate,
            const fc_<N,FC_HAS_BIAS>& l,
            const tensor& params_grad
        )
        {
            const unsigned long first_bias = params_grad.size()-l.get_num_outputs();
            update_considering_bias(learning_rate, l, params_grad, first_bias);
            return v;
        }

        // Convolution layer: the last num_filters() parameters are handled as
        // the bias block.
        template <
            long _num_filters,
            long _nr,
            long _nc,
            int _stride_y,
            int _stride_x,
            int _padding_y,
            int _padding_x
            >
        const tensor& operator() (
            const float learning_rate,
            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
            const tensor& params_grad
        )
        {
            const unsigned long first_bias = params_grad.size()-l.num_filters();
            update_considering_bias(learning_rate, l, params_grad, first_bias);
            return v;
        }

        // cont_ layer: same bias layout as the con_ overload above.
        template <
            long _num_filters,
            long _nr,
            long _nc,
            int _stride_y,
            int _stride_x,
            int _padding_y,
            int _padding_x
            >
        const tensor& operator() (
            const float learning_rate,
            const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
            const tensor& params_grad
        )
        {
            const unsigned long first_bias = params_grad.size()-l.num_filters();
            update_considering_bias(learning_rate, l, params_grad, first_bias);
            return v;
        }

        // bn_ layer: the second half of the parameters is handled as the bias
        // block.
        template < layer_mode mode >
        const tensor& operator() (
            const float learning_rate,
            const bn_<mode>& l,
            const tensor& params_grad
        )
        {
            const unsigned long first_bias = params_grad.size()/2;
            update_considering_bias(learning_rate, l, params_grad, first_bias);
            return v;
        }

        // Writes the "sgd2" tag followed by v, weight_decay and momentum.
        friend void serialize(const sgd& item, std::ostream& out)
        {
            serialize("sgd2", out);
            serialize(item.v, out);
            serialize(item.weight_decay, out);
            serialize(item.momentum, out);
        }

        // Reads back what serialize() wrote.  Throws serialization_error when
        // the leading tag is anything other than "sgd2".
        friend void deserialize(sgd& item, std::istream& in)
        {
            std::string tag;
            deserialize(tag, in);
            if (tag != "sgd2")
                throw serialization_error("Unexpected version found while deserializing dlib::sgd.");

            deserialize(item.v, in);
            deserialize(item.weight_decay, in);
            deserialize(item.momentum, in);
        }

        // Prints a one line summary of the solver's settings.
        friend std::ostream& operator<< (std::ostream& out, const sgd& item)
        {
            return out << "sgd: weight_decay=" << item.get_weight_decay()
                       << ", momentum=" << item.get_momentum();
        }

    private:

        // On first use (v still empty) gives v the dimensions of params_grad
        // and fills it with zeros.  Does nothing on later calls.
        void prepare_velocity(
            const tensor& params_grad
        )
        {
            if (v.size() != 0)
                return;

            v.copy_size(params_grad);
            v = 0;
        }

        // Shared implementation for layers that have bias multipliers.
        // Elements [0, bias_offset) use the layer-wide learning rate and
        // weight decay; elements [bias_offset, v.size()) are the biases and
        // additionally apply the layer's bias multipliers.
        template <typename layer_type>
        void update_considering_bias(
            const float learning_rate,
            const layer_type& l,
            const tensor& params_grad,
            unsigned long bias_offset
        )
        {
            const tensor& params = l.get_layer_params();
            DLIB_CASSERT(params.size() != 0);
            prepare_velocity(params_grad);

            const double step  = learning_rate*get_learning_rate_multiplier(l);
            const double decay = weight_decay*get_weight_decay_multiplier(l);

            // v = momentum*mat(v) - decay*step*mat(params) - step*mat(params_grad)

            const bool bias_multipliers_are_one =
                l.get_bias_learning_rate_multiplier() == 1 &&
                l.get_bias_weight_decay_multiplier() == 1;

            if (bias_multipliers_are_one)
            {
                // Biases behave like every other parameter, so one pass over
                // the whole tensor is enough.
                tt::affine_transform(v, v, params, params_grad, momentum, -decay*step, -step);
                return;
            }

            // Non-bias parameters first.
            tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -decay*step, -step);

            // Then the biases, with their multipliers folded in.
            const double bias_step  = step*l.get_bias_learning_rate_multiplier();
            const double bias_decay = decay*l.get_bias_weight_decay_multiplier();
            tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -bias_decay*bias_step, -bias_step);
        }

        resizable_tensor v;     // accumulated step, returned by operator()
        float weight_decay;
        float momentum;

    };

// ----------------------------------------------------------------------------------------

    // Solver that delegates the per-parameter update to
    // tt::compute_adam_update().  It keeps three tensors sized like the
    // parameter gradient (m, v and the output step s) plus a call counter t
    // that is incremented once per operator() invocation.  Layers that expose
    // separate bias multipliers (fc_ with bias, con_, cont_, bn_) are routed
    // through update_considering_bias() so the trailing bias block can use its
    // own learning rate and weight decay.
    class adam
    {
    public:

        // Builds a solver with the given weight decay and the two momentum
        // values handed to tt::compute_adam_update().  The call counter starts
        // at zero.
        adam(
            float weight_decay_,
            float momentum1_,
            float momentum2_
        ) : weight_decay(weight_decay_),
            momentum1(momentum1_),
            momentum2(momentum2_),
            t(0)
        {
        }

        // Default settings: weight decay 0.0005, momentum1 0.9, momentum2 0.999.
        adam(
        ) : adam(0.0005, 0.9, 0.999)
        {}

        float get_momentum1 (
        ) const { return momentum1; }

        float get_momentum2 (
        ) const { return momentum2; }

        float get_weight_decay (
        ) const { return weight_decay; }

        // Generic layers: every parameter is updated with the same learning
        // rate and weight decay.  Returns a reference to the internal step
        // tensor s, which stays owned by this solver.
        template <typename layer_type>
        const tensor& operator() (
            const float learning_rate,
            const layer_type& l,
            const tensor& params_grad
        )
        {
            const tensor& params = l.get_layer_params();
            DLIB_CASSERT(params.size() != 0);
            prepare_state(params_grad);

            ++t;

            const auto step  = learning_rate*get_learning_rate_multiplier(l);
            const auto decay = weight_decay*get_weight_decay_multiplier(l);

            tt::compute_adam_update(0, params.size(), s, m, v, t,
                step, decay,
                momentum1, momentum2, params, params_grad);

            return s;
        }

        // Fully connected layer with a bias: the last get_num_outputs()
        // parameters are handled as the bias block.
        template <unsigned long N>
        const tensor& operator() (
            const float learning_rate,
            const fc_<N,FC_HAS_BIAS>& l,
            const tensor& params_grad
        )
        {
            const unsigned long first_bias = params_grad.size()-l.get_num_outputs();
            update_considering_bias(learning_rate, l, params_grad, first_bias);
            return s;
        }

        // Convolution layer: the last num_filters() parameters are handled as
        // the bias block.
        template <
            long _num_filters,
            long _nr,
            long _nc,
            int _stride_y,
            int _stride_x,
            int _padding_y,
            int _padding_x
            >
        const tensor& operator() (
            const float learning_rate,
            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
            const tensor& params_grad
        )
        {
            const unsigned long first_bias = params_grad.size()-l.num_filters();
            update_considering_bias(learning_rate, l, params_grad, first_bias);
            return s;
        }

        // cont_ layer: same bias layout as the con_ overload above.
        template <
            long _num_filters,
            long _nr,
            long _nc,
            int _stride_y,
            int _stride_x,
            int _padding_y,
            int _padding_x
            >
        const tensor& operator() (
            const float learning_rate,
            const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
            const tensor& params_grad
        )
        {
            const unsigned long first_bias = params_grad.size()-l.num_filters();
            update_considering_bias(learning_rate, l, params_grad, first_bias);
            return s;
        }

        // bn_ layer: the second half of the parameters is handled as the bias
        // block.
        template < layer_mode mode >
        const tensor& operator() (
            const float learning_rate,
            const bn_<mode>& l,
            const tensor& params_grad
        )
        {
            const unsigned long first_bias = params_grad.size()/2;
            update_considering_bias(learning_rate, l, params_grad, first_bias);
            return s;
        }


        // Writes the "adam2" tag followed by m, v, s, weight_decay, momentum1,
        // momentum2 and t, in that order.
        friend void serialize(const adam& item, std::ostream& out)
        {
            serialize("adam2", out);
            serialize(item.m, out);
            serialize(item.v, out);
            serialize(item.s, out);
            serialize(item.weight_decay, out);
            serialize(item.momentum1, out);
            serialize(item.momentum2, out);
            serialize(item.t, out);
        }

        // Reads back what serialize() wrote.  Throws serialization_error when
        // the leading tag is anything other than "adam2".
        friend void deserialize(adam& item, std::istream& in)
        {
            std::string tag;
            deserialize(tag, in);
            if (tag != "adam2")
                throw serialization_error("Unexpected version found while deserializing dlib::adam.");

            deserialize(item.m, in);
            deserialize(item.v, in);
            deserialize(item.s, in);
            deserialize(item.weight_decay, in);
            deserialize(item.momentum1, in);
            deserialize(item.momentum2, in);
            deserialize(item.t, in);
        }

        // Prints a one line summary of the solver's settings.
        friend std::ostream& operator<< (std::ostream& out, const adam& item)
        {
            return out << "adam: weight_decay=" << item.get_weight_decay()
                       << ", momentum1=" << item.get_momentum1()
                       << ", momentum2=" << item.get_momentum2();
        }

    private:

        // On first use (v still empty) sizes m, v and s like params_grad and
        // zeros m and v.  s is only sized here; its contents are left as they
        // are.  Does nothing on later calls.
        void prepare_state(
            const tensor& params_grad
        )
        {
            if (v.size() != 0)
                return;

            m.copy_size(params_grad);
            m = 0;
            v.copy_size(params_grad);
            v = 0;
            s.copy_size(params_grad);
        }

        // Shared implementation for layers that have bias multipliers.
        // Elements [0, bias_offset) use the layer-wide learning rate and
        // weight decay; elements [bias_offset, params.size()) additionally
        // apply the layer's bias multipliers.  Both ranges are processed with
        // the same value of t.
        template <typename layer_type>
        void update_considering_bias(
            const float learning_rate,
            const layer_type& l,
            const tensor& params_grad,
            unsigned long bias_offset
        )
        {
            const tensor& params = l.get_layer_params();
            DLIB_CASSERT(params.size() != 0);
            prepare_state(params_grad);

            ++t;

            const auto step  = learning_rate*get_learning_rate_multiplier(l);
            const auto decay = weight_decay*get_weight_decay_multiplier(l);

            const bool bias_multipliers_are_one =
                l.get_bias_learning_rate_multiplier() == 1 &&
                l.get_bias_weight_decay_multiplier() == 1;

            if (bias_multipliers_are_one)
            {
                // Biases behave like every other parameter, so one pass over
                // the whole tensor is enough.
                tt::compute_adam_update(0, params.size(), s, m, v, t,
                    step, decay,
                    momentum1, momentum2, params, params_grad);
                return;
            }

            // Non-bias parameters first.
            tt::compute_adam_update(0, bias_offset, s, m, v, t,
                step, decay,
                momentum1, momentum2, params, params_grad);

            // Then the trailing bias block, with its multipliers folded in.
            tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
                step*l.get_bias_learning_rate_multiplier(),
                decay*l.get_bias_weight_decay_multiplier(),
                momentum1, momentum2, params, params_grad);
        }

        resizable_tensor m;     // zero-initialised state passed to compute_adam_update
        resizable_tensor v;     // zero-initialised state passed to compute_adam_update
        resizable_tensor s;     // step tensor returned by operator()
        float weight_decay;
        float momentum1;
        float momentum2;
        // Number of updates performed so far.
        // NOTE(review): stored as float, so ++t presumably stops advancing once
        // t reaches 2^24; confirm whether that matters to compute_adam_update.
        // Changing the type would also change the serialized layout.
        float t;
    };

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_DNn_SOLVERS_H_