#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Two-class logistic regression module with Prejudice Remover

The number of sensitive features is restricted to one, and the feature must
be binary.

Attributes
----------
EPSILON : float
    small positive constant
N_S : int
    the number of sensitive features
N_CLASSES : int
    the number of classes
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

#==============================================================================
# Module metadata variables
#==============================================================================

#==============================================================================
# Imports
#==============================================================================

import logging
import numpy as np
from scipy.optimize import fmin_cg
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin

#==============================================================================
# Public symbols
#==============================================================================

__all__ = ['LRwPRType4']

#==============================================================================
# Constants
#==============================================================================

EPSILON = 1.0e-10
SIGMOID_RANGE = np.log((1.0 - EPSILON) / EPSILON)
N_S = 1
N_CLASSES = 2

#==============================================================================
# Module variables
#==============================================================================

#==============================================================================
# Functions
#==============================================================================

def sigmoid(x, w):
    """ sigmoid(w^T x)

    To suppress the warnings at np.exp, do "np.seterr(all='ignore')"

    Parameters
    ----------
    x : array, shape=(d)
        input vector
    w : array, shape=(d)
        weight vector

    Returns
    -------
    sigmoid : float
        sigmoid(w^T x)
    """
    s = np.clip(np.dot(w, x), -SIGMOID_RANGE, SIGMOID_RANGE)
    return 1.0 / (1.0 + np.exp(-s))
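# Illustrative usage of sigmoid() (a hedged sketch by the editor, not part of
# the original API): the weight vector is the second argument, and w^T x is
# clipped to +/- SIGMOID_RANGE so that np.exp never overflows.
#
#   sigmoid(np.array([1.0, 2.0]), np.array([0.0, 0.0]))  # w^T x = 0   -> 0.5
#   sigmoid(np.array([1.0]), np.array([1000.0]))         # clipped     -> 1 - EPSILON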
#==============================================================================
# Classes
#==============================================================================

class LRwPR(BaseEstimator, ClassifierMixin):
    """ Two class LogisticRegression with Prejudice Remover

    Parameters
    ----------
    C : float
        regularization parameter
    eta : float
        penalty parameter
    fit_intercept : bool
        use a constant term
    penalty : str
        fixed to 'l2'

    Attributes
    ----------
    minor_type : int
        type of likelihood fitting
    `coef_` : array, shape=(n_features)
        parameters for logistic regression model
    `mx_` : array-like, shape=(n_sfv, n_nsf)
        mx_[si, :] is the mean of the rows of X whose corresponding sensitive
        feature is exactly si.
    `n_s_` : int
        the number of sensitive features
    `n_sfv_` : int
        the number of sensitive feature values
    `c_s_` : array, shape=(`n_sfv_`)
        the counts of each sensitive value in the training samples
    `n_features_` : int
        the number of non-sensitive features, including a bias constant
    `n_samples_` : int
        the number of samples
    `f_loss_` : float
        the value of the loss function after training
    """

    def __init__(self, C=1.0, eta=1.0, fit_intercept=True, penalty='l2'):
        if C < 0.0:
            raise ValueError('C, the regularization parameter, must be '
                             'non-negative')

        self.fit_intercept = fit_intercept
        self.penalty = penalty
        self.C = C
        self.eta = eta
        self.minor_type = 0
        self.f_loss_ = np.inf

    def predict(self, X):
        """ predict classes

        Parameters
        ----------
        X : array, shape=(n_samples, n_features)
            feature vectors of samples

        Returns
        -------
        y : array, shape=(n_samples), dtype=int
            array of predicted classes
        """
        return np.argmax(self.predict_proba(X), 1)


class LRwPRPredictProbaType2Mixin(LRwPR):
    """ mixin for single type 2 likelihood """

    def predict_proba(self, X):
        """ predict probabilities

        A set of weight vectors, whose size is the same as the number of
        sensitive feature values, is available, and a weight vector is
        selected according to the value of the sensitive feature.

        Parameters
        ----------
        X : array, shape=(n_samples, n_features)
            feature vectors of samples

        Returns
        -------
        y_proba : array, shape=(n_samples, n_classes), dtype=float
            array of predicted class probabilities
        """
        # separate the sensitive feature and add a constant term
        s = np.atleast_1d(np.squeeze(np.array(X)[:, -self.n_s_]).astype(int))
        if self.fit_intercept:
            X = np.c_[np.atleast_2d(X)[:, :-self.n_s_], np.ones(X.shape[0])]
        else:
            X = np.atleast_2d(X)[:, :-self.n_s_]

        coef = self.coef_.reshape(self.n_sfv_, self.n_features_)

        proba = np.empty((X.shape[0], N_CLASSES))
        proba[:, 1] = [sigmoid(X[i, :], coef[s[i], :])
                       for i in range(X.shape[0])]
        proba[:, 0] = 1.0 - proba[:, 1]

        return proba


class LRwPRFittingType1Mixin(LRwPR):
    """ Fitting Method Mixin """

    def init_coef(self, itype, X, y, s):
        """ set initial weights

        Initialization methods are specified by `itype`:

        * 0: cleared by 0
        * 1: follows standard normal distribution
        * 2: learned by standard logistic regression
        * 3: learned by standard logistic regression separately according to
          the value of the sensitive feature

        Parameters
        ----------
        itype : int
            type of initialization method
        X : array, shape=(n_samples, n_features)
            feature vectors of samples
        y : array, shape=(n_samples)
            target class of samples
        s : array, shape=(n_samples)
            values of sensitive features
        """
        if itype == 0:
            # clear by zeros
            self.coef_ = np.zeros(self.n_sfv_ * self.n_features_,
                                  dtype=float)
        elif itype == 1:
            # at random
            self.coef_ = np.random.randn(self.n_sfv_ * self.n_features_)
        elif itype == 2:
            # learned by standard logistic regression
            self.coef_ = np.empty(self.n_sfv_ * self.n_features_,
                                  dtype=float)
            coef = self.coef_.reshape(self.n_sfv_, self.n_features_)

            clr = LogisticRegression(C=self.C, penalty='l2',
                                     fit_intercept=False)
            clr.fit(X, y)

            coef[:, :] = clr.coef_
        elif itype == 3:
            # learned by standard logistic regression, separately for each
            # value of the sensitive feature
            self.coef_ = np.empty(self.n_sfv_ * self.n_features_,
                                  dtype=float)
            coef = self.coef_.reshape(self.n_sfv_, self.n_features_)

            for i in range(self.n_sfv_):
                clr = LogisticRegression(C=self.C, penalty='l2',
                                         max_iter=1000, fit_intercept=False)
                clr.fit(X[s == i, :], y[s == i])
                coef[i, :] = clr.coef_
        else:
            raise ValueError('illegal initialization type: ' + str(itype))
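    # A hedged sketch of the warm-start performed by itype=2/3 above (the
    # names `X_nonsensitive`, `y`, and `n_sfv` are only illustrative): the
    # per-group coefficient matrix is filled from a plain LogisticRegression,
    # so the prejudice-remover optimization starts near an ordinary
    # maximum-likelihood solution.
    #
    #   clr = LogisticRegression(C=1.0, penalty='l2', fit_intercept=False)
    #   clr.fit(X_nonsensitive, y)               # X without the sensitive column
    #   coef = np.tile(clr.coef_, (n_sfv, 1))    # one copy per sensitive value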
    def fit(self, X, y, ns=N_S, itype=0, **kwargs):
        """ train this model

        Parameters
        ----------
        X : array, shape=(n_samples, n_features)
            feature vectors of samples
        y : array, shape=(n_samples)
            target class of samples
        ns : int
            number of sensitive features; currently fixed to N_S
        itype : int
            type of initialization method
        kwargs : any
            arguments to the optimizer
        """

        # rearrange input arguments
        s = np.atleast_1d(np.squeeze(np.array(X)[:, -ns]).astype(int))
        if self.fit_intercept:
            X = np.c_[np.atleast_2d(X)[:, :-ns], np.ones(X.shape[0])]
        else:
            X = np.atleast_2d(X)[:, :-ns]

        # check optimization parameters
        if 'disp' not in kwargs:
            kwargs['disp'] = False
        if 'maxiter' not in kwargs:
            kwargs['maxiter'] = 100

        # set instance variables
        self.n_s_ = ns
        self.n_sfv_ = np.max(s) + 1
        self.c_s_ = np.array([np.sum(s == si).astype(float)
                              for si in range(self.n_sfv_)])
        self.n_features_ = X.shape[1]
        self.n_samples_ = X.shape[0]

        # optimization
        self.init_coef(itype, X, y, s)
        self.coef_ = fmin_cg(self.loss,
                             self.coef_,
                             fprime=self.grad_loss,
                             args=(X, y, s),
                             **kwargs)

        # get final loss
        self.f_loss_ = self.loss(self.coef_, X, y, s)


class LRwPRObjetiveType4Mixin(LRwPR):
    """ objective function of logistic regression with prejudice remover

    Loss function type 4: weights for logistic regression are prepared for
    each value of S. The penalty for enhancing fairness is defined as the
    mutual information between Y and S.
    """

    def loss(self, coef_, X, y, s):
        """ loss function: negative log-likelihood with l2 regularizer

        To suppress the warnings at np.log, do "np.seterr(all='ignore')"

        Parameters
        ----------
        `coef_` : array, shape=(`n_sfv_` * n_features)
            coefficients of model
        X : array, shape=(n_samples, n_features)
            feature vectors of samples
        y : array, shape=(n_samples)
            target class of samples
        s : array, shape=(n_samples)
            values of sensitive features

        Returns
        -------
        loss : float
            loss function value
        """

        coef = coef_.reshape(self.n_sfv_, self.n_features_)
#        print >> sys.stderr, "loss:", coef[0, :], coef[1, :]

        ### constants

        # sigma = Pr[y=1|x,s] = sigmoid(w(s)^T x)
        p = np.array([sigmoid(X[i, :], coef[s[i], :])
                      for i in range(self.n_samples_)])

        # rho(s) = Pr[y=1|s] = \sum_{(xi,si)in D st si=s} sigma(xi,si) / #D[s]
        q = np.array([np.sum(p[s == si])
                      for si in range(self.n_sfv_)]) / self.c_s_

        # pi = Pr[y=1] = \sum_{(xi,si)in D} sigma(xi,si) / #D
        r = np.sum(p) / self.n_samples_

        ### loss function

        # likelihood
        # \sum_{x,s,y in D} y log(sigma) + (1 - y) log(1 - sigma)
        l = np.sum(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))

        # fairness-aware regularizer
        # \sum_{x,s in D} \
        #     sigma(x,s)       [log(rho(s))     - log(pi)    ] + \
        #     (1 - sigma(x,s)) [log(1 - rho(s)) - log(1 - pi)]
        f = np.sum(p * (np.log(q[s]) - np.log(r))
                   + (1.0 - p) * (np.log(1.0 - q[s]) - np.log(1.0 - r)))

        # l2 regularizer
        reg = np.sum(coef * coef)

        l = -l + self.eta * f + 0.5 * self.C * reg
#        print >> sys.stderr, l

        return l
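    # Reading of the regularizer in loss() above (an editor's restatement of
    # the comments, not an additional term): with sigma_i = Pr[y=1|x_i, s_i],
    # rho(s) and pi estimated as in the code, the penalty f is the plug-in
    # estimate of the mutual information between Y and S,
    #
    #   f = sum_i [ sigma_i * log(rho(s_i) / pi)
    #               + (1 - sigma_i) * log((1 - rho(s_i)) / (1 - pi)) ]
    #
    # and grad_loss() below differentiates exactly this expression, together
    # with the log-likelihood and the l2 term, with respect to each w(s).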
    def grad_loss(self, coef_, X, y, s):
        """ first derivative of the loss function

        Parameters
        ----------
        `coef_` : array, shape=(`n_sfv_` * n_features)
            coefficients of model
        X : array, shape=(n_samples, n_features)
            feature vectors of samples
        y : array, shape=(n_samples)
            target class of samples
        s : array, shape=(n_samples)
            values of sensitive features

        Returns
        -------
        grad_loss : array, shape=(`n_sfv_` * n_features)
            first derivative of the loss function
        """

        coef = coef_.reshape(self.n_sfv_, self.n_features_)
        l_ = np.empty(self.n_sfv_ * self.n_features_)
        l = l_.reshape(self.n_sfv_, self.n_features_)
#        print >> sys.stderr, "grad_loss:", coef[0, :], coef[1, :]

        ### constants
        # prefix "d_": derivative with respect to w(s)

        # sigma = Pr[y=1|x,s] = sigmoid(w(s)^T x)
        # d_sigma(x,s) = d sigma / d w(s) = sigma (1 - sigma) x
        p = np.array([sigmoid(X[i, :], coef[s[i], :])
                      for i in range(self.n_samples_)])
        dp = (p * (1.0 - p))[:, np.newaxis] * X

        # rho(s) = Pr[y=1|s] = \sum_{(xi,si)in D st si=s} sigma(xi,si) / #D[s]
        # d_rho(s) = \sum_{(xi,si)in D st si=s} d_sigma(xi,si) / #D[s]
        q = np.array([np.sum(p[s == si])
                      for si in range(self.n_sfv_)]) / self.c_s_
        dq = np.array([np.sum(dp[s == si, :], axis=0)
                       for si in range(self.n_sfv_)]) \
            / self.c_s_[:, np.newaxis]

        # pi = Pr[y=1] = \sum_{(xi,si)in D} sigma(xi,si) / #D
        # d_pi = \sum_{(xi,si)in D} d_sigma(xi,si) / #D
        r = np.sum(p) / self.n_samples_
        dr = np.sum(dp, axis=0) / self.n_samples_

        # likelihood
        # l(si) = \sum_{x,y in D st s=si} (y - sigma(x, si)) x
        for si in range(self.n_sfv_):
            l[si, :] = np.sum((y - p)[s == si][:, np.newaxis] * X[s == si, :],
                              axis=0)

        # fairness-aware regularizer
        # differentiated with respect to w(s)
        # \sum_{x,s in D st s=si} \
        #     [(log(rho(si)) - log(pi)) - (log(1 - rho(si)) - log(1 - pi))] \
        #     * d_sigma
        # + \sum_{x,s in D st s=si} \
        #     [ {sigma(xi, si) - rho(si)} / {rho(si) (1 - rho(si))} ] * d_rho
        # - \sum_{x,s in D st s=si} \
        #     [ {sigma(xi, si) - pi} / {pi (1 - pi)} ] * d_pi
        f1 = (np.log(q[s]) - np.log(r)) \
            - (np.log(1.0 - q[s]) - np.log(1.0 - r))
        f2 = (p - q[s]) / (q[s] * (1.0 - q[s]))
        f3 = (p - r) / (r * (1.0 - r))
        f4 = f1[:, np.newaxis] * dp \
            + f2[:, np.newaxis] * dq[s, :] \
            - np.outer(f3, dr)
        f = np.array([np.sum(f4[s == si, :], axis=0)
                      for si in range(self.n_sfv_)])

        # l2 regularizer
        reg = coef

        # compose the gradient of the total loss
        l[:, :] = -l + self.eta * f + self.C * reg
#        print >> sys.stderr, "l =", l

        return l_


class LRwPRType4(LRwPRObjetiveType4Mixin,
                 LRwPRFittingType1Mixin,
                 LRwPRPredictProbaType2Mixin):
    """ Two class LogisticRegression with Prejudice Remover

    Parameters
    ----------
    C : float
        regularization parameter
    eta : float
        penalty parameter
    fit_intercept : bool
        use a constant term
    penalty : str
        fixed to 'l2'
    """

    def __init__(self, C=1.0, eta=1.0, fit_intercept=True, penalty='l2'):

        super(LRwPRType4, self).__init__(C=C, eta=eta,
                                         fit_intercept=fit_intercept,
                                         penalty=penalty)

        self.coef_ = None
        self.mx_ = None
        self.n_s_ = 0
        self.n_sfv_ = 0
        self.minor_type = 4

#==============================================================================
# Module initialization
#==============================================================================

# init logging system
logger = logging.getLogger('fadm')
if not logger.handlers:
    logger.addHandler(logging.NullHandler())

#==============================================================================
# Test routine
#==============================================================================

def _test():
    """ test function for this module
    """

    # perform doctest
    import sys
    import doctest

    doctest.testmod()

    sys.exit(0)

# Check if this is called as a command script
if __name__ == '__main__':
    _test()
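# End-to-end usage sketch (an illustrative, hedged example; `X`, `y`, and the
# 0/1 sensitive coding are placeholders supplied by the caller, not defined in
# this module). The binary sensitive feature must occupy the last column of X:
#
#   clf = LRwPRType4(eta=1.0, C=1.0)
#   clf.fit(X, y, itype=2)            # X[:, -1] holds the sensitive value
#   proba = clf.predict_proba(X)      # shape (n_samples, 2)
#   labels = clf.predict(X)           # argmax over the two classes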