import contextlib
import sys

import numpy as np
import tensorflow as tf
from scipy.optimize import minimize

import idnns.information.entropy_estimators as ee
from idnns.networks import model as mo


@contextlib.contextmanager
def printoptions(*args, **kwargs):
    """Temporarily override numpy print options, restoring them on exit."""
    original = np.get_printoptions()
    np.set_printoptions(*args, **kwargs)
    try:
        yield
    finally:
        np.set_printoptions(**original)


def optimize_func(s, diff_mat, d, N):
    """Negative leave-one-out log-likelihood of a kernel density with width s.

    `diff_mat` holds the pairwise distances between samples; minimizing this
    over s gives a data-driven kernel width for the MI estimate below.
    """
    # (2*pi)^(d/2) * s^d is the normalization of a d-dimensional isotropic
    # Gaussian; the original code used sqrt(2*pi), which only shifts the
    # objective by a constant and does not move the optimum over s.
    kernel_mat = (1. / ((2. * np.pi) ** (d / 2.) * (s ** 2) ** (d / 2.))) * np.exp(-diff_mat / (2. * s ** 2))
    # Leave-one-out: exclude each point's own kernel contribution.
    np.fill_diagonal(kernel_mat, 0)
    leave_one_out_density = (1. / (N - 1)) * np.sum(kernel_mat, axis=0)
    log_likelihood = np.sum(np.log2(leave_one_out_density), axis=0)
    return -log_likelihood


def calc_all_sigams(data, sigmas):
    """Estimate I(X;T) for each noise width in `sigmas` using a KDE estimator.

    `data` is a (num_samples, num_features) matrix of layer activations; the
    estimate is computed per batch of 128 samples and averaged over batches
    (samples past the last full batch are ignored).
    """
    batch_size = 128
    batch_points = np.rint(np.arange(0, data.shape[0] + 1, batch_size)).astype(dtype=np.int32)
    I_XT = []
    num_of_rand = min(800, data.shape[1])
    for sigma in sigmas:
        I_XT_temp = 0
        for i in range(0, len(batch_points) - 1):
            new_data = data[batch_points[i]:batch_points[i + 1], :]
            # Cap the dimensionality at 800 by sub-sampling features; the
            # original code computed random indices but never applied them.
            if new_data.shape[1] > num_of_rand:
                rand_indices = np.random.choice(new_data.shape[1], num_of_rand, replace=False)
                new_data = new_data[:, rand_indices]
            N = new_data.shape[0]
            d = new_data.shape[1]
            # NOTE: the kernels below are evaluated on plain (not squared)
            # pairwise Euclidean distances, as in the original implementation.
            diff_mat = np.linalg.norm(new_data[:, np.newaxis, :] - new_data, axis=2)
            # Pick the kernel width by minimizing the leave-one-out objective.
            # TODO: add leave-one-out cross-validation.
            s0 = 0.2
            res = minimize(optimize_func, s0, args=(diff_mat, d, N), method='nelder-mead',
                           options={'xtol': 1e-8, 'disp': False, 'maxiter': 6})
            eta = res.x[0]
            diff_mat0 = -0.5 * (diff_mat / (sigma ** 2 + eta ** 2))
            diff_mat1 = np.sum(np.exp(diff_mat0), axis=0)
            # Entropy-like term of the noise-smoothed representation, in bits.
            diff_mat2 = -(1.0 / N) * np.sum(np.log2((1.0 / N) * diff_mat1))
            I_XT_temp += diff_mat2 - d * np.log2((sigma ** 2) / (eta ** 2 + sigma ** 2))
        # Average over the number of batches; the original divided by
        # len(batch_points), which over-counts by one.
        num_batches = max(len(batch_points) - 1, 1)
        I_XT_temp /= num_batches
        I_XT.append(I_XT_temp)
    sys.stdout.flush()
    return I_XT
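# ---------------------------------------------------------------------------
# Minimal usage sketch for the KDE-based I(X;T) estimator above. This demo is
# not part of the original module: the standard-normal "activations", the
# sample sizes, and the sigma grid are arbitrary illustrative choices.
# ---------------------------------------------------------------------------
def _demo_calc_all_sigams(num_samples=256, dim=10, seed=0):
    """Run calc_all_sigams on synthetic Gaussian data and print the estimates."""
    rng = np.random.RandomState(seed)
    data = rng.randn(num_samples, dim)  # fake layer activations, shape (N, d)
    sigmas = [0.5, 1.0, 2.0]            # noise widths to evaluate
    estimates = calc_all_sigams(data, sigmas)
    for sigma, ixt in zip(sigmas, estimates):
        print('sigma={:.2f} -> I(X;T) estimate: {:.3f}'.format(sigma, ixt))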
def estimate_IY_by_network(data, labels, from_layer=0):
    """Estimate I(T;Y) by training a decoder network from the layer to the labels.

    For layers below 5 a small decoder (400-100-50 hidden units) is trained on
    top of the given representation; for the last layers the representation is
    already a distribution over the labels and is used directly.
    """
    if len(data.shape) > 2:
        input_size = data.shape[1:]
    else:
        input_size = data.shape[1]
    p_y_given_t_i = data
    acc_all = [0]
    if from_layer < 5:
        acc_all = []
        # A dedicated graph, so repeated calls do not pollute the default graph.
        g1 = tf.Graph()
        with g1.as_default():
            # For each epoch and each layer we fit the best decoder we can:
            # a small fully connected network trained on the representation.
            cov_net = 4
            model = mo.Model(input_size, [400, 100, 50], labels.shape[1], 0.0001, '', cov_net=cov_net,
                             from_layer=from_layer)
            optimizer = model.optimize
            init = tf.global_variables_initializer()
            num_of_epochs = 50
            batch_size = 51
            batch_points = np.rint(np.arange(0, data.shape[0] + 1, batch_size)).astype(dtype=np.int32)
            if data.shape[0] not in batch_points:
                batch_points = np.append(batch_points, [data.shape[0]])
            with tf.Session(graph=g1) as sess:
                sess.run(init)
                # Train the decoder.
                for j in range(0, num_of_epochs):
                    for i in range(0, len(batch_points) - 1):
                        batch_xs = data[batch_points[i]:batch_points[i + 1], :]
                        batch_ys = labels[batch_points[i]:batch_points[i + 1], :]
                        feed_dict = {model.x: batch_xs, model.labels: batch_ys}
                        if cov_net == 1:
                            # `drouput` is the (misspelled) placeholder name
                            # defined in idnns.networks.model.
                            feed_dict[model.drouput] = 0.5
                        optimizer.run(feed_dict)
                # Collect p(y|t) predictions and accuracy over the full data.
                batch_size = 256
                batch_points = np.rint(np.arange(0, data.shape[0] + 1, batch_size)).astype(dtype=np.int32)
                if data.shape[0] not in batch_points:
                    batch_points = np.append(batch_points, [data.shape[0]])
                p_y_given_t_i = []
                for i in range(0, len(batch_points) - 1):
                    batch_xs = data[batch_points[i]:batch_points[i + 1], :]
                    batch_ys = labels[batch_points[i]:batch_points[i + 1], :]
                    feed_dict = {model.x: batch_xs, model.labels: batch_ys}
                    if cov_net == 1:
                        feed_dict[model.drouput] = 1
                    p_y_given_t_i_local, acc = sess.run([model.prediction, model.accuracy],
                                                        feed_dict=feed_dict)
                    acc_all.append(acc)
                    if i == 0:
                        p_y_given_t_i = np.array(p_y_given_t_i_local)
                    else:
                        p_y_given_t_i = np.concatenate((p_y_given_t_i, np.array(p_y_given_t_i_local)), axis=0)
    # I(T;Y) = H(Y) - H(Y|T), with H(Y|T) estimated by the decoder's
    # cross-entropy on the true labels (in bits).
    max_indx = len(p_y_given_t_i)
    labels_cut = labels[:max_indx, :]
    true_label_index = np.argmax(labels_cut, 1)
    s = np.log2(p_y_given_t_i[np.arange(len(p_y_given_t_i)), true_label_index])
    I_TY = np.mean(s[np.isfinite(s)])
    PYs = np.sum(labels_cut, axis=0) / labels_cut.shape[0]
    Hy = np.nansum(-PYs * np.log2(PYs + np.spacing(1)))
    I_TY = Hy + I_TY
    # Clip small negative estimates (possible due to estimation error) to zero.
    I_TY = I_TY if I_TY >= 0 else 0
    acc = np.mean(acc_all)
    sys.stdout.flush()
    return I_TY, acc


def calc_varitional_information(data, labels, model_path, layer_numer, num_of_layers, epoch_index, input_size,
                                layerSize, sigma, pys, ks, search_sigma=False, estimate_y_by_network=False):
    """Estimate the information quantities of a layer using the variational IB bounds."""
    # Assumptions for this configuration: I(T;Y) is always estimated with a
    # decoder network, and the KDE-based I(X;T) estimate is disabled.
    estimate_y_by_network = True
    calc_I_XT_by_kde = False
    data_x = data.reshape(data.shape[0], -1)
    if search_sigma:
        sigmas = np.linspace(0.2, 10, 20)
    else:
        sigmas = [sigma]
    if calc_I_XT_by_kde:
        I_XT = calc_all_sigams(data_x, sigmas)
    else:
        I_XT = 0
    if estimate_y_by_network:
        I_TY, acc = estimate_IY_by_network(data, labels, from_layer=layer_numer)
    else:
        I_TY, acc = 0, 0
    with printoptions(precision=3, suppress=True, formatter={'float': '{: 0.3f}'.format}):
        print('[{0}:{1}] - I(X;T) - {2}, I(T;Y) - {3}, accuracy - {4}'.format(
            epoch_index, layer_numer, np.array(I_XT).flatten(), I_TY, acc))
    sys.stdout.flush()
    params = {'local_IXT': I_XT, 'local_ITY': I_TY}
    return params


def estimate_Information(Xs, Ys, Ts):
    """Estimate I(X;T) and I(Y;T) with the non-parametric k-NN (KSG) estimator."""
    estimate_IXT = ee.mi(Xs, Ts)
    estimate_IYT = ee.mi(Ys, Ts)
    return estimate_IXT, estimate_IYT
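# ---------------------------------------------------------------------------
# Minimal usage sketch for estimate_Information. Illustrative only: the arrays
# below are synthetic, and the call assumes ee.mi accepts two (num_samples,
# dim) sample arrays, in the style of the NPEET estimators that this module's
# entropy_estimators is based on. A layer T that is a noisy copy of X should
# give a clearly positive I(X;T), while a Y independent of T should give an
# estimate near zero.
# ---------------------------------------------------------------------------
def _demo_estimate_information(num_samples=500, seed=0):
    """Sanity-check the k-NN MI estimates on dependent and independent pairs."""
    rng = np.random.RandomState(seed)
    Xs = rng.randn(num_samples, 3)
    Ts = Xs + 0.1 * rng.randn(num_samples, 3)  # T = X + noise -> high I(X;T)
    Ys = rng.randn(num_samples, 2)             # independent of T -> I(Y;T) ~ 0
    i_xt, i_yt = estimate_Information(Xs, Ys, Ts)
    print('I(X;T) ~ {:.3f}, I(Y;T) ~ {:.3f}'.format(i_xt, i_yt))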