# Copyright 2019 Seth V. Neel, Michael J. Kearns, Aaron L. Roth, Zhiwei Steven Wu
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
"""Functions for manipulating and loading input data."""
import argparse
import numpy as np
import pandas as pd


def setup():
    parser = argparse.ArgumentParser(description='Fairness Data Cleaning')
    parser.add_argument(
        '-n',
        '--name',
        type=str,
        help='name of the to store the new datasets (Required)')
    parser.add_argument('-d',
                        '--dataset',
                        type=str,
                        help='name of the original dataset file (Required)')
    parser.add_argument(
        '-a',
        '--attributes',
        type=str,
        help=
        'name of the file representing which attributes are protected (unprotected = 0, protected = 1, label = 2) (Required)'
    )
    parser.add_argument(
        '-c',
        '--centered',
        default=False,
        action='store_true',
        required=False,
        help='Include this flag to determine whether data should be centered')
    args = parser.parse_args()
    return [args.name, args.dataset, args.attributes, args.centered]


def clean_dataset(dataset, attributes, centered):
    """Load a dataset and split it into features, protected features and label.

    Args:
        :param dataset: Filename of the dataset CSV. String-valued columns
        are passed through ``one_hot_code``; the label is expected to be 0/1.
        :param attributes: Filename of the attributes CSV. Its header lists
        the column names and its first data row flags each column:
        0 = unprotected, 1 = protected, 2 = label.
        :param centered: boolean flag; when true both returned feature frames
        are passed through ``center``.
        :return X, X_prime, y: DataFrame of all features (label and any
        'Unnamed: 0' column removed), DataFrame of the protected features
        (including columns derived from them by one-hot encoding), and the
        label column as a Series.

    Raises:
        ValueError: if the attributes file flags zero or several label columns.
    """
    data = pd.read_csv(dataset)
    flags = pd.read_csv(attributes)

    # Exactly one column must carry the label flag (2).
    label_cols = [str(name) for name in flags.columns if flags[name][0] == 2]
    print('label feature: {}'.format(label_cols))
    if len(label_cols) > 1:
        raise ValueError('More than 1 label column used')
    if len(label_cols) < 1:
        raise ValueError('No label column used')
    label_name = label_cols[0]
    y = data[label_name]

    # Features are everything except the label and a stray saved-index column.
    is_feature = (data.columns != label_name) & (data.columns != 'Unnamed: 0')
    X = data.loc[:, is_feature]

    # Columns flagged 1 are the protected attributes.
    sens_cols = [str(name) for name in flags.columns if flags[name][0] == 1]
    print('sensitive features: {}'.format(sens_cols))
    sens_dict = {name: int(name in sens_cols) for name in data.columns}

    # Encoding may replace a protected column by several derived columns;
    # one_hot_code keeps sens_dict in step with the new column names.
    X, sens_dict = one_hot_code(X, sens_dict)
    sens_names = [name for name, flag in sens_dict.items() if flag == 1]
    print(
        'there are {} sensitive features including derivative features'.format(
            len(sens_names)))
    X_prime = X[sens_names]

    if not centered:
        return X, X_prime, y
    X = center(X)
    X_prime = center(X_prime)
    return X, X_prime, y


def center(X):
    """Subtract each column's mean from that column.

    The frame is modified in place and also returned, so both
    ``center(X)`` and ``X = center(X)`` work.

    Args:
        :param X: pandas DataFrame with numeric columns.
        :return X: the same DataFrame object, with every column centered.
        Centered columns are always floating point.
    """
    for col in X.columns:
        # Replace the whole column instead of writing through ``.loc``:
        # ``.loc`` assignment tries to keep the existing dtype, so centering
        # an integer column (e.g. a 0/1 indicator) either stays integer or
        # triggers pandas' deprecated silent upcast, depending on the data.
        X[col] = X[col] - X[col].mean()
    return X


def array_to_tuple(x):
    """Turn an ndarray into a tuple of each row's first element.

    Values whose class is not named ``ndarray`` are returned untouched.
    Per the original note, the conversion exists because get_baseline()
    (defined elsewhere) needs a hashable value.
    """
    if x.__class__.__name__ != 'ndarray':
        return x
    # Each item is indexed with [0], so items must themselves be indexable.
    return tuple(row[0] for row in x)


def one_hot_code(df1, sens_dict):
    """One-hot encode the string-valued columns of a DataFrame.

    A column is treated as categorical when its first entry is a ``str``.
    Each such column is removed and its encoding appended at the end of the
    frame:

    * more than two distinct values: one 0/1 indicator column per value,
      named ``'<column>.<i>'``. ``sens_dict['<column>']`` is copied to every
      indicator column and the original key is deleted.
    * at most two distinct values: a single 0/1 column that keeps the
      original name, with 1 marking the first distinct value. ``sens_dict``
      is left untouched for this column.

    Distinct values are ordered by their string form, so the column
    numbering (and which value of a binary column maps to 1) is the same on
    every run.

    Args:
        :param df1: pandas DataFrame to encode. It is not modified; a new
        frame is returned whenever a column is encoded.
        :param sens_dict: dict mapping column name to its protected flag.
        Updated in place.
        :return df1, sens_dict: the encoded DataFrame and the updated dict.

    Raises:
        KeyError: if a column with more than two distinct values has no
        entry in ``sens_dict``.
    """
    if len(df1) == 0:
        # No rows means there is no first entry to inspect; nothing to encode.
        return df1, sens_dict

    # Snapshot the column list: df1 is rebound to new frames inside the loop.
    for c in list(df1.columns):
        column = df1[c]
        # Positional access, so a non-default row index does not break the
        # type probe.
        if not isinstance(column.iloc[0], str):
            continue

        df1 = df1.drop(columns=c)
        # Ordering a set() of strings would follow hash randomisation and
        # change from run to run; sort for a reproducible encoding.
        unique_values = sorted(column.unique(), key=str)

        if len(unique_values) > 2:
            for i, value in enumerate(unique_values):
                col_name = '{}.{}'.format(c, i)
                # Plain array, so assignment is positional like the data.
                df1[col_name] = (column == value).to_numpy().astype('int64')
                sens_dict[col_name] = sens_dict[c]
            del sens_dict[c]
        else:
            df1[c] = (column == unique_values[0]).to_numpy().astype('int64')
    return df1, sens_dict


def extract_df_from_ds(dataset):
    """Split a dataset object into features, protected features and labels.

    Args:
         :param dataset: aif360 dataset. The code uses its
         ``convert_to_dataframe()``, ``label_names``,
         ``protected_attribute_names`` and ``labels`` members.

    Returns:
         :return X, X_prime, y: DataFrame of all non-label columns, DataFrame
         of the protected-attribute columns, and a tuple holding the first
         column of ``dataset.labels``.
    """
    converted = dataset.convert_to_dataframe()
    # The first item of the conversion result is the data; drop the labels.
    features = pd.DataFrame(converted[0]).drop(columns=dataset.label_names)
    protected = features[dataset.protected_attribute_names]
    labels = tuple(dataset.labels[:, 0])
    return features, protected, labels


def get_data(dataset):
    """Load the three CSV files stored for a named dataset.

    The files are read from the ``dataset/`` directory relative to the
    current working directory: ``<name>_features.csv``,
    ``<name>_protectedfeatures.csv`` and ``<name>_labels.csv``. The labels
    file is read with the explicit column names ``index`` and ``label``, so
    every line of it is treated as data.

    :param dataset: name of the dataset (prefix of the three files).
    :return X, X_prime, y: features DataFrame, protected-features DataFrame
    and the ``label`` column as a Series.
    """
    prefix = 'dataset/' + dataset
    features = pd.read_csv(prefix + '_features.csv')
    protected = pd.read_csv(prefix + '_protectedfeatures.csv')
    label_frame = pd.read_csv(prefix + '_labels.csv',
                              names=['index', 'label'])
    return features, protected, label_frame['label']