"""Dataset-level fairness measurements and AIF360 pre-processing helpers.

The ``fairness_calculation*`` functions load a dataset from CSV, binarise the
sensitive attribute and/or the label, and print the disparate impact of the
resulting frame.  The remaining functions wrap AIF360 pre-processing
algorithms (or a simple re-sampling scheme) and return a transformed frame.

Throughout this module the value ``0`` of the sensitive attribute denotes the
privileged group and ``1`` the unprivileged group; label ``1`` is the
favourable outcome.
"""
# NOTE(review): `label` is never used as a module-level name (it is only a
# parameter name below); this looks like an accidental auto-import - confirm
# before removing.
from cProfile import label

import networkx as nx
import numpy as np
import pandas as pd
from aif360.algorithms.preprocessing import DisparateImpactRemover, LFR, Reweighing
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric

# Alibaba: sensitive-attribute codes 1-3 -> 0, codes 4-6 -> 1.
_ALIBABA_SENS_BINS = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1}
# Alibaba: label code 1 -> 0, code 2 -> 1.
_ALIBABA_LABEL_BINS = {1: 0, 2: 1}

# Tecent: textual age buckets -> five ordinal groups ...
_TECENT_AGE_GROUPS = {
    '11~15': 0, '16~20': 0, '21~25': 0,
    '26~30': 1, '31~35': 1,
    '36~40': 2, '41~45': 2,
    '46~50': 3, '51~55': 3,
    '56~60': 4, '61~65': 4, '66~70': 4, '71~': 4,
}
# ... which are then collapsed to a binary attribute (group 0 is already 0).
_TECENT_GROUP_BINS = {1: 0, 2: 1, 3: 1, 4: 1}

# Pokec-z: values -1, 0, 1 -> 0 and values 2, 3, 4 -> 1.
_POKEC_Z_FIELD_BINS = {-1: 0, 0: 0, 1: 0, 2: 1, 3: 1, 4: 1}
_POKEC_FIELD_COLUMN = 'I_am_working_in_field'


def fairness_calculation(dataset_name, dataset_path, sens_attr, predict_attr):
    """Dispatch to the dataset-specific fairness calculation.

    Args:
        dataset_name: One of ``'nba'``, ``'alibaba'``, ``'tecent'``,
            ``'pokec_z'`` or ``'pokec_n'``.  Any other value is a silent no-op.
        dataset_path: Path of the CSV file to load.
        sens_attr: Name of the sensitive-attribute column.
        predict_attr: Name of the label column.
    """
    if dataset_name == 'nba':
        fairness_calculation_nba(dataset_path, sens_attr, predict_attr)
    elif dataset_name == 'alibaba':
        fairness_calculation_alibaba(dataset_path, sens_attr, predict_attr)
    elif dataset_name == 'tecent':
        fairness_calculation_tecent(dataset_path, sens_attr, predict_attr)
    elif dataset_name in ('pokec_z', 'pokec_n'):
        # The second positional argument is the dataset *name*; passing the
        # path here (as was done previously) meant the 'pokec_z' branch of the
        # callee could never match.
        fairness_calculation_pokec(dataset_path, dataset_name, sens_attr, predict_attr)


def fairness_calculation_nba(dataset_path, sens_attr, predict_attr):
    """Print the disparate impact of the NBA dataset.

    Ensures a numeric ``user_id`` column exists and maps label value ``-1``
    to ``0`` before measuring.

    Args:
        dataset_path: Path of the CSV file to load.
        sens_attr: Name of the sensitive-attribute column.
        predict_attr: Name of the label column.
    """
    df = pd.read_csv(dataset_path)

    # If the first column is not the id, promote the index to a 'user_id' column.
    if df.columns[0] != 'user_id':
        df = df.reset_index(level=0)
        df = df.rename(columns={"index": "user_id"})

    if not isinstance(df['user_id'][0], np.int64):
        df['user_id'] = pd.to_numeric(df['user_id'])
        df = df.astype({'user_id': int})

    df[predict_attr] = df[predict_attr].replace(-1, 0)
    disparate_impact(df, sens_attr, predict_attr)


def fairness_calculation_alibaba(dataset_path, sens_attr, label):
    """Print the disparate impact of the Alibaba dataset.

    The sensitive attribute is binarised (codes 1-3 -> 0, 4-6 -> 1) and the
    label is recoded (1 -> 0, 2 -> 1).

    Args:
        dataset_path: Path of the CSV file to load.
        sens_attr: Name of the sensitive-attribute column.
        label: Name of the label column.
    """
    df = pd.read_csv(dataset_path)
    # A dict-based replace applies all substitutions at once, which matches
    # the former chain of single replaces because no target value is itself
    # remapped by a later step.
    df[sens_attr] = df[sens_attr].replace(_ALIBABA_SENS_BINS)
    df[label] = df[label].replace(_ALIBABA_LABEL_BINS)
    disparate_impact(df, sens_attr, label)


def fairness_calculation_tecent(dataset_path, sens_attr, label):
    """Print the disparate impact of the Tecent dataset.

    When the sensitive attribute is ``'age_range'`` its textual buckets are
    first mapped to ordinal groups and then collapsed to a binary attribute.

    Args:
        dataset_path: Path of the CSV file to load.
        sens_attr: Name of the sensitive-attribute column.
        label: Name of the label column.

    Raises:
        KeyError: If ``age_range`` contains a bucket that is not listed in the
            age-group table.
    """
    df = pd.read_csv(dataset_path)

    # NOTE(review): the binarisation below is applied only for 'age_range'
    # (reconstructed from a flattened original) - confirm this scoping.
    if sens_attr == 'age_range':
        # A callable (rather than the dict itself) keeps the KeyError on
        # unknown buckets instead of silently producing NaN.
        df[sens_attr] = df[sens_attr].map(lambda bucket: _TECENT_AGE_GROUPS[bucket])
        df[sens_attr] = df[sens_attr].replace(_TECENT_GROUP_BINS)

    disparate_impact(df, sens_attr, label)


def fairness_calculation_pokec(dataset_path, dataset_name, sens_attr, label):
    """Print the disparate impact of a Pokec dataset.

    For ``'pokec_z'`` the ``I_am_working_in_field`` column is binarised
    (-1, 0, 1 -> 0 and 2, 3, 4 -> 1).  No recoding is done for ``'pokec_n'``.

    Args:
        dataset_path: Path of the CSV file to load.
        dataset_name: ``'pokec_z'`` or ``'pokec_n'``.
        sens_attr: Name of the sensitive-attribute column.
        label: Name of the label column.
    """
    df = pd.read_csv(dataset_path)

    if dataset_name == 'pokec_z':
        df[_POKEC_FIELD_COLUMN] = df[_POKEC_FIELD_COLUMN].replace(_POKEC_Z_FIELD_BINS)
    # NOTE(review): a pokec_n recoding (-1 -> 0; 0, 1, 2, 3 -> 1) existed in an
    # earlier revision but was disabled; it is intentionally not applied here.

    disparate_impact(df, sens_attr, label)


def dataset_fairness(df, sens_attr, label):
    """Print the difference in positive-label rates between the two groups.

    The printed value is ``(P(label=1 | sens=0) - P(label=1 | sens=1)) * 100``.

    Args:
        df: Frame holding the sensitive attribute and the label.
        sens_attr: Name of the sensitive-attribute column (values 0 / 1).
        label: Name of the label column (1 is the positive outcome).
    """
    in_group0 = df[sens_attr] == 0
    in_group1 = df[sens_attr] == 1
    is_positive = df[label] == 1

    total_sens0 = int(in_group0.sum())
    total_sens1 = int(in_group1.sum())
    positive_sens0 = int((in_group0 & is_positive).sum())
    positive_sens1 = int((in_group1 & is_positive).sum())

    # np.divide keeps numpy's division semantics for an empty group (inf/nan
    # with a RuntimeWarning) rather than raising ZeroDivisionError.
    rate_gap = np.divide(positive_sens0, total_sens0) - np.divide(positive_sens1, total_sens1)
    dataset_fairness_pct = rate_gap * 100
    print('Dataset fairness:', dataset_fairness_pct)


def disparate_impact(df, sens_attr, label):
    """Print the disparate impact of ``df``.

    Disparate impact is the positive-label rate of the unprivileged group
    (``sens_attr == 1``) divided by that of the privileged group
    (``sens_attr == 0``).

    Args:
        df: Frame holding the sensitive attribute and the label.
        sens_attr: Name of the sensitive-attribute column (values 0 / 1).
        label: Name of the label column (1 is the favourable outcome).

    Raises:
        ZeroDivisionError: If either group is empty or the privileged group
            has no positive labels.
    """
    pr_unpriv = calc_prop(df, sens_attr, 1, label, 1)
    pr_priv = calc_prop(df, sens_attr, 0, label, 1)
    disp = pr_unpriv / pr_priv

    # The AIF360 metric object is still constructed, as before, so the
    # hand-computed value can be cross-checked against
    # `metric_dataset.disparate_impact()`; its result is not printed.
    bin_label_dataset = _build_binary_label_dataset(df, sens_attr, label)
    privileged_groups, unprivileged_groups = _group_definitions(sens_attr)
    metric_dataset = BinaryLabelDatasetMetric(
        bin_label_dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )

    print('Dataset Fairness:', disp)


def calc_prop(data, group_col, group, output_col, output_val):
    """Return the share of rows in a group that have a given output value.

    Args:
        data: Frame to inspect.
        group_col: Column identifying the group.
        group: Value of ``group_col`` selecting the group.
        output_col: Column holding the outcome.
        output_val: Outcome value to count.

    Returns:
        ``count(group & output) / count(group)`` as a float.

    Raises:
        ZeroDivisionError: If no row belongs to the group.
    """
    members = data[data[group_col] == group]
    matching = members[members[output_col] == output_val]
    return len(matching) / len(members)


def _build_binary_label_dataset(df, sens_attr, label):
    """Wrap ``df`` in an AIF360 ``BinaryLabelDataset`` (1 favourable, 0 not)."""
    return BinaryLabelDataset(
        favorable_label=1,
        unfavorable_label=0,
        df=df,
        label_names=[label],
        protected_attribute_names=[sens_attr],
        unprivileged_protected_attributes=[1],
    )


def _group_definitions(sens_attr):
    """Return ``(privileged_groups, unprivileged_groups)`` for ``sens_attr``."""
    return [{sens_attr: 0}], [{sens_attr: 1}]


def _report_disparate_impact(original, transformed, privileged_groups, unprivileged_groups):
    """Print AIF360's disparate impact before and after a transformation."""
    metric_original_dataset = BinaryLabelDatasetMetric(
        original,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )
    metric_new_dataset = BinaryLabelDatasetMetric(
        transformed,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )
    print("Original Disparate impact (from AIF360) = %f" % metric_original_dataset.disparate_impact())
    print("After debaising Disparate impact (from AIF360) = %f" % metric_new_dataset.disparate_impact())


def disparate_impact_remover(df, sens_attr, label):
    """Apply AIF360's ``DisparateImpactRemover`` (repair level 1) to ``df``.

    Note that ``df`` is modified in place: a ``final_gender_code`` column is
    renamed to ``gender``; otherwise an ``age_level`` column is renamed to
    ``age``.

    Args:
        df: Input frame (numeric, with 0/1 sensitive attribute and label).
        sens_attr: Name of the sensitive-attribute column (after renaming).
        label: Name of the label column.

    Returns:
        The repaired data as a new DataFrame.
    """
    if 'final_gender_code' in df:
        df.rename(columns={'final_gender_code': 'gender'}, inplace=True)
    elif 'age_level' in df:
        df.rename(columns={'age_level': 'age'}, inplace=True)

    bin_label_dataset = _build_binary_label_dataset(df, sens_attr, label)
    di = DisparateImpactRemover(repair_level=1)
    di_transformation = di.fit_transform(bin_label_dataset)

    privileged_groups, unprivileged_groups = _group_definitions(sens_attr)
    _report_disparate_impact(bin_label_dataset, di_transformation, privileged_groups, unprivileged_groups)

    return di_transformation.convert_to_dataframe()[0]


def reweighting(df, sens_attr, label):
    """Apply AIF360's ``Reweighing`` pre-processor to ``df``.

    Args:
        df: Input frame (numeric, with 0/1 sensitive attribute and label).
        sens_attr: Name of the sensitive-attribute column.
        label: Name of the label column.

    Returns:
        The transformed data as a new DataFrame (first element of
        ``convert_to_dataframe()``).
    """
    print('we are in reweighting')
    bin_label_dataset = _build_binary_label_dataset(df, sens_attr, label)
    privileged_groups, unprivileged_groups = _group_definitions(sens_attr)

    RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    RW.fit(bin_label_dataset)
    rw_transformation = RW.transform(bin_label_dataset)

    _report_disparate_impact(bin_label_dataset, rw_transformation, privileged_groups, unprivileged_groups)

    return rw_transformation.convert_to_dataframe()[0]


def lfr(df, sens_attr, label):
    """Apply AIF360's ``LFR`` (learning fair representations) to ``df``.

    The fitted model is applied with a decision threshold of ``0.3`` and the
    result is aligned with the original dataset before measuring.

    Args:
        df: Input frame (numeric, with 0/1 sensitive attribute and label).
        sens_attr: Name of the sensitive-attribute column.
        label: Name of the label column.

    Returns:
        The transformed data as a new DataFrame.
    """
    bin_label_dataset = _build_binary_label_dataset(df, sens_attr, label)
    privileged_groups, unprivileged_groups = _group_definitions(sens_attr)

    TR = LFR(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    TR = TR.fit(bin_label_dataset)
    dset_lfr_trn = TR.transform(bin_label_dataset, threshold=0.3)
    dset_lfr_trn = bin_label_dataset.align_datasets(dset_lfr_trn)

    _report_disparate_impact(bin_label_dataset, dset_lfr_trn, privileged_groups, unprivileged_groups)

    return dset_lfr_trn.convert_to_dataframe()[0]


def sample(df, sens_attr, label):
    """Re-sample the four (sensitive value, label) groups of ``df``.

    Each group is sampled with replacement (``random_state=1``) to a computed
    target size; the samples are concatenated, de-duplicated and re-indexed.

    Args:
        df: Input frame.
        sens_attr: Name of the sensitive-attribute column (values 0 / 1).
        label: Name of the label column (values 0 / 1).

    Returns:
        A new DataFrame built from the sampled rows.

    Raises:
        ZeroDivisionError: If no row has ``sens_attr == 0`` and ``label == 1``.
    """
    print('we are in sample')
    sens0 = df[sens_attr] == 0
    sens1 = df[sens_attr] == 1
    positive = df[label] == 1
    negative = df[label] == 0

    dp = df.loc[sens0 & positive]
    dn = df.loc[sens0 & negative]
    fp = df.loc[sens1 & positive]
    fn = df.loc[sens1 & negative]

    n_sens0, n_sens1 = int(sens0.sum()), int(sens1.sum())
    n_positive, n_negative = int(positive.sum()), int(negative.sum())

    # NOTE(review): all four target sizes are divided by the size of the
    # (sens=0, label=1) group, exactly as before.  This looks like a
    # copy-paste slip (a per-group or dataset-size denominator seems more
    # plausible) - confirm the intended formula before changing it.
    shared_denominator = len(dp)
    wdp = n_sens0 * n_positive / shared_denominator
    wdn = n_sens0 * n_negative / shared_denominator
    wfp = n_sens1 * n_positive / shared_denominator
    wfn = n_sens1 * n_negative / shared_denominator

    sampled_groups = [
        group.sample(n=int(target_size), random_state=1, replace=True)
        for group, target_size in ((dp, wdp), (dn, wdn), (fp, wfp), (fn, wfn))
    ]

    df_new = pd.concat(sampled_groups).drop_duplicates().reset_index(drop=True)
    return df_new