File size: 6,411 Bytes
d2a8669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from turtle import pd
import numpy as np
import pandas as pd
import dgl
from fainress_component import disparate_impact_remover, reweighting, sample
import fastText
import torch

def pokec_z_RHGN_pre_process(df, dataset_user_id_name, sens_attr, label, debaising_approach=None):
    """Turn a Pokec-z user table into a DGL heterograph plus fastText features.

    The frame must contain the columns ``user_id``, ``completion_percentage``,
    ``AGE``, ``I_am_working_in_field``, ``gender``, ``region`` and
    ``spoken_languages_indicator``; all of them are read by name below.

    Args:
        df: pandas DataFrame with one row per user. It is modified in place
            (working-field recoding, id/label remapping).
        dataset_user_id_name: name of the user-id column used for the final
            id remapping. NOTE(review): every other step hard-codes
            ``'user_id'`` -- confirm callers always pass that name.
        sens_attr: sensitive attribute, forwarded to the debiasing helper.
        label: target label, forwarded to the debiasing helper.
        debaising_approach: ``None`` (no debiasing),
            ``'disparate_impact_remover'``, ``'reweighting'`` or ``'sample'``.
            Any other value is treated like ``None``.

    Returns:
        Tuple ``(G, cid1_feature, cid2_feature, cid3_feature)``: the
        heterograph and one tensor per vocabulary (completion percentage,
        age, working field) whose row ``i`` is the fastText sentence vector
        of the value that was assigned index ``i``.
    """
    # Collapse the working-field codes to a binary flag. The replacements are
    # applied one after another, in this order, so freshly written 1s are not
    # remapped again by the earlier 1 -> 0 rule.
    for old_code, new_code in ((-1, 0), (1, 0), (2, 1), (3, 1), (4, 1)):
        df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(old_code, new_code)

    category_as_str = {'completion_percentage': 'str', 'AGE': 'str', 'I_am_working_in_field': 'str'}

    # 'sample' needs the original dtypes, so its string cast happens after
    # the helper has run; every other mode is cast up front.
    if debaising_approach != 'sample':
        df = df.astype({'user_id': 'str'}, copy=False)
        df = df.astype(category_as_str, copy=False)

    if debaising_approach == 'disparate_impact_remover':
        df = disparate_impact_remover(df, sens_attr, label)
    elif debaising_approach == 'reweighting':
        df = reweighting(df, sens_attr, label)
    elif debaising_approach == 'sample':
        df = sample(df, sens_attr, label)
        df = df.astype({'user_id': 'str'}, copy=False)
        df = df.astype(category_as_str, copy=False)

    if debaising_approach in ('reweighting', 'disparate_impact_remover'):
        # Round-trip through int64 before the string cast so the vocabulary
        # keys are integer strings; presumably these helpers hand back
        # float-typed columns -- TODO confirm.
        for column in ('user_id', 'completion_percentage', 'AGE', 'I_am_working_in_field'):
            df[column] = df[column].astype(np.int64).astype(str)

    # Value -> dense index, numbered in order of first appearance.
    user_dic = {k: v for v, k in enumerate(df.user_id.drop_duplicates())}
    comp_dic = {k: v for v, k in enumerate(df.completion_percentage.drop_duplicates())}
    age_dic = {k: v for v, k in enumerate(df.AGE.drop_duplicates())}
    working_dic = {k: v for v, k in enumerate(df.I_am_working_in_field.drop_duplicates())}

    # Encode every row positionally. Looking rows up by label (df.at[i, ...])
    # breaks as soon as the index is not 0..n-1, e.g. after resampling.
    c1 = [comp_dic[value] for value in df['completion_percentage']]
    c2 = [age_dic[value] for value in df['AGE']]
    c3 = [working_dic[value] for value in df['I_am_working_in_field']]

    print(min(c1), min(c2), min(c3))
    print(len(comp_dic), len(age_dic), len(working_dic))

    # One edge per row, in both directions; the "item" endpoint is the row's
    # AGE index.
    has_user = [user_dic[user] for user in df.user_id]
    is_made_by_user = [age_dic[item] for item in df.AGE]

    data_dict = {
        ("user", "has", "item"): (torch.tensor(has_user), torch.tensor(is_made_by_user)),
        ("item", "is_made_by", "user"): (torch.tensor(is_made_by_user), torch.tensor(has_user))
    }

    G = dgl.heterograph(data_dict)

    # Use the module name the file actually imports (`fastText`); the
    # lower-case `fasttext` is never bound in this module.
    model = fastText.load_model('../cc.zh.200.bin')

    cid1_feature = _vocab_sentence_vectors(model, comp_dic)
    cid2_feature = _vocab_sentence_vectors(model, age_dic)
    cid3_feature = _vocab_sentence_vectors(model, working_dic)

    # Re-index the user ids; when an id repeats, the position of its last
    # row wins because later entries overwrite earlier ones in the dict.
    uid2id = {num: i for i, num in enumerate(df[dataset_user_id_name])}

    df_user = col_map(df, dataset_user_id_name, uid2id)
    # Every column except the first is label-encoded.
    user_label = label_map(df_user, df_user.columns[1:])

    label_age = user_label.AGE
    label_comp_perc = user_label.completion_percentage
    label_gender = user_label.gender
    label_region = user_label.region
    label_working = user_label.I_am_working_in_field
    label_lang = user_label.spoken_languages_indicator

    # Node data is taken from the leading rows, truncated to the node count.
    num_users = G.number_of_nodes('user')
    G.nodes['user'].data['age'] = torch.tensor(label_age[:num_users])
    G.nodes['user'].data['completion_percentage'] = torch.tensor(label_comp_perc[:num_users])
    G.nodes['user'].data['gender'] = torch.tensor(label_gender[:num_users])
    G.nodes['user'].data['region'] = torch.tensor(label_region[:num_users])
    G.nodes['user'].data['I_am_working_in_field'] = torch.tensor(label_working[:num_users])
    G.nodes['user'].data['spoken_languages_indicator'] = torch.tensor(label_lang[:num_users])

    num_items = G.number_of_nodes('item')
    G.nodes['item'].data['cid1'] = torch.tensor(c1[:num_items])
    G.nodes['item'].data['cid2'] = torch.tensor(c2[:num_items])
    G.nodes['item'].data['cid3'] = torch.tensor(c3[:num_items])

    print(G)
    print(cid1_feature.shape)
    print(cid2_feature.shape)
    print(cid3_feature.shape)

    return G, cid1_feature, cid2_feature, cid3_feature


def _vocab_sentence_vectors(model, vocab):
    """Stack the fastText sentence vector of each key of ``vocab`` in index order."""
    # The vocabularies are numbered by enumerate(), so dict insertion order
    # is exactly the index order 0..n-1.
    return torch.tensor([model.get_sentence_vector(key) for key in vocab])


def col_map(df, col, num2id):
    """Replace every value of column ``col`` with ``num2id[value]``, in place.

    Args:
        df: pandas DataFrame holding the column; it is modified in place.
        col: name of the column to remap.
        num2id: mapping from the column's current values to their new ones.

    Returns:
        The same ``df`` object, for call chaining.

    Raises:
        KeyError: if the column contains a value that ``num2id`` lacks.
    """
    # Series.map replaces the deprecated DataFrame.applymap. The lambda is
    # kept (instead of passing the dict directly) so that unmapped values
    # still raise KeyError rather than silently becoming NaN.
    df[col] = df[col].map(lambda value: num2id[value])
    return df

def label_map(label_df, label_list):
    """Label-encode the listed columns of ``label_df``.

    For each column named in ``label_list`` the distinct values are numbered
    0, 1, 2, ... in order of first appearance and the column is rewritten
    with those numbers through ``col_map``.

    Args:
        label_df: pandas DataFrame to encode.
        label_list: iterable of column names to encode.

    Returns:
        The frame returned by the last ``col_map`` call, or ``label_df``
        itself when ``label_list`` is empty.
    """
    for column in label_list:
        distinct_values = pd.unique(label_df[column])
        codes = dict(zip(distinct_values, range(len(distinct_values))))
        label_df = col_map(label_df, column, codes)
    return label_df