File size: 10,510 Bytes
8876cd2
 
3c78fe7
 
 
 
8af6ce4
4727e02
ce3be16
445bf3a
258d659
8876cd2
b18aa7e
8876cd2
a506979
445bf3a
0c9d457
 
 
 
f9d5a22
a4c0920
0c9d457
00423c4
e725540
3d9a50d
dc31fa3
ce3be16
 
7702d79
ce3be16
b18aa7e
1bb20ca
b18aa7e
8876cd2
d58f189
445bf3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d58f189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258d659
d58f189
258d659
d58f189
 
3d9a50d
932646c
 
5cdc823
99bfd14
 
 
 
5cdc823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
932646c
85775ba
4f71456
5cdc823
3c78fe7
b651e33
 
3c78fe7
99bfd14
 
85775ba
4f71456
85775ba
3c78fe7
d93a02f
4f71456
932646c
c49dbd7
e0f2797
932646c
e0f2797
932646c
 
 
 
 
 
e0f2797
b651e33
b18aa7e
8af6ce4
3c78fe7
79b1800
 
 
 
 
 
 
 
 
 
 
 
 
8af6ce4
221bc87
 
 
 
c71a3a0
221bc87
c71a3a0
221bc87
79b1800
8af6ce4
79b1800
 
8af6ce4
79b1800
 
8af6ce4
3c78fe7
b832f73
 
 
 
 
 
 
 
 
 
445bf3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8876cd2
445bf3a
 
 
 
d58f189
 
 
 
 
 
 
 
32a9314
445bf3a
258d659
 
8876cd2
 
32a9314
8876cd2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import gradio as gr
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
import pickle
from tensorflow.keras.models import load_model
import pickle
import hdbscan



# Prediction functions, one per model
def predict_ann(age, workclass, education, marital_status, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Predict the income bracket for one form submission with the Keras model.

    The values are assembled into a one-row DataFrame, passed through
    ``cleaning_features`` and scored by the model loaded from ``ann_model.h5``.

    Args:
        age: Age value from the form.
        workclass: Workclass category label.
        education: Education level label (converted to a number while cleaning).
        marital_status: Accepted so the signature matches the form; it is not
            placed in the feature frame.
        occupation: Occupation category label.
        relationship: Accepted so the signature matches the form; it is not
            placed in the feature frame.
        race: Race category label.
        gender: Gender label.
        capital_gain: Capital gain amount.
        capital_loss: Capital loss amount.
        hours_per_week: Hours worked per week.
        native_country: Native country label.

    Returns:
        ``"Income >50K"`` or ``"Income <=50K"``.
    """
    columns = {
        # Constant placeholder column; presumably mirrors an index column that
        # was present in the training data -- TODO confirm.
        "0": [0],
        "age": [age],
        "workclass": [workclass],
        "educational-num": [education],
        "occupation": [occupation],
        "race": [race],
        "gender": [gender],
        "capital-gain": [capital_gain],
        "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week],
        "native-country": [native_country],
    }
    df = pd.DataFrame(data=columns)
    fixed_features = cleaning_features(df, race)
    print(fixed_features)

    ann_model = load_model('ann_model.h5')
    scores = np.asarray(ann_model.predict(fixed_features))

    # The network returns floating-point scores, not hard 0/1 labels, so an
    # exact ``== 1`` comparison would almost never report the positive class.
    if scores.ndim > 1 and scores.shape[-1] > 1:
        # One score per class: pick the most likely class.
        is_high_income = int(np.argmax(scores, axis=-1).ravel()[0]) == 1
    else:
        # Single score: assumes it is the probability of the >50K class
        # (sigmoid output) -- TODO confirm against the training notebook.
        is_high_income = float(scores.ravel()[0]) >= 0.5
    return "Income >50K" if is_high_income else "Income <=50K"

def predict_rf(age, workclass, education, marital_status, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Predict the income bracket for one form submission with the pickled
    random-forest model.

    The values are assembled into a one-row DataFrame, passed through
    ``cleaning_features`` and scored by the model stored in ``rf_model.pkl``.

    Args:
        age: Age value from the form.
        workclass: Workclass category label.
        education: Education level label (converted to a number while cleaning).
        marital_status: Accepted so the signature matches the form; it is not
            placed in the feature frame.
        occupation: Occupation category label.
        relationship: Accepted so the signature matches the form; it is not
            placed in the feature frame.
        race: Race category label.
        gender: Gender label.
        capital_gain: Capital gain amount.
        capital_loss: Capital loss amount.
        hours_per_week: Hours worked per week.
        native_country: Native country label.

    Returns:
        ``"Income >50K"`` when the model predicts label ``1``, otherwise
        ``"Income <=50K"``.
    """
    columns = {
        # Constant placeholder column; presumably mirrors an index column that
        # was present in the training data -- TODO confirm.
        "0": [0],
        "age": [age],
        "workclass": [workclass],
        "educational-num": [education],
        "occupation": [occupation],
        "race": [race],
        "gender": [gender],
        "capital-gain": [capital_gain],
        "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week],
        "native-country": [native_country],
    }
    df = pd.DataFrame(data=columns)
    fixed_features = cleaning_features(df, race)
    print(fixed_features)

    # Use a context manager so the model file is closed after loading.
    with open('rf_model.pkl', 'rb') as model_file:
        rf_model = pickle.load(model_file)

    prediction = rf_model.predict(fixed_features)
    # Exactly one row was submitted, so only the first label matters.
    return "Income >50K" if prediction[0] == 1 else "Income <=50K"

def predict_hb(age, workclass, education, marital_status, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Assign one form submission to a cluster of the pickled HDBSCAN model.

    The values are assembled into a one-row DataFrame, passed through
    ``cleaning_features`` and handed to ``hdbscan.approximate_predict`` with
    the clusterer stored in ``hdbscan_model.pkl``.

    Args:
        age: Age value from the form.
        workclass: Workclass category label.
        education: Education level label (converted to a number while cleaning).
        marital_status: Accepted so the signature matches the form; it is not
            placed in the feature frame.
        occupation: Occupation category label.
        relationship: Accepted so the signature matches the form; it is not
            placed in the feature frame.
        race: Race category label.
        gender: Gender label.
        capital_gain: Capital gain amount.
        capital_loss: Capital loss amount.
        hours_per_week: Hours worked per week.
        native_country: Native country label.

    Returns:
        A string of the form ``"Predicted Cluster (HDBSCAN): <label>"``.
        Per the hdbscan documentation a label of ``-1`` denotes noise.
    """
    columns = {
        # Constant placeholder column; presumably mirrors an index column that
        # was present in the training data -- TODO confirm.
        "0": [0],
        "age": [age],
        "workclass": [workclass],
        "educational-num": [education],
        "occupation": [occupation],
        "race": [race],
        "gender": [gender],
        "capital-gain": [capital_gain],
        "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week],
        "native-country": [native_country],
    }
    df = pd.DataFrame(data=columns)
    fixed_features = cleaning_features(df, race)
    print(fixed_features)

    # Use a context manager so the model file is closed after loading.
    with open('hdbscan_model.pkl', 'rb') as model_file:
        hb_model = pickle.load(model_file)

    # NOTE(review): `hbdscan_tranform` in this file is never applied here;
    # confirm the clusterer was fitted on features prepared the same way as
    # `cleaning_features` prepares them.
    # approximate_predict returns (labels, membership strengths); only the
    # label of the single submitted row is reported.
    labels, _strengths = hdbscan.approximate_predict(hb_model, fixed_features)
    return f"Predicted Cluster (HDBSCAN): {int(labels[0])}"


def cleaning_features(data,race):
    """Turn a raw one-row feature frame into the numeric frame the models score.

    Steps, in order: label-encode ``workclass`` and ``occupation`` with the
    pickled encoders, map ``gender``, ``native-country`` and
    ``educational-num`` to numbers, scale ``age`` / ``educational-num`` /
    ``hours-per-week`` with the pickled scaler, add one indicator column per
    race category, drop the raw ``race`` column, and finally replace
    ``workclass`` / ``occupation`` with PCA components via ``pca``.

    Args:
        data: DataFrame holding the raw columns built by the ``predict_*``
            functions. It is copied, so the caller's frame is left untouched.
        race: The selected race label, used to fill the ``race_*`` indicator
            columns.

    Returns:
        A new DataFrame with the transformed features.
    """
    with open('label_encoder_work.pkl', 'rb') as le_file:
        le_work = pickle.load(le_file)
    with open('label_encoder_occ.pkl', 'rb') as le_file:
        le_occ = pickle.load(le_file)
    with open('scaler.pkl', 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Codes follow the Adult dataset's education-num column, in which
    # Prof-school is 15 and Doctorate is 16.
    education_num_mapping = {
        "Preschool": 1,
        "1st-4th": 2,
        "5th-6th": 3,
        "7th-8th": 4,
        "9th": 5,
        "10th": 6,
        "11th": 7,
        "12th": 8,
        "HS-grad": 9,
        "Some-college": 10,
        "Assoc-voc": 11,
        "Assoc-acdm": 12,
        "Bachelors": 13,
        "Masters": 14,
        "Prof-school": 15,
        "Doctorate": 16,
    }
    race_categories = ["Amer-Indian-Eskimo", "Asian-Pac-Islander","Black", "Other","White"]
    gender_mapping = {"Male":1,"Female":0}
    country_mapping = {"United-States":1,"Other":0}
    numeric_cols = ['age', 'educational-num', 'hours-per-week']

    # Work on a copy so the caller's DataFrame is not modified.
    data = data.copy()

    data['workclass'] = le_work.transform(data['workclass'])
    data['occupation'] = le_occ.transform(data['occupation'])
    data['gender'] = data['gender'].map(gender_mapping)
    # Any country without an explicit entry (e.g. "Canada", "Mexico") is
    # treated as "Other" instead of becoming NaN.
    data['native-country'] = (
        data['native-country']
        .map(country_mapping)
        .fillna(country_mapping["Other"])
        .astype(int)
    )
    data['educational-num'] = data['educational-num'].map(education_num_mapping)

    data[numeric_cols] = scaler.transform(data[numeric_cols])

    # Manual one-hot encoding of race; the column order follows
    # race_categories.
    for category in race_categories:
        data[f'race_{category}'] = int(race == category)
    data = data.drop(columns=['race'])

    return pca(data)

# def pca(data):
#     encoder = OneHotEncoder(sparse_output=False)
#     one_hot_encoded = encoder.fit_transform(data[['workclass', 'occupation']])
#     encoded_columns_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
#     pca_net = PCA(n_components=10)
#     pca_result_net = pca_net.fit_transform(encoded_columns_df)
#     pca_columns = [f'pca_component_{i+1}' for i in range(10)]
#     pca_df = pd.DataFrame(pca_result_net, columns=pca_columns)
#     data = data.drop(columns=['workclass', 'occupation'], axis=1) #remove the original columns
#     data = pd.concat([data, pca_df], axis=1)
#     return data


def pca(data):
    """Replace ``workclass`` and ``occupation`` with PCA components.

    The two columns are one-hot encoded with the encoder pickled in
    ``onehot_encoder.pkl``, projected with the PCA model pickled in
    ``pca.pkl``, and the resulting ``pca_component_<i>`` columns are appended
    after the remaining columns of ``data``.

    Args:
        data: DataFrame containing at least ``workclass`` and ``occupation``.

    Returns:
        A new DataFrame without ``workclass`` / ``occupation`` and with one
        ``pca_component_<i>`` column per PCA component, on the same index as
        ``data``.
    """
    encoder_pkl = 'onehot_encoder.pkl'
    pca_model_pkl = 'pca.pkl'

    with open(pca_model_pkl, 'rb') as file:
        pca_model = pickle.load(file)
    with open(encoder_pkl, 'rb') as file:
        encoder = pickle.load(file)

    one_hot_encoded = encoder.transform(data[['workclass', 'occupation']])
    # Densify in case the pickled encoder yields a sparse matrix.
    if hasattr(one_hot_encoded, "toarray"):
        one_hot_encoded = one_hot_encoded.toarray()

    # Reuse the input index throughout so the final concat aligns row-by-row
    # even when `data` does not carry a default 0..n-1 index.
    encoded_columns_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(),
        index=data.index,
    )
    pca_result_net = pca_model.transform(encoded_columns_df)
    pca_columns = [f'pca_component_{i+1}' for i in range(pca_model.n_components_)]
    pca_df = pd.DataFrame(pca_result_net, columns=pca_columns, index=data.index)

    remaining = data.drop(columns=['workclass', 'occupation'])
    return pd.concat([remaining, pca_df], axis=1)

def hbdscan_tranform(df_transformed):
    """Log-transform the capital columns and robust-scale the numeric columns.

    ``capital-gain`` and ``capital-loss`` are replaced by ``log1p`` of their
    values, then ``age``, ``capital-gain``, ``capital-loss`` and
    ``hours-per-week`` are rescaled with a freshly fitted ``RobustScaler``.

    Args:
        df_transformed: DataFrame containing the four numeric columns above.
            It is modified in place.

    Returns:
        The same DataFrame object, after modification.
    """
    # RobustScaler is not among the module-level imports, so import it here;
    # without this the function raises NameError.
    from sklearn.preprocessing import RobustScaler

    df_transformed['capital-gain'] = np.log1p(df_transformed['capital-gain'])
    df_transformed['capital-loss'] = np.log1p(df_transformed['capital-loss'])

    # Apply RobustScaler to all numerical features.
    # NOTE(review): the scaler is fitted on the frame passed in, not on
    # training data -- confirm this is intended before using the function at
    # prediction time.
    numerical_features = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
    scaler = RobustScaler()
    df_transformed[numerical_features] = scaler.fit_transform(df_transformed[numerical_features])
    return df_transformed

# Shared inputs.
# Gradio passes component values to the callback positionally, so the order
# below must match the parameter order of predict_ann / predict_rf /
# predict_hb: age, workclass, education, marital_status, occupation,
# relationship, race, gender, capital_gain, capital_loss, hours_per_week,
# native_country.
inputs = [
    gr.Slider(18, 90, step=1, label="Age"),
    gr.Dropdown(["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"], label="Workclass"),
    gr.Dropdown(["Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "11th", "12th", "HS-grad", "Some-college", "Assoc-voc", "Assoc-acdm", "Bachelors", "Masters", "Doctorate", "Prof-school"], label="Education"),
    gr.Dropdown(["Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"], label="Marital Status"),
    gr.Dropdown(["Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"], label="Occupation"),
    gr.Dropdown(["Wife", "Husband", "Own-child", "Not-in-family", "Other-relative", "Unmarried"], label="Relationship"),
    gr.Dropdown(["White", "Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other"], label="Race"),
    # Gender comes after Race because `gender` follows `race` in the
    # prediction functions' signatures.
    gr.Dropdown(["Male", "Female"], label="Gender"),
    gr.Slider(0, 100000, step=100, label="Capital Gain"),
    gr.Slider(0, 5000, step=50, label="Capital Loss"),
    gr.Slider(1, 60, step=1, label="Hours Per Week"),
    gr.Dropdown(["United-States", "Canada", "Mexico", "Other"], label="Native Country"),
]

# One Gradio interface per model. Each one reuses the shared `inputs`
# components and renders the callback's return value as plain text.
ann_interface = gr.Interface(
    predict_ann,
    inputs,
    "text",
    title="Artificial Neural Network",
    description="Predict income using an Artificial Neural Network.",
)

rf_interface = gr.Interface(
    predict_rf,
    inputs,
    "text",
    title="Random Forest",
    description="Predict income using a Random Forest model.",
)

hb_interface = gr.Interface(
    predict_hb,
    inputs,
    "text",
    title="HDBScan Clustering",
    description="Predict income using a HDBScan Clustering model.",
)

# Combine the three interfaces into tabs; tab names are listed in the same
# order as the interfaces they label.
interface = gr.TabbedInterface(
    interface_list=[ann_interface, rf_interface, hb_interface],
    tab_names=["ANN Model", "Random Forest Model", "HDBScan Model"],
)

# Start the app when this module is executed.
interface.launch()