File size: 6,033 Bytes
8876cd2
 
3c78fe7
 
 
 
8af6ce4
8876cd2
 
1bb20ca
8876cd2
a506979
62e17b3
15350d5
 
781b976
e978718
 
eba369c
b832f73
1bb20ca
 
 
4a6e928
8876cd2
3c78fe7
 
 
94096e8
3c78fe7
 
 
15350d5
3c78fe7
 
 
 
 
8bf2b88
3c78fe7
 
 
 
 
 
 
 
 
 
 
8af6ce4
 
3c78fe7
79b1800
 
 
 
 
 
 
 
 
 
 
 
 
8af6ce4
79b1800
 
 
8af6ce4
79b1800
 
8af6ce4
79b1800
 
8af6ce4
3c78fe7
b832f73
 
 
 
 
 
 
 
 
 
3c78fe7
8876cd2
 
 
 
 
197ffb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8876cd2
 
197ffb2
 
 
 
8876cd2
 
197ffb2
8876cd2
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA

# Load your saved model
# model = joblib.load("ann_model.joblib")

# Define the prediction function
# Ordinal education level stored in the "educational-num" column.
# NOTE(review): this is the usual UCI Adult `education` -> `education-num`
# coding; confirm it matches the data the model was trained on.
_EDUCATION_TO_NUM = {
    "Preschool": 1,
    "1st-4th": 2,
    "5th-6th": 3,
    "7th-8th": 4,
    "9th": 5,
    "10th": 6,
    "11th": 7,
    "12th": 8,
    "HS-grad": 9,
    "Some-college": 10,
    "Assoc-voc": 11,
    "Assoc-acdm": 12,
    "Bachelors": 13,
    "Masters": 14,
    "Prof-school": 15,
    "Doctorate": 16,
}


def predict(age, workclass, education, marital_status, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Preprocess one set of form inputs and return the result as text.

    The arguments are received positionally, in this order, from the UI
    components. They are assembled into a single-row DataFrame and passed
    through ``cleaning_features``.

    Args:
        education: Education label (e.g. ``"Bachelors"``). Known labels are
            translated to their ordinal number because the value is stored
            in the numeric ``"educational-num"`` column; any other value
            (such as an already-numeric level) is passed through unchanged.
        (all other arguments are stored unchanged in the matching column)

    Returns:
        str: Text rendering of the preprocessed feature row. Model inference
        is not wired up yet, so no income class is returned.
    """
    columns = [
        "age", "workclass", "educational-num", "marital-status", "occupation",
        "relationship", "race", "gender", "capital-gain", "capital-loss",
        "hours-per-week", "native-country"]
    features = [
        age, workclass, _EDUCATION_TO_NUM.get(education, education),
        marital_status, occupation, relationship, race, gender,
        capital_gain, capital_loss, hours_per_week, native_country]

    # One row whose cells are the inputs. Passing the values as `index=`
    # would instead create a 12-row frame filled entirely with NaN.
    df = pd.DataFrame([features], columns=columns)
    fixed_features = cleaning_features(df)

    # TODO: once the trained model is loaded, replace the line below with
    #   prediction = model.predict(fixed_features)
    #   return "Income >50K" if prediction == 1 else "Income <=50K"
    # Return (rather than print) so the text output component shows a value.
    return fixed_features.to_string(index=False)

def cleaning_features(data):
    """Turn a raw feature frame into the numeric frame returned by ``pca``.

    Steps, in order: standard-scale the numeric columns, label-encode
    ``gender`` and ``educational-num``, one-hot encode ``race``,
    ``marital-status`` and ``relationship``, binarize ``native-country``
    (1 for ``'United-States'``, 0 otherwise), then hand the frame to ``pca``
    to replace ``workclass``/``occupation`` with PCA components.

    Args:
        data: DataFrame containing the columns named above plus
            ``workclass`` and ``occupation``. It is not modified.

    Returns:
        A new DataFrame with the transformed columns.

    NOTE(review): every scaler/encoder here is *fitted on ``data`` itself*.
    When called with a single row, as ``predict`` does, scaling yields 0 and
    each one-hot encoder emits only the one category present, so the output
    layout will presumably not match what a trained model expects. Loading
    transformers fitted at training time (as ``pca`` does) would fix that,
    but those artifacts are not visible here.
    """
    numeric_cols = ['age', 'educational-num', 'hours-per-week']
    columns_to_encode = ['race', 'marital-status', 'relationship']

    # Work on a copy so the caller's frame is not left half-transformed.
    data = data.copy()

    # 1. Scale numerical features
    scaler = StandardScaler()
    data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

    # 2. Label encode gender and educational-num
    le = LabelEncoder()
    data['gender'] = le.fit_transform(data['gender'])
    data['educational-num'] = le.fit_transform(data['educational-num'])

    # 3. One-hot encode race, marital-status and relationship.
    # `sparse_output` is the current name of the old `sparse` keyword.
    encoder = OneHotEncoder(sparse_output=False)
    for column in columns_to_encode:
        encoded = encoder.fit_transform(data[[column]])
        encoded_cols = encoder.get_feature_names_out([column])
        # Keep the original index so the concat below aligns row-for-row.
        encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=data.index)
        # Swap the raw column for its indicator columns
        data = pd.concat([data.drop(column, axis=1), encoded_df], axis=1)

    # 4. Binarize native country
    data['native-country'] = (data['native-country'] == 'United-States').astype(int)

    # 5. Replace workclass/occupation with PCA components
    data = pca(data)
    return data

# def pca(data):
#     encoder = OneHotEncoder(sparse_output=False)
#     one_hot_encoded = encoder.fit_transform(data[['workclass', 'occupation']])
#     encoded_columns_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
#     pca_net = PCA(n_components=10)
#     pca_result_net = pca_net.fit_transform(encoded_columns_df)
#     pca_columns = [f'pca_component_{i+1}' for i in range(10)]
#     pca_df = pd.DataFrame(pca_result_net, columns=pca_columns)
#     data = data.drop(columns=['workclass', 'occupation'], axis=1) #remove the original columns
#     data = pd.concat([data, pca_df], axis=1)
#     return data


def pca(data):
    """Replace ``workclass`` and ``occupation`` with PCA components.

    Loads a previously saved one-hot encoder (``onehot_encoder.joblib``) and
    PCA model (``pca.joblib``) from the working directory, one-hot encodes
    the two columns, projects the result with the PCA model, and returns the
    frame with the two original columns dropped and
    ``pca_component_1 .. pca_component_k`` appended
    (``k = pca_model.n_components_``).

    Args:
        data: DataFrame containing ``workclass`` and ``occupation`` columns.

    Returns:
        A new DataFrame; rows keep the index of ``data``.
    """
    categorical_cols = ['workclass', 'occupation']

    # Both artifacts are re-read from disk on every call.
    encoder = joblib.load('onehot_encoder.joblib')
    pca_model = joblib.load('pca.joblib')

    # assumes the saved encoder produces a dense array — TODO confirm
    one_hot_encoded = encoder.transform(data[categorical_cols])

    # Carry `data.index` through both intermediate frames: pd.concat(axis=1)
    # aligns on index labels, so a default 0..n-1 index here would misalign
    # rows (and add NaN rows) whenever `data` is indexed differently.
    encoded_columns_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(),
        index=data.index,
    )
    pca_result_net = pca_model.transform(encoded_columns_df)
    pca_columns = [f'pca_component_{i+1}' for i in range(pca_model.n_components_)]
    pca_df = pd.DataFrame(pca_result_net, columns=pca_columns, index=data.index)

    data = data.drop(columns=categorical_cols)
    data = pd.concat([data, pca_df], axis=1)
    return data

def hbdscan_tranform(df_transformed):
    """Log-transform the capital columns, then robust-scale the numeric ones.

    ``capital-gain`` and ``capital-loss`` are replaced by ``log1p`` of their
    values; afterwards ``age``, ``capital-gain``, ``capital-loss`` and
    ``hours-per-week`` are scaled with a ``RobustScaler`` fitted on the
    frame itself.

    Args:
        df_transformed: DataFrame with the four numeric columns named above.
            It is modified in place.

    Returns:
        The same DataFrame object, for convenience.
    """
    # RobustScaler is not among the file's top-level imports, so bring it
    # into scope here; without this the function fails with NameError.
    from sklearn.preprocessing import RobustScaler

    for column in ('capital-gain', 'capital-loss'):
        df_transformed[column] = np.log1p(df_transformed[column])

    # Apply RobustScaler to all numerical features
    numerical_features = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
    scaler = RobustScaler()
    df_transformed[numerical_features] = scaler.fit_transform(df_transformed[numerical_features])
    return df_transformed


# Create the Gradio interface.
# The components are passed to `predict` positionally, so their order must
# match its parameter order: age, workclass, education, marital_status,
# occupation, relationship, race, gender, capital_gain, capital_loss,
# hours_per_week, native_country.
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Slider(18, 90, step=1, label="Age"),
        gr.Dropdown(
            ["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", 
             "Local-gov", "State-gov", "Without-pay", "Never-worked"], 
            label="Workclass"
        ),
        gr.Dropdown(
            ["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", 
             "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", 
             "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"], 
            label="Education"
        ),
        gr.Dropdown(
            ["Married-civ-spouse", "Divorced", "Never-married", "Separated", 
             "Widowed", "Married-spouse-absent", "Married-AF-spouse"], 
            label="Marital Status"
        ),
        gr.Dropdown(
            ["Tech-support", "Craft-repair", "Other-service", "Sales", 
             "Exec-managerial", "Prof-specialty", "Handlers-cleaners", 
             "Machine-op-inspct", "Adm-clerical", "Farming-fishing", 
             "Transport-moving", "Priv-house-serv", "Protective-serv", 
             "Armed-Forces"], 
            label="Occupation"
        ),
        gr.Dropdown(
            ["Wife", "Husband", "Own-child", "Unmarried", "Other-relative", "Not-in-family"], 
            label="Relationship"
        ),
        gr.Dropdown(
            ["White", "Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other"], 
            label="Race"
        ),
        gr.Dropdown(
            ["Male", "Female"], 
            label="Gender"
        ),
        # Capital gain/loss come before hours-per-week to line up with
        # predict(..., capital_gain, capital_loss, hours_per_week, ...).
        gr.Slider(0, 100000, step=100, label="Capital Gain"),
        gr.Slider(0, 5000, step=50, label="Capital Loss"),
        gr.Slider(1, 90, step=1, label="Hours Per Week"),
        gr.Dropdown(
            ["United-States", "Other"], 
            label="Native Country"
        )
    ],
    outputs="text",
    title="Adult Income Predictor"
)

# Launch the app only when this file is run as a script, so that importing
# the module (for example from a test) does not start a web server.
if __name__ == "__main__":
    interface.launch()