"""Gradio front end for the Adult Income classifier.

Collects census-style attributes from the user, replays the pickled
preprocessing artifacts (label encoders, scaler, one-hot encoder, PCA) and
asks the pickled model whether the income is above 50K.
"""

import functools
import pickle

import gradio as gr
import joblib
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    RobustScaler,
    StandardScaler,
)

# Ordinal code for each education label offered in the UI.
EDUCATION_NUM_MAPPING = {
    "Preschool": 1,
    "1st-4th": 2,
    "5th-6th": 3,
    "7th-8th": 4,
    "9th": 5,
    "10th": 6,
    "11th": 7,
    "12th": 8,
    "HS-grad": 9,
    "Some-college": 10,
    "Assoc-voc": 11,
    "Assoc-acdm": 12,
    "Bachelors": 13,
    "Masters": 14,
    "Doctorate": 15,
    "Prof-school": 16,
}
GENDER_MAPPING = {"Male": 1, "Female": 0}
COUNTRY_MAPPING = {"United-States": 1, "Other": 0}

# Columns handed to the pickled scaler, in this order.
NUMERIC_COLS = ['age', 'educational-num', 'hours-per-week']


@functools.lru_cache(maxsize=None)
def _load_pickle(path):
    """Load a pickled artifact from *path*, reading each file only once.

    The result is memoised per path so that repeated predictions do not
    re-read and re-deserialise the same file.
    """
    # SECURITY: pickle.load can execute arbitrary code; these files must be
    # trusted artifacts shipped with the app, never user-supplied uploads.
    with open(path, 'rb') as artifact_file:
        return pickle.load(artifact_file)


def _map_required(series, mapping, column):
    """Map *series* through *mapping*, raising if any value is not a key.

    Raises:
        ValueError: when a value (including a missing selection) has no
            entry in *mapping*; otherwise it would silently become NaN.
    """
    mapped = series.map(mapping)
    if mapped.isna().any():
        unknown = sorted({str(v) for v in series[mapped.isna()]})
        raise ValueError(f"Unsupported value(s) for '{column}': {unknown}")
    return mapped


# Define the prediction function
def predict(age, workclass, education, marital_status, occupation,
            relationship, race, gender, capital_gain, capital_loss,
            hours_per_week, native_country):
    """Classify one person as earning more or at most 50K.

    The arguments are the raw widget values; they are placed into a
    one-row DataFrame, preprocessed by ``cleaning_features`` and passed to
    the model stored in ``ann_model.pkl``.

    Returns:
        ``"Income >50K"`` when the model's output equals 1, otherwise
        ``"Income <=50K"``.
    """
    row = {
        "age": [age],
        "workclass": [workclass],
        # The column is named educational-num but initially holds the
        # education label; cleaning_features maps it to its numeric code.
        "educational-num": [education],
        "marital-status": [marital_status],
        "occupation": [occupation],
        "relationship": [relationship],
        "race": [race],
        "gender": [gender],
        "capital-gain": [capital_gain],
        "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week],
        "native-country": [native_country],
    }
    df = pd.DataFrame(data=row)
    fixed_features = cleaning_features(df)

    ann_model = _load_pickle('ann_model.pkl')
    prediction = ann_model.predict(fixed_features)

    # predict() returns an array-like for the single row; reduce it to a
    # scalar explicitly rather than relying on array truthiness.
    # NOTE(review): if the model emits probabilities instead of class
    # labels, `== 1` will almost never hold - confirm the model type.
    label = np.asarray(prediction).ravel()[0]
    return "Income >50K" if label == 1 else "Income <=50K"


def cleaning_features(data):
    """Turn raw form values into the feature frame expected by the model.

    Applies the pickled label encoders to ``workclass`` and ``occupation``,
    maps ``gender``, ``native-country`` and ``educational-num`` through
    fixed dictionaries, scales ``NUMERIC_COLS`` with the pickled scaler and
    finally replaces ``workclass``/``occupation`` with PCA components via
    ``pca``.

    Args:
        data: DataFrame with the columns built by ``predict``. It is not
            modified; a copy is transformed.

    Returns:
        A new DataFrame with the transformed features.

    Raises:
        ValueError: if gender, native-country or education holds a value
            that is not a key of its mapping.
    """
    data = data.copy()

    le_work = _load_pickle('label_encoder_work.pkl')
    le_occ = _load_pickle('label_encoder_occ.pkl')
    scaler = _load_pickle('scaler.pkl')

    data['workclass'] = le_work.transform(data['workclass'])
    data['occupation'] = le_occ.transform(data['occupation'])
    data['gender'] = _map_required(data['gender'], GENDER_MAPPING, 'gender')
    data['native-country'] = _map_required(
        data['native-country'], COUNTRY_MAPPING, 'native-country')
    data['educational-num'] = _map_required(
        data['educational-num'], EDUCATION_NUM_MAPPING, 'educational-num')

    # NOTE(review): 'race', 'marital-status' and 'relationship' were listed
    # as columns to encode but no encoding was ever applied, so they reach
    # the model as raw strings - confirm against the training pipeline.
    data[NUMERIC_COLS] = scaler.transform(data[NUMERIC_COLS])
    return pca(data)


def pca(data):
    """Replace ``workclass`` and ``occupation`` with PCA components.

    The two columns are one-hot encoded with the encoder from
    ``onehot_encoder.pkl``, projected with the PCA model from ``pca.pkl``
    and the resulting ``pca_component_<i>`` columns are appended in place
    of the originals.

    NOTE(review): an earlier in-file version built these with
    ``OneHotEncoder(sparse_output=False)`` and ``PCA(n_components=10)``;
    the pickled artifacts are presumably the fitted equivalents - confirm.

    Args:
        data: DataFrame containing ``workclass`` and ``occupation``.

    Returns:
        A new DataFrame without those two columns and with one column per
        PCA component.
    """
    pca_model = _load_pickle('pca.pkl')
    encoder = _load_pickle('onehot_encoder.pkl')

    one_hot_encoded = encoder.transform(data[['workclass', 'occupation']])
    if hasattr(one_hot_encoded, 'toarray'):
        # An encoder configured for sparse output returns a sparse matrix;
        # densify it so it can be wrapped in a DataFrame.
        one_hot_encoded = one_hot_encoded.toarray()

    # Reuse data.index everywhere: concat(axis=1) aligns on the index, so a
    # fresh RangeIndex would misalign rows for a non-default input index.
    encoded_columns_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(),
        index=data.index,
    )
    pca_result_net = pca_model.transform(encoded_columns_df)
    pca_columns = [
        f'pca_component_{i+1}' for i in range(pca_model.n_components_)
    ]
    pca_df = pd.DataFrame(pca_result_net, columns=pca_columns,
                          index=data.index)

    remaining = data.drop(columns=['workclass', 'occupation'])
    return pd.concat([remaining, pca_df], axis=1)


def hbdscan_tranform(df_transformed):
    """Log-transform the capital columns and robust-scale numeric features.

    ``capital-gain`` and ``capital-loss`` are replaced by ``log1p`` of
    their values, then ``age``, ``capital-gain``, ``capital-loss`` and
    ``hours-per-week`` are scaled with a ``RobustScaler`` that is fitted on
    the given frame itself.

    Args:
        df_transformed: DataFrame holding the four numeric columns. It is
            modified in place.

    Returns:
        The same DataFrame object, after modification.
    """
    df_transformed['capital-gain'] = np.log1p(df_transformed['capital-gain'])
    df_transformed['capital-loss'] = np.log1p(df_transformed['capital-loss'])

    # Apply RobustScaler to all numerical features
    numerical_features = ['age', 'capital-gain', 'capital-loss',
                          'hours-per-week']
    scaler = RobustScaler()
    df_transformed[numerical_features] = scaler.fit_transform(
        df_transformed[numerical_features])
    return df_transformed


# Create the Gradio interface.
# The inputs are passed to `predict` positionally, so their order must
# follow its signature: ..., capital_gain, capital_loss, hours_per_week, ...
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Slider(18, 90, step=1, label="Age"),
        gr.Dropdown(
            ["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
             "Local-gov", "State-gov", "Without-pay", "Never-worked"],
            label="Workclass",
        ),
        gr.Dropdown(
            ["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
             "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
             "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"],
            label="Education",
        ),
        gr.Dropdown(
            ["Married-civ-spouse", "Divorced", "Never-married", "Separated",
             "Widowed", "Married-spouse-absent", "Married-AF-spouse"],
            label="Marital Status",
        ),
        gr.Dropdown(
            ["Tech-support", "Craft-repair", "Other-service", "Sales",
             "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
             "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
             "Transport-moving", "Priv-house-serv", "Protective-serv",
             "Armed-Forces"],
            label="Occupation",
        ),
        gr.Dropdown(
            ["Wife", "Husband", "Own-child", "Unmarried", "Other-relative",
             "Not-in-family"],
            label="Relationship",
        ),
        gr.Dropdown(
            ["White", "Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo",
             "Other"],
            label="Race",
        ),
        gr.Dropdown(
            ["Male", "Female"],
            label="Gender",
        ),
        gr.Slider(0, 100000, step=100, label="Capital Gain"),
        gr.Slider(0, 5000, step=50, label="Capital Loss"),
        gr.Slider(1, 60, step=1, label="Hours Per Week"),
        gr.Dropdown(
            ["United-States", "Other"],
            label="Native Country",
        ),
    ],
    outputs="text",
    title="Adult Income Predictor",
)

# Launch the app only when run as a script, so importing this module does
# not start a web server.
if __name__ == "__main__":
    interface.launch()