import gradio as gr
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
import pickle
from tensorflow.keras.models import load_model
import pickle
import hdbscan


# # Define the prediction function
def predict_ann(age, workclass, education, marital_status, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Predict the income bracket with the Keras model stored in ``ann_model.h5``.

    The arguments are the raw form values. ``marital_status`` and
    ``relationship`` are accepted so the signature matches the form, but they
    are not part of the feature frame passed to the model.

    Returns:
        ``"Income >50K"`` or ``"Income <=50K"``.
    """
    # The constant "0" column is kept exactly as in the original feature
    # layout. NOTE(review): presumably mirrors an index column present when
    # the preprocessing artifacts were fitted -- confirm.
    columns = {
        "0": [0],
        "age": [age], "workclass": [workclass], "educational-num": [education], "occupation": [occupation],
        "race": [race], "gender": [gender], "capital-gain": [capital_gain], "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week], "native-country": [native_country],
    }
    df = pd.DataFrame(data=columns)
    fixed_features = cleaning_features(df, race)
    print(fixed_features)

    ann_model = load_model('ann_model.h5')
    scores = np.asarray(ann_model.predict(fixed_features))

    # The network emits floating-point scores, so comparing against exactly 1
    # would almost never be true. Threshold instead.
    # NOTE(review): assumes either one sigmoid unit (probability of >50K) or
    # a multi-unit output where index 1 is the >50K class -- confirm.
    if scores.ndim > 1 and scores.shape[-1] > 1:
        is_high_income = int(np.argmax(scores, axis=-1).ravel()[0]) == 1
    else:
        is_high_income = float(scores.ravel()[0]) >= 0.5
    return "Income >50K" if is_high_income else "Income <=50K"

def predict_rf(age, workclass, education, marital_status, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Predict the income bracket with the pickled model in ``rf_model.pkl``.

    The arguments are the raw form values. ``marital_status`` and
    ``relationship`` are accepted so the signature matches the form, but they
    are not part of the feature frame passed to the model.

    Returns:
        ``"Income >50K"`` or ``"Income <=50K"``.
    """
    # The constant "0" column is kept exactly as in the original feature
    # layout. NOTE(review): presumably mirrors an index column present when
    # the preprocessing artifacts were fitted -- confirm.
    columns = {
        "0": [0],
        "age": [age], "workclass": [workclass], "educational-num": [education], "occupation": [occupation],
        "race": [race], "gender": [gender], "capital-gain": [capital_gain], "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week], "native-country": [native_country],
    }
    df = pd.DataFrame(data=columns)
    fixed_features = cleaning_features(df, race)
    print(fixed_features)

    # Use a context manager so the file handle is closed after loading.
    # NOTE: pickle executes arbitrary code on load; only load trusted files.
    with open('rf_model.pkl', 'rb') as model_file:
        rf_model = pickle.load(model_file)

    prediction = rf_model.predict(fixed_features)
    # One row goes in, so inspect the single predicted label explicitly
    # rather than relying on the truthiness of a one-element array.
    return "Income >50K" if prediction[0] == 1 else "Income <=50K"

def predict_hb(age, workclass, education, marital_status, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Assign the input to a cluster of the pickled model in ``hdbscan_model.pkl``.

    The arguments are the raw form values. ``marital_status`` and
    ``relationship`` are accepted so the signature matches the form, but they
    are not part of the feature frame passed to the model.

    Returns:
        A text line with the predicted cluster label and its membership
        strength as reported by ``hdbscan.approximate_predict``.
    """
    # The constant "0" column is kept exactly as in the original feature
    # layout. NOTE(review): presumably mirrors an index column present when
    # the preprocessing artifacts were fitted -- confirm.
    columns = {
        "0": [0],
        "age": [age], "workclass": [workclass], "educational-num": [education], "occupation": [occupation],
        "race": [race], "gender": [gender], "capital-gain": [capital_gain], "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week], "native-country": [native_country],
    }
    df = pd.DataFrame(data=columns)
    fixed_features = cleaning_features(df, race)
    print(fixed_features)

    # Use a context manager so the file handle is closed after loading.
    # NOTE: pickle executes arbitrary code on load; only load trusted files.
    with open('hdbscan_model.pkl', 'rb') as model_file:
        hb_model = pickle.load(model_file)

    # NOTE(review): the features are passed exactly as produced by
    # cleaning_features; confirm this matches the preprocessing used when the
    # clusterer was fitted (it must have been fitted with prediction_data=True
    # for approximate_predict to work).
    labels, strengths = hdbscan.approximate_predict(hb_model, fixed_features)

    # approximate_predict returns one label and one strength per input row;
    # report the single row's values instead of the raw tuple of arrays.
    return (
        f"Predicted Cluster (HDBSCAN): {int(labels[0])} "
        f"(membership strength {float(strengths[0]):.3f})"
    )


def cleaning_features(data, race):
    """Turn the raw single-row form frame into the model feature frame.

    Steps, in order: label-encode ``workclass`` and ``occupation`` with the
    pickled encoders, map ``gender`` / ``native-country`` / ``educational-num``
    to numbers, scale the numeric columns with the pickled scaler, replace
    ``race`` by one indicator column per known category, and finally hand the
    frame to ``pca`` which replaces ``workclass`` / ``occupation``.

    Args:
        data: DataFrame holding the raw form values. It is not modified; the
            function works on a copy.
        race: The selected race value, used to fill the indicator columns.

    Returns:
        The transformed DataFrame returned by ``pca``.
    """
    # NOTE: pickle executes arbitrary code on load; only load trusted files.
    with open('label_encoder_work.pkl', 'rb') as le_file:
        le_work = pickle.load(le_file)
    with open('label_encoder_occ.pkl', 'rb') as le_file:
        le_occ = pickle.load(le_file)
    with open('scaler.pkl', 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    education_num_mapping = {
        "Preschool": 1,
        "1st-4th": 2,
        "5th-6th": 3,
        "7th-8th": 4,
        "9th": 5,
        "10th": 6,
        "11th": 7,
        "12th": 8,
        "HS-grad": 9,
        "Some-college": 10,
        "Assoc-voc": 11,
        "Assoc-acdm": 12,
        "Bachelors": 13,
        "Masters": 14,
        "Doctorate": 15,
        "Prof-school": 16,
    }
    race_categories = ["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"]
    gender_mapping = {"Male": 1, "Female": 0}
    numeric_cols = ['age', 'educational-num', 'hours-per-week']

    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()

    data['workclass'] = le_work.transform(data['workclass'])
    data['occupation'] = le_occ.transform(data['occupation'])
    data['gender'] = data['gender'].map(gender_mapping)
    # Binary flag: 1 for "United-States", 0 for every other country. A plain
    # dict lookup on {"United-States", "Other"} turned the remaining form
    # choices (e.g. "Canada", "Mexico") into NaN.
    data['native-country'] = (data['native-country'] == "United-States").astype(int)
    data['educational-num'] = data['educational-num'].map(education_num_mapping)

    data[numeric_cols] = scaler.transform(data[numeric_cols])

    # Manual one-hot encoding of race, one column per known category, in the
    # order listed above.
    for category in race_categories:
        data[f'race_{category}'] = int(race == category)
    data = data.drop(columns=['race'])

    return pca(data)

# def pca(data):
#     encoder = OneHotEncoder(sparse_output=False)
#     one_hot_encoded = encoder.fit_transform(data[['workclass', 'occupation']])
#     encoded_columns_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
#     pca_net = PCA(n_components=10)
#     pca_result_net = pca_net.fit_transform(encoded_columns_df)
#     pca_columns = [f'pca_component_{i+1}' for i in range(10)]
#     pca_df = pd.DataFrame(pca_result_net, columns=pca_columns)
#     data = data.drop(columns=['workclass', 'occupation'], axis=1) #remove the original columns
#     data = pd.concat([data, pca_df], axis=1)
#     return data


def pca(data):
    """Replace ``workclass`` and ``occupation`` by their PCA components.

    The two columns are passed through the encoder loaded from
    ``onehot_encoder.pkl``; the encoded frame is then projected with the model
    loaded from ``pca.pkl``. The resulting ``pca_component_<i>`` columns are
    appended to ``data`` and the two source columns are dropped.

    Args:
        data: DataFrame containing at least ``workclass`` and ``occupation``.

    Returns:
        A new DataFrame with the same rows and index as ``data``.
    """
    encoder_pkl = 'onehot_encoder.pkl'
    pca_model_pkl = 'pca.pkl'
    categorical_cols = ['workclass', 'occupation']

    # NOTE: pickle executes arbitrary code on load; only load trusted files.
    with open(pca_model_pkl, 'rb') as file:
        pca_model = pickle.load(file)
    with open(encoder_pkl, 'rb') as file:
        encoder = pickle.load(file)

    one_hot_encoded = encoder.transform(data[categorical_cols])
    # An encoder configured for sparse output returns a sparse matrix;
    # densify it so it can be wrapped in a DataFrame.
    if hasattr(one_hot_encoded, "toarray"):
        one_hot_encoded = one_hot_encoded.toarray()

    # Reuse the caller's index on both intermediate frames: concat(axis=1)
    # aligns on index labels, so a fresh 0..n-1 index would misalign rows and
    # introduce NaN whenever `data` is not indexed 0..n-1.
    encoded_columns_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(),
        index=data.index,
    )
    pca_result_net = pca_model.transform(encoded_columns_df)
    pca_columns = [f'pca_component_{i+1}' for i in range(pca_model.n_components_)]
    pca_df = pd.DataFrame(pca_result_net, columns=pca_columns, index=data.index)

    remaining = data.drop(columns=categorical_cols)
    return pd.concat([remaining, pca_df], axis=1)

def hbdscan_tranform(df_transformed):
    """Log-transform the capital columns and robust-scale the numeric ones.

    ``capital-gain`` and ``capital-loss`` are replaced by ``log1p`` of their
    values; then ``age``, ``capital-gain``, ``capital-loss`` and
    ``hours-per-week`` are rescaled with a freshly fitted ``RobustScaler``.

    Args:
        df_transformed: DataFrame containing the four numeric columns. It is
            modified in place.

    Returns:
        The same DataFrame object, after modification.
    """
    # Imported here because the module-level sklearn import does not include
    # RobustScaler; without this the function raised NameError when called.
    from sklearn.preprocessing import RobustScaler

    df_transformed['capital-gain'] = np.log1p(df_transformed['capital-gain'])
    df_transformed['capital-loss'] = np.log1p(df_transformed['capital-loss'])

    # Apply RobustScaler to all numerical features.
    # NOTE(review): the scaler is fitted on whatever frame is passed in; for
    # inference on new rows a scaler fitted on the training data would be
    # needed to get comparable values -- confirm intended use.
    numerical_features = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
    scaler = RobustScaler()
    df_transformed[numerical_features] = scaler.fit_transform(df_transformed[numerical_features])
    return df_transformed

# Shared inputs
def _build_income_inputs():
    """Return a fresh list of form components for one model tab.

    Gradio passes component values to the prediction function positionally,
    so the order here must follow the predict_* signatures:
    age, workclass, education, marital_status, occupation, relationship,
    race, gender, capital_gain, capital_loss, hours_per_week, native_country.
    Each tab gets its own component instances, hence a builder rather than a
    single shared list.
    """
    return [
        gr.Slider(18, 90, step=1, label="Age"),
        gr.Dropdown(["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"], label="Workclass"),
        gr.Dropdown(["Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "11th", "12th", "HS-grad", "Some-college", "Assoc-voc", "Assoc-acdm", "Bachelors", "Masters", "Doctorate", "Prof-school"], label="Education"),
        gr.Dropdown(["Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"], label="Marital Status"),
        gr.Dropdown(["Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"], label="Occupation"),
        gr.Dropdown(["Wife", "Husband", "Own-child", "Not-in-family", "Other-relative", "Unmarried"], label="Relationship"),
        gr.Dropdown(["White", "Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other"], label="Race"),
        # Gender sits after Race to match the parameter order of predict_*.
        gr.Dropdown(["Male", "Female"], label="Gender"),
        gr.Slider(0, 100000, step=100, label="Capital Gain"),
        gr.Slider(0, 5000, step=50, label="Capital Loss"),
        gr.Slider(1, 60, step=1, label="Hours Per Week"),
        gr.Dropdown(["United-States", "Canada", "Mexico", "Other"], label="Native Country"),
    ]


ann_inputs = _build_income_inputs()
rf_inputs = _build_income_inputs()
hbd_inputs = _build_income_inputs()

# One Gradio interface per model; each wires a predict function to its own
# set of input components and returns plain text.
ann_interface = gr.Interface(
    title="Artificial Neural Network",
    description="Predict income using an Artificial Neural Network.",
    fn=predict_ann,
    inputs=ann_inputs,
    outputs="text",
)

rf_interface = gr.Interface(
    title="Random Forest",
    description="Predict income using a Random Forest model.",
    fn=predict_rf,
    inputs=rf_inputs,
    outputs="text",
)

hb_interface = gr.Interface(
    title="HDBScan Clustering",
    description="Predict income using a HDBScan Clustering model.",
    fn=predict_hb,
    inputs=hbd_inputs,
    outputs="text",
)

# Group the three interfaces into tabs, in the same order as their labels.
model_tabs = [ann_interface, rf_interface, hb_interface]
tab_labels = ["ANN Model", "Random Forest Model", "HDBScan Model"]
interface = gr.TabbedInterface(model_tabs, tab_labels)

# Start the web app as soon as the module is executed.
interface.launch()