# LA City employee pay predictor.
#
# Loads the city payroll CSV, trains a fastai tabular network plus two
# scikit-learn pipelines, combines them in a weighted VotingRegressor, and
# serves predictions through a Gradio form.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from fastai.tabular.all import *
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import gradio as gr

# ---------------------------------------------------------------------------
# Data loading and feature engineering
# ---------------------------------------------------------------------------

df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
# Treat infinities as missing values.
df = df.replace([np.inf, -np.inf], np.nan)

cat_names = ['EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'GENDER', 'ETHNICITY',
             'JOB_TITLE', 'DEPARTMENT_NO']
cont_names = ['PAY_YEAR', 'REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY']

# Derived features; the "+ 1" keeps the ratio finite when there is no
# overtime / other pay.
df['PAY_RATIO'] = df['REGULAR_PAY'] / (df['OVERTIME_PAY'] + df['ALL_OTHER_PAY'] + 1)
df['TOTAL_NON_REGULAR_PAY'] = df['OVERTIME_PAY'] + df['ALL_OTHER_PAY']
cont_names.extend(['PAY_RATIO', 'TOTAL_NON_REGULAR_PAY'])

X = df[cat_names + cont_names].copy()
y = df['TOTAL_PAY'].copy()

# Missing categoricals become the explicit token 'Unknown' in X only;
# `df` itself keeps its NaNs.
for col in cat_names:
    X[col] = X[col].fillna('Unknown')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# The scikit-learn ensemble is fitted on a 30% subsample of the training split.
X_train_sample, _, y_train_sample, _ = train_test_split(
    X_train, y_train, train_size=0.3, random_state=42)

# ---------------------------------------------------------------------------
# fastai tabular model (trained on `df` with its own random 80/20 split)
# ---------------------------------------------------------------------------

to = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names='TOTAL_PAY',
    splits=RandomSplitter(valid_pct=0.2)(range_of(df)),
)
dls = to.dataloaders(bs=64)
learn = tabular_learner(dls, layers=[200, 100, 50], metrics=rmse)
learn.fit_one_cycle(9)


# Mixins go to the left of BaseEstimator (scikit-learn developer guide);
# with the reversed order newer scikit-learn releases may not tag this class
# as a regressor, which VotingRegressor requires.
class FastAIWrapper(RegressorMixin, BaseEstimator):
    """Expose an already-trained fastai learner through the sklearn API."""

    def __init__(self, learn):
        self.learn = learn

    def fit(self, X, y):
        """No-op: the wrapped learner is trained before it is wrapped."""
        return self

    def predict(self, X):
        """Return the learner's predictions for DataFrame ``X`` as a 1-D array."""
        dl = self.learn.dls.test_dl(X)
        preds, _ = self.learn.get_preds(dl=dl)
        return preds.numpy().flatten()


# ---------------------------------------------------------------------------
# scikit-learn pipelines and the voting ensemble
# ---------------------------------------------------------------------------

def _make_one_hot_encoder():
    """Build a dense one-hot encoder that works across scikit-learn versions.

    The ``sparse`` keyword was renamed to ``sparse_output`` in scikit-learn
    1.2 and removed in 1.4, so the new name is tried first and the old one
    is used only when the installed version rejects it.
    """
    try:
        return OneHotEncoder(drop='first', sparse_output=False,
                             handle_unknown='ignore')
    except TypeError:
        return OneHotEncoder(drop='first', sparse=False,
                             handle_unknown='ignore')


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cont_names),
        ('cat', _make_one_hot_encoder(), cat_names),
    ])

model1 = FastAIWrapper(learn)
model2 = Pipeline([('preprocessor', preprocessor),
                   ('regressor', LinearRegression())])
model3 = Pipeline([('preprocessor', preprocessor),
                   ('regressor', DecisionTreeRegressor())])

# The fastai model gets twice the weight of each scikit-learn model.
ensemble = VotingRegressor(
    estimators=[('fastai', model1), ('lr', model2), ('dt', model3)],
    weights=[2, 1, 1],
)
ensemble.fit(X_train_sample, y_train_sample)


# ---------------------------------------------------------------------------
# Prediction helpers
# ---------------------------------------------------------------------------

# Features the user does not enter; they are inferred from comparable rows.
_INFERRED_CATEGORICALS = ['EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
_INFERRED_PAY_COLUMNS = ['REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY']


def _reference_rows(gender, job_title, ethnicity):
    """Return the most specific non-empty slice of ``df`` for the inputs.

    Tries an exact gender/title/ethnicity match, then the job title alone,
    and finally falls back to the whole dataset.
    """
    exact = df[(df['GENDER'] == gender)
               & (df['JOB_TITLE'] == job_title)
               & (df['ETHNICITY'] == ethnicity)]
    if len(exact) > 0:
        return exact
    same_title = df[df['JOB_TITLE'] == job_title]
    if len(same_title) > 0:
        return same_title
    return df


def _most_common(rows, column):
    """Return the modal value of ``column`` in ``rows``.

    ``Series.mode()`` is empty when every value is NaN; in that case the
    'Unknown' token (the one used for missing categoricals in ``X``) is
    returned instead of raising IndexError.
    """
    modes = rows[column].mode()
    if len(modes) == 0:
        return 'Unknown'
    return modes.iloc[0]


def _average(rows, column):
    """Return the mean of ``column`` in ``rows``.

    Falls back to the dataset-wide mean when the slice has no usable values,
    so a NaN is not passed on to the models.
    """
    value = rows[column].mean()
    if pd.isna(value):
        value = df[column].mean()
    return value


def predict_total_pay(gender, job_title, ethnicity):
    """Predict total pay for the given gender, job title and ethnicity.

    The remaining model features are inferred from the most specific group
    of matching payroll rows (see ``_reference_rows``): categorical features
    take the group's mode, pay components take the group's mean, and
    PAY_YEAR is the latest year in the dataset.

    Returns:
        The ensemble's prediction for the single constructed sample.
    """
    rows = _reference_rows(gender, job_title, ethnicity)

    sample = pd.DataFrame({
        'GENDER': [gender],
        'JOB_TITLE': [job_title],
        'ETHNICITY': [ethnicity],
    })
    for col in _INFERRED_CATEGORICALS:
        sample[col] = [_most_common(rows, col)]
    for col in _INFERRED_PAY_COLUMNS:
        sample[col] = [_average(rows, col)]

    sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]
    # Same derived features as computed for the training data.
    sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (
        sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
    sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']

    # Categorical features are handed to the models as object dtype.
    for col in cat_names:
        sample[col] = sample[col].astype('object')

    return ensemble.predict(sample)[0]


def gradio_predict(gender, ethnicity, job_title):
    """Gradio callback: format the predicted pay as a dollar amount.

    Note that the argument order follows the form's input order, which
    differs from ``predict_total_pay``'s.
    """
    predicted_pay = predict_total_pay(gender, job_title, ethnicity)
    return f"${predicted_pay:.2f}"


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

genders = df['GENDER'].dropna().unique().tolist()
ethnicities = df['ETHNICITY'].dropna().unique().tolist()
job_titles = sorted(df['JOB_TITLE'].dropna().unique().tolist())

iface = gr.Interface(
    fn=gradio_predict,
    inputs=[
        gr.Dropdown(choices=genders, label="Gender"),
        gr.Dropdown(choices=ethnicities, label="Ethnicity"),
        gr.Dropdown(choices=job_titles, label="Job Title"),
    ],
    outputs=gr.Textbox(label="Predicted Total Pay"),
    title="LA City Employee Pay Predictor",
    description="Predict the total pay for LA City employees based on gender, ethnicity, and job title.",
)

iface.launch()