huntrezz committed on
Commit
137e5e0
1 Parent(s): b5954ff

Upload 2 files

Browse files
Files changed (2) hide show
  1. LosAngelesPayPredictor.py +127 -0
  2. requirements.txt +5 -0
LosAngelesPayPredictor.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
4
+ from sklearn.model_selection import train_test_split
5
+ from fastai.tabular.all import *
6
+ from sklearn.ensemble import VotingRegressor
7
+ from sklearn.linear_model import LinearRegression
8
+ from sklearn.tree import DecisionTreeRegressor
9
+ from sklearn.base import BaseEstimator, RegressorMixin
10
+ from sklearn.compose import ColumnTransformer
11
+ from sklearn.pipeline import Pipeline
12
+ import gradio as gr
13
+
14
# ---------------------------------------------------------------------------
# Data loading, feature engineering, train/test splits and fastai training.
# ---------------------------------------------------------------------------

# low_memory=False: have pandas parse the file in a single pass when
# inferring column dtypes.
df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
# Treat infinities present in the raw file as missing values.
df = df.replace([np.inf, -np.inf], np.nan)

cat_names = ['EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'GENDER', 'ETHNICITY', 'JOB_TITLE', 'DEPARTMENT_NO']
cont_names = ['PAY_YEAR', 'REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY']

# Derived features. The "+ 1" offsets the denominator (presumably to avoid a
# division by zero for rows without any non-regular pay).
df['PAY_RATIO'] = df['REGULAR_PAY'] / (df['OVERTIME_PAY'] + df['ALL_OTHER_PAY'] + 1)
df['TOTAL_NON_REGULAR_PAY'] = df['OVERTIME_PAY'] + df['ALL_OTHER_PAY']
derived_names = ['PAY_RATIO', 'TOTAL_NON_REGULAR_PAY']
# The division above can itself produce +/-inf (the denominator is exactly
# zero when OVERTIME_PAY + ALL_OTHER_PAY == -1), so the derived columns must
# be sanitised after they are computed; the earlier pass cannot catch them.
df[derived_names] = df[derived_names].replace([np.inf, -np.inf], np.nan)
cont_names.extend(derived_names)

# Feature matrix / target for the scikit-learn models.
X = df[cat_names + cont_names].copy()
y = df['TOTAL_PAY'].copy()
for col in cat_names:
    # Missing categories become an explicit 'Unknown' level.
    X[col] = X[col].fillna('Unknown')

# 80/20 train/test split, then a 30% subsample of the training rows, which is
# what the scikit-learn ensemble is fitted on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, train_size=0.3, random_state=42)

# The fastai pipeline is built from the full `df` (not from X) with its own
# 80/20 random split. The splitter is seeded so repeated runs train and
# validate on the same rows, consistent with random_state=42 used above.
to = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names='TOTAL_PAY',
    splits=RandomSplitter(valid_pct=0.2, seed=42)(range_of(df)),
)
dls = to.dataloaders(bs=64)

# Tabular network with three hidden layers, trained for 9 one-cycle epochs
# and monitored with RMSE.
learn = tabular_learner(dls, layers=[200, 100, 50], metrics=rmse)
learn.fit_one_cycle(9)
37
+
38
class FastAIWrapper(BaseEstimator, RegressorMixin):
    """Adapter exposing a fastai learner through the scikit-learn regressor API.

    The wrapped learner is used as-is: ``fit`` does not train it, and
    ``predict`` runs the learner on the rows it is given.
    """

    def __init__(self, learn):
        # Kept under the same name as the constructor argument.
        self.learn = learn

    def fit(self, X, y):
        """Return ``self`` unchanged; ``X`` and ``y`` are ignored."""
        return self

    def predict(self, X):
        """Return the learner's predictions for ``X`` as a flat NumPy array."""
        # Build a test dataloader from X using the learner's own dataloaders.
        test_loader = self.learn.dls.test_dl(X)
        predictions, _targets = self.learn.get_preds(dl=test_loader)
        return predictions.numpy().flatten()
47
+
48
# ---------------------------------------------------------------------------
# scikit-learn preprocessing and the voting ensemble.
# ---------------------------------------------------------------------------

# Dense one-hot encoder for the categorical columns. scikit-learn 1.2 renamed
# the `sparse` keyword to `sparse_output` and 1.4 removed the old name, so try
# the current spelling first and fall back for older installations.
try:
    _one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
except TypeError:
    _one_hot_encoder = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')

# Scale the continuous columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cont_names),
        ('cat', _one_hot_encoder, cat_names),
    ])

# Base models: the fastai network plus two scikit-learn pipelines that share
# the same preprocessing definition.
model1 = FastAIWrapper(learn)
model2 = Pipeline([('preprocessor', preprocessor), ('regressor', LinearRegression())])
model3 = Pipeline([('preprocessor', preprocessor), ('regressor', DecisionTreeRegressor())])

# Weighted average of the three models; the fastai model counts double.
ensemble = VotingRegressor(
    estimators=[('fastai', model1), ('lr', model2), ('dt', model3)],
    weights=[2, 1, 1]
)

# Fit on the 30% subsample of the training split.
ensemble.fit(X_train_sample, y_train_sample)
64
+
65
def _first_mode(frames, column, default='Unknown'):
    """Return the most frequent non-null value of ``column`` from the first frame that has one."""
    for frame in frames:
        modes = frame[column].mode()
        # mode() ignores NaN, so it is empty for an empty or all-NaN column.
        if not modes.empty:
            return modes.iloc[0]
    return default


def _first_mean(frames, column):
    """Return the mean of ``column`` from the first frame where that mean is not NaN."""
    for frame in frames:
        average = frame[column].mean()
        if pd.notna(average):
            return average
    # No usable value at any level: leave it missing instead of inventing one.
    return float('nan')


def predict_total_pay(gender, job_title, ethnicity):
    """Predict total pay for one gender / job title / ethnicity combination.

    The remaining model features are filled in from the payroll data in
    ``df``: categorical features take the most frequent value and pay
    components take the mean, looked up in the most specific set of rows that
    provides a value:

    1. rows matching gender, job title and ethnicity,
    2. rows matching the job title only,
    3. the whole dataset.

    A level is skipped for a given column when it has no rows or only missing
    values in that column, so a sparse group no longer raises ``IndexError``
    from an empty ``mode()`` or feeds NaN means to the models.

    Args:
        gender: Value to match against the ``GENDER`` column.
        job_title: Value to match against the ``JOB_TITLE`` column.
        ethnicity: Value to match against the ``ETHNICITY`` column.

    Returns:
        The ensemble's prediction for the constructed single-row sample.
    """
    same_title = df['JOB_TITLE'] == job_title
    same_profile = (df['GENDER'] == gender) & same_title & (df['ETHNICITY'] == ethnicity)
    # Ordered from most to least specific.
    references = (df[same_profile], df[same_title], df)

    features = {'GENDER': gender, 'JOB_TITLE': job_title, 'ETHNICITY': ethnicity}
    for column in ('EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO'):
        # 'Unknown' is the same placeholder used for missing categories when
        # the training matrix X is built.
        features[column] = _first_mode(references, column)
    for column in ('REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY'):
        features[column] = _first_mean(references, column)
    # Predict for the most recent pay year present in the data.
    features['PAY_YEAR'] = df['PAY_YEAR'].max()

    sample = pd.DataFrame([features])
    # Same derived features as computed for the training data.
    sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
    sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']

    categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
    for col in categorical_columns:
        sample[col] = sample[col].astype('object')

    prediction = ensemble.predict(sample)[0]
    return prediction
106
+
107
def gradio_predict(gender, ethnicity, job_title):
    """Gradio callback: return the predicted total pay as a dollar string.

    The arguments arrive in the order of the UI inputs (gender, ethnicity,
    job title), which differs from the parameter order of
    ``predict_total_pay``; they are therefore forwarded by keyword.
    """
    amount = predict_total_pay(gender=gender, job_title=job_title, ethnicity=ethnicity)
    return f"${amount:.2f}"
110
+
111
# ---------------------------------------------------------------------------
# Gradio user interface.
# ---------------------------------------------------------------------------

# Dropdown choices are the distinct non-null values found in the data; only
# the job titles are sorted.
genders = df['GENDER'].dropna().unique().tolist()
ethnicities = df['ETHNICITY'].dropna().unique().tolist()
job_titles = df['JOB_TITLE'].dropna().unique().tolist()
job_titles = sorted(job_titles)

# Input widgets, in the order expected by gradio_predict.
_input_widgets = [
    gr.Dropdown(label="Gender", choices=genders),
    gr.Dropdown(label="Ethnicity", choices=ethnicities),
    gr.Dropdown(label="Job Title", choices=job_titles),
]
_output_widget = gr.Textbox(label="Predicted Total Pay")

iface = gr.Interface(
    fn=gradio_predict,
    inputs=_input_widgets,
    outputs=_output_widget,
    title="LA City Employee Pay Predictor",
    description="Predict the total pay for LA City employees based on gender, ethnicity, and job title.",
)

# Start the web app.
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ scikit-learn
4
+ fastai
5
+ gradio