Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files
- LosAngelesPayPredictor.py +127 -0
- requirements.txt +5 -0
LosAngelesPayPredictor.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
4 |
+
from sklearn.model_selection import train_test_split
|
5 |
+
from fastai.tabular.all import *
|
6 |
+
from sklearn.ensemble import VotingRegressor
|
7 |
+
from sklearn.linear_model import LinearRegression
|
8 |
+
from sklearn.tree import DecisionTreeRegressor
|
9 |
+
from sklearn.base import BaseEstimator, RegressorMixin
|
10 |
+
from sklearn.compose import ColumnTransformer
|
11 |
+
from sklearn.pipeline import Pipeline
|
12 |
+
import gradio as gr
|
13 |
+
|
14 |
+
# Load the LA city payroll export. low_memory=False is the pandas option that
# disables chunked dtype inference, which avoids mixed-dtype columns.
df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
# Turn any +/-inf already present in the raw data into NaN.
df = df.replace([np.inf, -np.inf], np.nan)

# Categorical and continuous feature columns shared by every model in this script.
cat_names = ['EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'GENDER', 'ETHNICITY', 'JOB_TITLE', 'DEPARTMENT_NO']
cont_names = ['PAY_YEAR', 'REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY']

# Engineered features: regular pay relative to non-regular pay, and the
# non-regular total itself. The "+ 1" keeps the denominator away from zero
# when both non-regular components are 0.
df['PAY_RATIO'] = df['REGULAR_PAY'] / (df['OVERTIME_PAY'] + df['ALL_OTHER_PAY'] + 1)
df['TOTAL_NON_REGULAR_PAY'] = df['OVERTIME_PAY'] + df['ALL_OTHER_PAY']
# The inf clean-up above ran before PAY_RATIO existed. The denominator is still
# exactly 0 when the non-regular components sum to -1, which yields +/-inf, so
# the derived column is scrubbed as well.
df['PAY_RATIO'] = df['PAY_RATIO'].replace([np.inf, -np.inf], np.nan)
cont_names.extend(['PAY_RATIO', 'TOTAL_NON_REGULAR_PAY'])
|
23 |
+
|
24 |
+
# Feature matrix / target used by the scikit-learn side of the ensemble.
_feature_columns = cat_names + cont_names
X = df[_feature_columns].copy()
y = df['TOTAL_PAY'].copy()
# Missing categorical values become the explicit placeholder 'Unknown'.
X = X.assign(**{col: X[col].fillna('Unknown') for col in cat_names})

# 80/20 train/test split, then a 30% subsample of the training part, which is
# what the ensemble is actually fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train_sample, _, y_train_sample, _ = train_test_split(
    X_train, y_train, train_size=0.3, random_state=42
)
|
31 |
+
|
32 |
+
# fastai tabular pipeline built directly from df (not from X): categorify the
# categorical columns, fill missing values and normalize the continuous ones.
# NOTE: the RandomSplitter is created without a seed, so the validation rows
# are not tied to the random_state=42 splits used for the scikit-learn models.
_tabular_procs = [Categorify, FillMissing, Normalize]
_row_splits = RandomSplitter(valid_pct=0.2)(range_of(df))
to = TabularPandas(
    df,
    procs=_tabular_procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names='TOTAL_PAY',
    splits=_row_splits,
)
dls = to.dataloaders(bs=64)

# Three hidden layers (200 -> 100 -> 50), tracked with RMSE, trained for
# nine one-cycle epochs.
learn = tabular_learner(dls, layers=[200, 100, 50], metrics=rmse)
learn.fit_one_cycle(9)
|
37 |
+
|
38 |
+
class FastAIWrapper(BaseEstimator, RegressorMixin):
    """Expose a fastai learner through the scikit-learn regressor interface.

    The wrapper does no training of its own: ``fit`` is a no-op, and
    ``predict`` delegates to the wrapped learner.
    """

    def __init__(self, learn):
        # Stored under the constructor argument's name so that scikit-learn's
        # get_params()/clone() can find it.
        self.learn = learn

    def fit(self, X, y):
        """Return ``self`` unchanged; ``X`` and ``y`` are ignored."""
        return self

    def predict(self, X):
        """Return the learner's predictions for ``X`` as a flat NumPy array."""
        test_loader = self.learn.dls.test_dl(X)
        predictions, _targets = self.learn.get_preds(dl=test_loader)
        return predictions.numpy().flatten()
|
47 |
+
|
48 |
+
def _make_one_hot_encoder():
    """Build a dense one-hot encoder that works across scikit-learn versions.

    The ``sparse`` keyword was renamed to ``sparse_output`` in scikit-learn 1.2
    and removed in 1.4, so the new name is tried first and the old one is used
    only when the installed version rejects it.
    """
    options = {'drop': 'first', 'handle_unknown': 'ignore'}
    try:
        return OneHotEncoder(sparse_output=False, **options)
    except TypeError:
        # Older scikit-learn (< 1.2) only knows the legacy keyword.
        return OneHotEncoder(sparse=False, **options)


# Preprocessing for the scikit-learn models: standardize the continuous
# columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cont_names),
        ('cat', _make_one_hot_encoder(), cat_names),
    ])

# The three ensemble members: the fastai network trained earlier in the
# script, a linear regression and a decision tree.
model1 = FastAIWrapper(learn)
model2 = Pipeline([('preprocessor', preprocessor), ('regressor', LinearRegression())])
model3 = Pipeline([('preprocessor', preprocessor), ('regressor', DecisionTreeRegressor())])

# Weighted average of the three predictions; the fastai model counts double.
ensemble = VotingRegressor(
    estimators=[('fastai', model1), ('lr', model2), ('dt', model3)],
    weights=[2, 1, 1]
)

# Fit on the 30% training subsample.
ensemble.fit(X_train_sample, y_train_sample)
|
64 |
+
|
65 |
+
def _most_common(series, default='Unknown'):
    """Return the most frequent non-null value of ``series``.

    ``Series.mode()`` is empty when every value is missing (or the series is
    empty); ``default`` is returned in that case instead of raising IndexError.
    """
    modes = series.mode()
    return modes.iloc[0] if len(modes) > 0 else default


def predict_total_pay(gender, job_title, ethnicity):
    """Predict total pay for a gender / job title / ethnicity combination.

    The model needs more features than the three supplied, so the remaining
    ones are imputed from a reference group of rows in ``df``, chosen as the
    first non-empty of:

    1. rows matching gender, job title and ethnicity;
    2. rows matching the job title only;
    3. the whole data set.

    Categorical features take the reference group's most frequent value, or
    ``'Unknown'`` (the placeholder used for missing categories in the training
    features) when the group has no non-null value. Pay components take the
    group's mean. ``PAY_YEAR`` is always the latest year in ``df``.

    Args:
        gender: Value to match against the ``GENDER`` column.
        job_title: Value to match against the ``JOB_TITLE`` column.
        ethnicity: Value to match against the ``ETHNICITY`` column.

    Returns:
        The ensemble's prediction for the single constructed row.
    """
    exact_match = df[(df['GENDER'] == gender) & (df['JOB_TITLE'] == job_title) & (df['ETHNICITY'] == ethnicity)]
    if len(exact_match) > 0:
        reference = exact_match
    else:
        same_title = df[df['JOB_TITLE'] == job_title]
        reference = same_title if len(same_title) > 0 else df

    sample = pd.DataFrame({
        'GENDER': [gender],
        'JOB_TITLE': [job_title],
        'ETHNICITY': [ethnicity],
    })
    for col in ('EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO'):
        sample[col] = [_most_common(reference[col])]
    for col in ('REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY'):
        sample[col] = [reference[col].mean()]

    sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]
    # Same engineered features as the ones computed on df for training.
    sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
    sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']

    # Force object dtype on every categorical column before handing the row
    # to the ensemble.
    categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
    for col in categorical_columns:
        sample[col] = sample[col].astype('object')

    prediction = ensemble.predict(sample)[0]
    return prediction
|
106 |
+
|
107 |
+
def gradio_predict(gender, ethnicity, job_title):
    """Gradio callback: return the predicted total pay as a dollar string.

    The UI passes its inputs in the order (gender, ethnicity, job_title),
    whereas ``predict_total_pay`` takes (gender, job_title, ethnicity), so the
    arguments are forwarded by keyword.
    """
    predicted_pay = predict_total_pay(
        gender=gender,
        job_title=job_title,
        ethnicity=ethnicity,
    )
    return f"${predicted_pay:.2f}"
|
110 |
+
|
111 |
+
def _distinct_values(column):
    """Return the non-null distinct values of ``df[column]`` as a plain list."""
    return df[column].dropna().unique().tolist()


# Dropdown choices come straight from the data; only job titles are sorted.
genders = _distinct_values('GENDER')
ethnicities = _distinct_values('ETHNICITY')
job_titles = _distinct_values('JOB_TITLE')
job_titles = sorted(job_titles)

# Input order must match gradio_predict(gender, ethnicity, job_title).
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[
        gr.Dropdown(choices=genders, label="Gender"),
        gr.Dropdown(choices=ethnicities, label="Ethnicity"),
        gr.Dropdown(choices=job_titles, label="Job Title"),
    ],
    outputs=gr.Textbox(label="Predicted Total Pay"),
    title="LA City Employee Pay Predictor",
    description="Predict the total pay for LA City employees based on gender, ethnicity, and job title.",
)

iface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
numpy
|
3 |
+
scikit-learn
|
4 |
+
fastai
|
5 |
+
gradio
|