Final Updates
Browse files
- .streamlit/config.toml +9 -0
- app.py +7 -6
- pipeline.pkl +2 -2
- rfc_model.pkl +2 -2
- transformers.py +4 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[global]
|
2 |
+
developmentMode = false
|
3 |
+
|
4 |
+
[theme]
|
5 |
+
base = "light"
|
6 |
+
primaryColor = "#AEDFF7"
|
7 |
+
backgroundColor = "#FFFFFF"
|
8 |
+
secondaryBackgroundColor = "#AEDFF7"
|
9 |
+
textColor = "#000000"
|
app.py
CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
|
|
2 |
import pandas as pd
|
3 |
import pickle
|
4 |
import os
|
|
|
5 |
|
6 |
# Load the model and encoder
|
7 |
SRC = os.path.abspath('.')
|
@@ -82,20 +83,20 @@ if options == "Prediction":
|
|
82 |
losses = st.number_input("Losses", min_value=0)
|
83 |
stocks_status = st.number_input("Stocks Status", min_value=0)
|
84 |
citizenship = st.selectbox("Citizenship", ['citizen', 'foreigner'])
|
85 |
-
|
86 |
|
87 |
if st.button('Predict Income Level'):
|
88 |
input_data = pd.DataFrame([[
|
89 |
age, gender, education, worker_class, marital_status, race, is_hispanic, employment_commitment,
|
90 |
employment_stat, wage_per_hour, working_week_per_year, industry_code, industry_code_main, occupation_code,
|
91 |
occupation_code_main, total_employed, household_summary, vet_benefit, tax_status, gains, losses,
|
92 |
-
stocks_status, citizenship
|
93 |
]], columns=[
|
94 |
'age', 'gender', 'education', 'worker_class', 'marital_status', 'race', 'is_hispanic',
|
95 |
'employment_commitment', 'employment_stat', 'wage_per_hour', 'working_week_per_year',
|
96 |
'industry_code', 'industry_code_main', 'occupation_code', 'occupation_code_main', 'total_employed',
|
97 |
'household_summary', 'vet_benefit', 'tax_status', 'gains', 'losses', 'stocks_status',
|
98 |
-
'citizenship'
|
99 |
])
|
100 |
|
101 |
# Preprocess the input data through the pipeline before making predictions
|
@@ -120,10 +121,10 @@ elif options == "Model Information":
|
|
120 |
- The Random Forest is a versatile and robust machine learning method that combines multiple decision trees to produce more accurate and stable predictions. It's known for its high accuracy, ability to handle large datasets with higher dimensionality, and its robustness to overfitting.
|
121 |
|
122 |
- **Training Data:**
|
123 |
-
-
|
124 |
|
125 |
-
- **
|
126 |
-
- With an
|
127 |
|
128 |
- **What It Aims to Solve:**
|
129 |
- **Economic Research:** Assists in socio-economic studies, understanding income distribution, and identifying key factors influencing income levels.
|
|
|
2 |
import pandas as pd
|
3 |
import pickle
|
4 |
import os
|
5 |
+
from transformers import log_transform
|
6 |
|
7 |
# Load the model and encoder
|
8 |
SRC = os.path.abspath('.')
|
|
|
83 |
losses = st.number_input("Losses", min_value=0)
|
84 |
stocks_status = st.number_input("Stocks Status", min_value=0)
|
85 |
citizenship = st.selectbox("Citizenship", ['citizen', 'foreigner'])
|
86 |
+
|
87 |
|
88 |
if st.button('Predict Income Level'):
|
89 |
input_data = pd.DataFrame([[
|
90 |
age, gender, education, worker_class, marital_status, race, is_hispanic, employment_commitment,
|
91 |
employment_stat, wage_per_hour, working_week_per_year, industry_code, industry_code_main, occupation_code,
|
92 |
occupation_code_main, total_employed, household_summary, vet_benefit, tax_status, gains, losses,
|
93 |
+
stocks_status, citizenship
|
94 |
]], columns=[
|
95 |
'age', 'gender', 'education', 'worker_class', 'marital_status', 'race', 'is_hispanic',
|
96 |
'employment_commitment', 'employment_stat', 'wage_per_hour', 'working_week_per_year',
|
97 |
'industry_code', 'industry_code_main', 'occupation_code', 'occupation_code_main', 'total_employed',
|
98 |
'household_summary', 'vet_benefit', 'tax_status', 'gains', 'losses', 'stocks_status',
|
99 |
+
'citizenship'
|
100 |
])
|
101 |
|
102 |
# Preprocess the input data through the pipeline before making predictions
|
|
|
121 |
- The Random Forest is a versatile and robust machine learning method that combines multiple decision trees to produce more accurate and stable predictions. It's known for its high accuracy, ability to handle large datasets with higher dimensionality, and its robustness to overfitting.
|
122 |
|
123 |
- **Training Data:**
|
124 |
+
- The model is trained on comprehensive census data, encompassing a wide range of features such as age, education, marital status, race, occupation, and more. This rich dataset ensures a nuanced understanding of the socio-economic factors influencing income levels.
|
125 |
|
126 |
+
- **F1 Score:** 98%
|
127 |
+
- With an F1 score of 98%, the model stands as a reliable predictor, demonstrating its effectiveness in understanding and categorizing income levels.
|
128 |
|
129 |
- **What It Aims to Solve:**
|
130 |
- **Economic Research:** Assists in socio-economic studies, understanding income distribution, and identifying key factors influencing income levels.
|
pipeline.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5798dfdbe5793f5277903f528f13beb821822483d09768f514183eab7a6c7335
|
3 |
+
size 5296
|
rfc_model.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28b7b8c02725a3b1914fc171cdf0a03ac31360a2bc55515832273809804203ef
|
3 |
+
size 353869321
|
transformers.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
def log_transform(x):
    """Return the natural logarithm of ``x + 1``, element-wise.

    Uses ``np.log1p`` rather than ``np.log(x + 1)``: the two are
    mathematically identical, but ``log1p`` stays accurate when ``x`` is
    very close to zero, where ``1 + x`` would round to exactly ``1`` and
    the result would collapse to ``0``.

    Args:
        x: A number or array-like of numbers accepted by NumPy ufuncs.
            NOTE(review): presumably non-negative feature values; values
            <= -1 yield ``-inf``/``nan`` exactly as before -- confirm
            against the caller.

    Returns:
        ``log(1 + x)`` with the same shape as ``x``.
    """
    return np.log1p(x)
|