Feiiisal committed on
Commit
e2e32ef
·
1 Parent(s): c6e9cf0

Final Updates

Browse files
Files changed (5) hide show
  1. .streamlit/config.toml +9 -0
  2. app.py +7 -6
  3. pipeline.pkl +2 -2
  4. rfc_model.pkl +2 -2
  5. transformers.py +4 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [global]
2
+ developmentMode = false
3
+
4
+ [theme]
5
+ base = "light"
6
+ primaryColor = "#AEDFF7"
7
+ backgroundColor = "#FFFFFF"
8
+ secondaryBackgroundColor = "#AEDFF7"
9
+ textColor = "#000000"
app.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  import pandas as pd
3
  import pickle
4
  import os
 
5
 
6
  # Load the model and encoder
7
  SRC = os.path.abspath('.')
@@ -82,20 +83,20 @@ if options == "Prediction":
82
  losses = st.number_input("Losses", min_value=0)
83
  stocks_status = st.number_input("Stocks Status", min_value=0)
84
  citizenship = st.selectbox("Citizenship", ['citizen', 'foreigner'])
85
- importance_of_record = st.number_input("Importance of Record", min_value=0.0, format='%f')
86
 
87
  if st.button('Predict Income Level'):
88
  input_data = pd.DataFrame([[
89
  age, gender, education, worker_class, marital_status, race, is_hispanic, employment_commitment,
90
  employment_stat, wage_per_hour, working_week_per_year, industry_code, industry_code_main, occupation_code,
91
  occupation_code_main, total_employed, household_summary, vet_benefit, tax_status, gains, losses,
92
- stocks_status, citizenship, importance_of_record
93
  ]], columns=[
94
  'age', 'gender', 'education', 'worker_class', 'marital_status', 'race', 'is_hispanic',
95
  'employment_commitment', 'employment_stat', 'wage_per_hour', 'working_week_per_year',
96
  'industry_code', 'industry_code_main', 'occupation_code', 'occupation_code_main', 'total_employed',
97
  'household_summary', 'vet_benefit', 'tax_status', 'gains', 'losses', 'stocks_status',
98
- 'citizenship', 'importance_of_record'
99
  ])
100
 
101
  # Preprocess the input data through the pipeline before making predictions
@@ -120,10 +121,10 @@ elif options == "Model Information":
120
  - The Random Forest is a versatile and robust machine learning method that combines multiple decision trees to produce more accurate and stable predictions. It's known for its high accuracy, ability to handle large datasets with higher dimensionality, and its robustness to overfitting.
121
 
122
  - **Training Data:**
123
- - Our model is trained on comprehensive census data, encompassing a wide range of features such as age, education, marital status, race, occupation, and more. This rich dataset ensures a nuanced understanding of the socio-economic factors influencing income levels.
124
 
125
- - **Accuracy:** 94%
126
- - With an accuracy of 94%, our model stands as a reliable predictor, demonstrating its effectiveness in understanding and categorizing income levels.
127
 
128
  - **What It Aims to Solve:**
129
  - **Economic Research:** Assists in socio-economic studies, understanding income distribution, and identifying key factors influencing income levels.
 
2
  import pandas as pd
3
  import pickle
4
  import os
5
+ from transformers import log_transform
6
 
7
  # Load the model and encoder
8
  SRC = os.path.abspath('.')
 
83
  losses = st.number_input("Losses", min_value=0)
84
  stocks_status = st.number_input("Stocks Status", min_value=0)
85
  citizenship = st.selectbox("Citizenship", ['citizen', 'foreigner'])
86
+
87
 
88
  if st.button('Predict Income Level'):
89
  input_data = pd.DataFrame([[
90
  age, gender, education, worker_class, marital_status, race, is_hispanic, employment_commitment,
91
  employment_stat, wage_per_hour, working_week_per_year, industry_code, industry_code_main, occupation_code,
92
  occupation_code_main, total_employed, household_summary, vet_benefit, tax_status, gains, losses,
93
+ stocks_status, citizenship
94
  ]], columns=[
95
  'age', 'gender', 'education', 'worker_class', 'marital_status', 'race', 'is_hispanic',
96
  'employment_commitment', 'employment_stat', 'wage_per_hour', 'working_week_per_year',
97
  'industry_code', 'industry_code_main', 'occupation_code', 'occupation_code_main', 'total_employed',
98
  'household_summary', 'vet_benefit', 'tax_status', 'gains', 'losses', 'stocks_status',
99
+ 'citizenship'
100
  ])
101
 
102
  # Preprocess the input data through the pipeline before making predictions
 
121
  - The Random Forest is a versatile and robust machine learning method that combines multiple decision trees to produce more accurate and stable predictions. It's known for its high accuracy, ability to handle large datasets with higher dimensionality, and its robustness to overfitting.
122
 
123
  - **Training Data:**
124
+ - The model is trained on comprehensive census data, encompassing a wide range of features such as age, education, marital status, race, occupation, and more. This rich dataset ensures a nuanced understanding of the socio-economic factors influencing income levels.
125
 
126
+ - **F1 Score:** 98%
127
+ - With an F1 score of 98%, the model stands as a reliable predictor, demonstrating its effectiveness in understanding and categorizing income levels.
128
 
129
  - **What It Aims to Solve:**
130
  - **Economic Research:** Assists in socio-economic studies, understanding income distribution, and identifying key factors influencing income levels.
pipeline.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd28c23cc70beba35906a28c1b6937630ffb3bb4a7c5e8cb8276f66a33eb60e4
3
- size 4811
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5798dfdbe5793f5277903f528f13beb821822483d09768f514183eab7a6c7335
3
+ size 5296
rfc_model.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd6ad029aa941d54353a585b399b77c15380a06cfa80a2c81faf9697effe0aac
3
- size 267723561
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b7b8c02725a3b1914fc171cdf0a03ac31360a2bc55515832273809804203ef
3
+ size 353869321
transformers.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import numpy as np
2
+
3
def log_transform(x):
    """Return the element-wise natural logarithm of ``1 + x``.

    ``np.log1p`` is used instead of ``np.log(x + 1)`` so that inputs very
    close to zero are not rounded away when 1 is added before the log.

    Args:
        x: Scalar or array-like accepted by NumPy. Values must be greater
            than -1 for the result to be finite.

    Returns:
        ``log(1 + x)`` computed element-wise, in the container type NumPy
        returns for ``x``.
    """
    return np.log1p(x)