Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -594,6 +594,14 @@ elif app_mode == "Model Training":
|
|
594 |
if st.session_state.cleaned_data is not None:
|
595 |
df = st.session_state.cleaned_data.copy()
|
596 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
597 |
# Target Variable Selection
|
598 |
target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
|
599 |
|
@@ -613,30 +621,35 @@ elif app_mode == "Model Training":
|
|
613 |
|
614 |
feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
|
615 |
|
616 |
-
|
617 |
param_grid = {} # Initialize to empty dictionary
|
|
|
|
|
|
|
618 |
if model_name == "Random Forest":
|
619 |
st.subheader("Random Forest Hyperparameters")
|
620 |
param_grid = {
|
621 |
-
'n_estimators': list(range(
|
622 |
-
|
623 |
-
'
|
624 |
-
'
|
|
|
625 |
}
|
626 |
|
627 |
elif model_name == "Gradient Boosting":
|
628 |
-
|
629 |
-
|
630 |
-
'n_estimators': list(range(
|
631 |
-
'learning_rate': [
|
632 |
-
'max_depth': list(range(
|
|
|
633 |
}
|
634 |
|
635 |
elif model_name == "Decision Tree":
|
636 |
st.subheader("Decision Tree Hyperparameters")
|
637 |
param_grid = {
|
638 |
-
'criterion':
|
639 |
-
'max_depth': list(range(
|
640 |
}
|
641 |
|
642 |
# Train-Test Split
|
@@ -691,13 +704,18 @@ elif app_mode == "Model Training":
|
|
691 |
# Model Training and Hyperparameter Tuning
|
692 |
if model_name == "Linear Regression":
|
693 |
model = LinearRegression()
|
|
|
|
|
694 |
elif model_name == "Logistic Regression":
|
695 |
model = LogisticRegression(max_iter=1000)
|
|
|
696 |
elif model_name == "Decision Tree":
|
697 |
if problem_type == "Regression":
|
698 |
model = DecisionTreeRegressor()
|
|
|
699 |
else:
|
700 |
model = DecisionTreeClassifier()
|
|
|
701 |
elif model_name == "Random Forest":
|
702 |
if problem_type == "Regression":
|
703 |
model = RandomForestRegressor(random_state=42)
|
@@ -707,6 +725,7 @@ elif app_mode == "Model Training":
|
|
707 |
model = grid_search.best_estimator_
|
708 |
st.write("Best Parameters:", grid_search.best_params_)
|
709 |
else:
|
|
|
710 |
model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
|
711 |
|
712 |
else:
|
@@ -739,6 +758,12 @@ elif app_mode == "Model Training":
|
|
739 |
st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
|
740 |
st.session_state.preprocessor = preprocessor
|
741 |
|
|
|
|
|
|
|
|
|
|
|
|
|
742 |
# Model Evaluation
|
743 |
y_pred = model.predict(X_test_selected)
|
744 |
if problem_type == "Regression":
|
@@ -773,9 +798,9 @@ elif app_mode == "Model Training":
|
|
773 |
|
774 |
conf_matrix = confusion_matrix(y_test, y_pred)
|
775 |
|
776 |
-
#
|
777 |
fig_conf, ax_conf = plt.subplots()
|
778 |
-
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)
|
779 |
ax_conf.set_xlabel('Predicted Labels')
|
780 |
ax_conf.set_ylabel('True Labels')
|
781 |
ax_conf.set_title('Confusion Matrix')
|
@@ -788,7 +813,7 @@ elif app_mode == "Model Training":
|
|
788 |
if st.session_state.model is not None: #Make sure that everything only executes when data set is trained and not outside of it.
|
789 |
try: #All the plotting code here.
|
790 |
if model_name in ["Random Forest", "Gradient Boosting"]:#Used list to define models.
|
791 |
-
|
792 |
|
793 |
#Feature Importance (Tree-based Models)
|
794 |
|
@@ -806,7 +831,7 @@ elif app_mode == "Model Training":
|
|
806 |
#Create data that determines the learning and validation curve and what we have to add
|
807 |
train_sizes, train_scores, valid_scores = learning_curve(model, X_train_selected, y_train, cv=5, scoring='accuracy' if problem_type =="Classification" else 'neg_mean_squared_error', n_jobs=-1) #Define cross validation for run
|
808 |
|
809 |
-
#
|
810 |
train_mean = np.mean(train_scores, axis=1)
|
811 |
train_std = np.std(train_scores, axis=1)
|
812 |
valid_mean = np.mean(valid_scores, axis=1)
|
@@ -852,20 +877,19 @@ elif app_mode == "Model Training":
|
|
852 |
except Exception as e:
|
853 |
st.error(f"Error loading model: {e}")
|
854 |
|
855 |
-
#Model Evaluation Section
|
856 |
-
if
|
857 |
-
try:
|
858 |
-
|
859 |
-
y_pred = st.session_state.model.predict(X_test)
|
860 |
|
861 |
if problem_type == "Regression":
|
862 |
-
mse = mean_squared_error(y_test, y_pred)
|
863 |
-
r2 = r2_score(y_test, y_pred)
|
864 |
st.write(f"Mean Squared Error: {mse:.4f}")
|
865 |
st.write(f"R-squared: {r2:.4f}")
|
866 |
else:
|
867 |
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
|
868 |
-
accuracy = accuracy_score(y_test, y_pred)
|
869 |
st.write(f"Accuracy: {accuracy:.4f}")
|
870 |
except Exception as e: #local error
|
871 |
st.error(f"An error occurred during model evaluation: {e}")
|
|
|
594 |
if st.session_state.cleaned_data is not None:
|
595 |
df = st.session_state.cleaned_data.copy()
|
596 |
|
597 |
+
# Initialize session state for train/test split
|
598 |
+
if 'X_train_selected' not in st.session_state:
|
599 |
+
st.session_state.X_train_selected = None
|
600 |
+
st.session_state.X_test_selected = None
|
601 |
+
st.session_state.y_train = None
|
602 |
+
st.session_state.y_test = None
|
603 |
+
st.session_state.model = None # Initialize model in session state
|
604 |
+
|
605 |
# Target Variable Selection
|
606 |
target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
|
607 |
|
|
|
621 |
|
622 |
feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
|
623 |
|
624 |
+
# Hyperparameter Tuning - Dynamic based on Model Selection
|
625 |
param_grid = {} # Initialize to empty dictionary
|
626 |
+
|
627 |
+
#Define fixed parameter values for each model so that it trains. These values are not optimized.
|
628 |
+
#The goal is to make sure that all visualizations and graphs work as is.
|
629 |
if model_name == "Random Forest":
|
630 |
st.subheader("Random Forest Hyperparameters")
|
631 |
param_grid = {
|
632 |
+
'n_estimators': list(range(100, 101)), #Used 100 so model is trained and not empty and all visuals work
|
633 |
+
|
634 |
+
'max_depth': list(range(10,11)), #default value 10 so its in model
|
635 |
+
'min_samples_split': list(range(2,3)), #New hyperparameter default 2
|
636 |
+
'min_samples_leaf': list(range(1,2)), #New hyperparameter default 1
|
637 |
}
|
638 |
|
639 |
elif model_name == "Gradient Boosting":
|
640 |
+
st.subheader("Gradient Boosting Hyperparameters")
|
641 |
+
param_grid = {
|
642 |
+
'n_estimators': list(range(100, 101)),
|
643 |
+
'learning_rate': [0.1],
|
644 |
+
'max_depth': list(range(3,4))
|
645 |
+
|
646 |
}
|
647 |
|
648 |
elif model_name == "Decision Tree":
|
649 |
st.subheader("Decision Tree Hyperparameters")
|
650 |
param_grid = {
|
651 |
+
'criterion': ["gini"],
|
652 |
+
'max_depth': list(range(3,4)),
|
653 |
}
|
654 |
|
655 |
# Train-Test Split
|
|
|
704 |
# Model Training and Hyperparameter Tuning
|
705 |
if model_name == "Linear Regression":
|
706 |
model = LinearRegression()
|
707 |
+
model.fit(X_train_selected, y_train)
|
708 |
+
|
709 |
elif model_name == "Logistic Regression":
|
710 |
model = LogisticRegression(max_iter=1000)
|
711 |
+
model.fit(X_train_selected, y_train)
|
712 |
elif model_name == "Decision Tree":
|
713 |
if problem_type == "Regression":
|
714 |
model = DecisionTreeRegressor()
|
715 |
+
model.fit(X_train_selected, y_train)
|
716 |
else:
|
717 |
model = DecisionTreeClassifier()
|
718 |
+
model.fit(X_train_selected, y_train)
|
719 |
elif model_name == "Random Forest":
|
720 |
if problem_type == "Regression":
|
721 |
model = RandomForestRegressor(random_state=42)
|
|
|
725 |
model = grid_search.best_estimator_
|
726 |
st.write("Best Parameters:", grid_search.best_params_)
|
727 |
else:
|
728 |
+
model = RandomForestRegressor(random_state=42) #define if no param_grid
|
729 |
model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
|
730 |
|
731 |
else:
|
|
|
758 |
st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
|
759 |
st.session_state.preprocessor = preprocessor
|
760 |
|
761 |
+
#Store the train/test split data in session state
|
762 |
+
st.session_state.X_train_selected = X_train_selected
|
763 |
+
st.session_state.X_test_selected = X_test_selected
|
764 |
+
st.session_state.y_train = y_train
|
765 |
+
st.session_state.y_test = y_test
|
766 |
+
|
767 |
# Model Evaluation
|
768 |
y_pred = model.predict(X_test_selected)
|
769 |
if problem_type == "Regression":
|
|
|
798 |
|
799 |
conf_matrix = confusion_matrix(y_test, y_pred)
|
800 |
|
801 |
+
#Heatmap
|
802 |
fig_conf, ax_conf = plt.subplots()
|
803 |
+
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)
|
804 |
ax_conf.set_xlabel('Predicted Labels')
|
805 |
ax_conf.set_ylabel('True Labels')
|
806 |
ax_conf.set_title('Confusion Matrix')
|
|
|
813 |
if st.session_state.model is not None: #Make sure that everything only executes when data set is trained and not outside of it.
|
814 |
try: #All the plotting code here.
|
815 |
if model_name in ["Random Forest", "Gradient Boosting"]:#Used list to define models.
|
816 |
+
#Make sure you use this inside of a conditional for classification, model, and tree based model.
|
817 |
|
818 |
#Feature Importance (Tree-based Models)
|
819 |
|
|
|
831 |
#Create data that determines the learning and validation curve and what we have to add
|
832 |
train_sizes, train_scores, valid_scores = learning_curve(model, X_train_selected, y_train, cv=5, scoring='accuracy' if problem_type =="Classification" else 'neg_mean_squared_error', n_jobs=-1) #Define cross validation for run
|
833 |
|
834 |
+
#Then add a plot for the learning curve and use st.pyplot
|
835 |
train_mean = np.mean(train_scores, axis=1)
|
836 |
train_std = np.std(train_scores, axis=1)
|
837 |
valid_mean = np.mean(valid_scores, axis=1)
|
|
|
877 |
except Exception as e:
|
878 |
st.error(f"Error loading model: {e}")
|
879 |
|
880 |
+
#Model Evaluation Section - run on the saved model
|
881 |
+
if st.session_state.model is not None and st.session_state.X_test_selected is not None: # added check to make sure it is a loaded model
|
882 |
+
try:
|
883 |
+
y_pred = st.session_state.model.predict(st.session_state.X_test_selected) # load from stored
|
|
|
884 |
|
885 |
if problem_type == "Regression":
|
886 |
+
mse = mean_squared_error(st.session_state.y_test, y_pred)
|
887 |
+
r2 = r2_score(st.session_state.y_test, y_pred)
|
888 |
st.write(f"Mean Squared Error: {mse:.4f}")
|
889 |
st.write(f"R-squared: {r2:.4f}")
|
890 |
else:
|
891 |
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
|
892 |
+
accuracy = accuracy_score(st.session_state.y_test, y_pred)
|
893 |
st.write(f"Accuracy: {accuracy:.4f}")
|
894 |
except Exception as e: #local error
|
895 |
st.error(f"An error occurred during model evaluation: {e}")
|