Update app.py

app.py CHANGED
@@ -588,7 +588,6 @@ elif app_mode == "Advanced EDA":
         except Exception as e:
             st.error(f"An error occurred during the T-test: {e}")
 
-#MODEL TRAINING
 elif app_mode == "Model Training":
     st.title("🚂 Model Training")
 
@@ -612,9 +611,6 @@ elif app_mode == "Model Training":
 
     feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
 
-    # Hyperparameter tuning
-    tuning_method = st.selectbox("Hyperparameter Tuning Method", ["Grid Search", "Bayesian Optimization", "None"])
-
     if model_name == "Random Forest" and feature_columns:  # Check if Random Forest and features are selected
         min_features = 1  # Ensure at least one feature is used
         max_features = len(feature_columns) if len(feature_columns) > 0 else 1  # Use 1 if no features are selected
@@ -674,41 +670,43 @@ elif app_mode == "Model Training":
             X_train_selected = X_train_processed
             X_test_selected = X_test_processed
 
-
+            # Model Training and Hyperparameter Tuning
             if model_name == "Linear Regression":
                 model = LinearRegression()
+                model.fit(X_train_selected, y_train)
+
             elif model_name == "Logistic Regression":
                 model = LogisticRegression(max_iter=1000)
+                model.fit(X_train_selected, y_train)
             elif model_name == "Decision Tree":
                 if problem_type == "Regression":
                     model = DecisionTreeRegressor()
+                    model.fit(X_train_selected, y_train)
                 else:
                     model = DecisionTreeClassifier()
+                    model.fit(X_train_selected, y_train)
             elif model_name == "Random Forest":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    model =
-
-                else:
-                    model = RandomForestClassifier(random_state=42) #define if no param_grid
-                    model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined
+                if problem_type == "Regression":
+                    model = RandomForestRegressor(random_state=42)
+                    if 'param_grid' in locals():
+                        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error') # Example scoring
+                        grid_search.fit(X_train_selected, y_train)
+                        model = grid_search.best_estimator_
+                        st.write("Best Parameters:", grid_search.best_params_)
+                    else:
+                        model = RandomForestRegressor(random_state=42) #define if no param_grid
+                        model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
+
+                else:
+                    model = RandomForestClassifier(random_state=42)
+                    if 'param_grid' in locals():
+                        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
+                        grid_search.fit(X_train_selected, y_train)
+                        model = grid_search.best_estimator_
+                        st.write("Best Parameters:", grid_search.best_params_)
+                    else:
+                        model = RandomForestClassifier(random_state=42) #define if no param_grid
+                        model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined
             elif model_name == "Gradient Boosting":
                 from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
                 model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
@@ -727,10 +725,6 @@ elif app_mode == "Model Training":
 
             # Store model and preprocessor
             st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
-            st.session_state.preprocessor = preprocessor
-
-            # Store model and preprocessor
-            st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
             st.session_state.preprocessor = preprocessor
 
             # Model Evaluation
@@ -767,9 +761,9 @@ elif app_mode == "Model Training":
 
                 conf_matrix = confusion_matrix(y_test, y_pred)
 
-
+                #Heatmap
                 fig_conf, ax_conf = plt.subplots()
-                sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
+                sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)
                 ax_conf.set_xlabel('Predicted Labels')
                 ax_conf.set_ylabel('True Labels')
                 ax_conf.set_title('Confusion Matrix')
@@ -778,31 +772,38 @@ elif app_mode == "Model Training":
 
                 #Added section for model visualization
                 st.subheader("Model Visualization")
+                #Use conditional to make sure that everything only executes when the data set is trained and not outside of it.
+                if st.session_state.model is not None: #Make sure that everything only executes when data set is trained and not outside of it.
+                    try: #All the plotting code here.
+                        if model_name in ["Random Forest", "Gradient Boosting"]: #Used list to define models.
+
+                            #Make sure you use this inside of a conditional for classification, model, and tree based model.
 
-
-                if model_name in ["Random Forest", "Gradient Boosting"]: #Used list to define models.
-                    #Feature Importance (Tree-based Models) and model selected was good
-                    importances = model.feature_importances_ # Assumed tree-based model
-                    feat_importances = pd.Series(importances, index=X_train.columns)
-                    feat_importances = feat_importances.nlargest(20)
+                            #Feature Importance (Tree-based Models)
 
-
-
-
-                    ax_feat.set_ylabel('Features')
-                    ax_feat.set_title('Feature Importances')
-                    st.pyplot(fig_feat)
+                            importances = model.feature_importances_ # Assumed tree-based model
+                            feat_importances = pd.Series(importances, index=X_train.columns)
+                            feat_importances = feat_importances.nlargest(20)
 
+                            fig_feat, ax_feat = plt.subplots()
+                            feat_importances.plot(kind='barh', ax=ax_feat)
+                            ax_feat.set_xlabel('Relative Importance')
+                            ax_feat.set_ylabel('Features')
+                            ax_feat.set_title('Feature Importances')
+                            st.pyplot(fig_feat)
 
-
+                        #Create data that determines the learning and validation curve and what we have to add
                         train_sizes, train_scores, valid_scores = learning_curve(model, X_train_selected, y_train, cv=5, scoring='accuracy' if problem_type == "Classification" else 'neg_mean_squared_error', n_jobs=-1) #Define cross validation for run
 
-
+
+                        #Take and define what this is for from the results that has been generated
                         train_mean = np.mean(train_scores, axis=1)
                         train_std = np.std(train_scores, axis=1)
                         valid_mean = np.mean(valid_scores, axis=1)
                         valid_std = np.std(valid_scores, axis=1)
 
+                        #Plot each of the variables that has to be used.
+
                         fig_lc, ax_lc = plt.subplots()
                         ax_lc.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training ' + ('Accuracy' if problem_type == "Classification" else "Neg MSE"))
                         ax_lc.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
@@ -815,8 +816,12 @@ elif app_mode == "Model Training":
                         ax_lc.legend(loc='best')
                         st.pyplot(fig_lc)
 
-
-
+
+                    except Exception as e: #Local error
+                        st.write(f"Visuals are only available for tree based models or if models are selected prior: {e}") #Write only if error
+
+
+
 
             except Exception as e:
                 st.error(f"An error occurred: {e}")
@@ -856,7 +861,6 @@ elif app_mode == "Model Training":
             from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
             accuracy = accuracy_score(y_test, y_pred)
             st.write(f"Accuracy: {accuracy:.4f}")
-
         except Exception as e: #local error
             st.error(f"An error occurred during model evaluation: {e}")
 
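Notes — the sketches below are illustrative and not part of the commit.

The Random Forest branch now tunes hyperparameters with GridSearchCV whenever a param_grid exists in scope, then keeps the best estimator. A minimal standalone sketch of that pattern, assuming scikit-learn; the dataset and param_grid values are illustrative, not taken from app.py:

# Minimal sketch of the GridSearchCV pattern used in the Random Forest branch.
# The dataset and param_grid below are illustrative, not taken from app.py.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

X, y = make_classification(n_samples=300, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

param_grid = {"n_estimators": [100, 200], "max_depth": [None, 5]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           param_grid, cv=3, scoring="accuracy")
grid_search.fit(X_train, y_train)

model = grid_search.best_estimator_  # refit on the full training split by default
print("Best Parameters:", grid_search.best_params_)
print("Test accuracy:", model.score(X_test, y_test))

Guarding the tuning with 'param_grid' in locals(), as the commit does, silently skips the search when no grid is defined; an explicit flag or a default grid would make that path easier to follow.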
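For the confusion-matrix plot, seaborn's heatmap takes the target axes through the ax= keyword (passed positionally, the axes object would not be interpreted as the axes). A short self-contained sketch with illustrative labels:

# Confusion-matrix heatmap sketch; y_test and y_pred are illustrative.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

y_test = [0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 1, 1]

conf_matrix = confusion_matrix(y_test, y_pred)
fig_conf, ax_conf = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)
ax_conf.set_xlabel('Predicted Labels')
ax_conf.set_ylabel('True Labels')
ax_conf.set_title('Confusion Matrix')
plt.show()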
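The visualization block ranks model.feature_importances_ with pandas nlargest and draws a horizontal bar chart. A self-contained sketch of the same plot on synthetic data; the column names are illustrative:

# Feature-importance plot sketch for a tree-based model; data is synthetic.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=8, random_state=42)
X_train = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(8)])

model = RandomForestClassifier(random_state=42).fit(X_train, y)
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns).nlargest(20)

fig_feat, ax_feat = plt.subplots()
feat_importances.plot(kind='barh', ax=ax_feat)
ax_feat.set_xlabel('Relative Importance')
ax_feat.set_ylabel('Features')
ax_feat.set_title('Feature Importances')
plt.show()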
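The learning-curve section shades one standard deviation around the mean training and validation scores. A runnable sketch of that plot on synthetic data, assuming a classification problem so scoring='accuracy':

# Learning-curve plot sketch on synthetic classification data.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve

X, y = make_classification(n_samples=300, random_state=42)
train_sizes, train_scores, valid_scores = learning_curve(
    RandomForestClassifier(random_state=42), X, y, cv=5, scoring='accuracy', n_jobs=-1)

train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
valid_mean, valid_std = np.mean(valid_scores, axis=1), np.std(valid_scores, axis=1)

fig_lc, ax_lc = plt.subplots()
ax_lc.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training Accuracy')
ax_lc.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
ax_lc.plot(train_sizes, valid_mean, color='green', marker='s', markersize=5, label='Validation Accuracy')
ax_lc.fill_between(train_sizes, valid_mean + valid_std, valid_mean - valid_std, alpha=0.15, color='green')
ax_lc.set_xlabel('Training Set Size')
ax_lc.set_ylabel('Accuracy')
ax_lc.legend(loc='best')
plt.show()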