Update app.py

app.py CHANGED
@@ -588,7 +588,6 @@ elif app_mode == "Advanced EDA":
         except Exception as e:
             st.error(f"An error occurred during the T-test: {e}")
 
-#MODEL TRAINING
 elif app_mode == "Model Training":
     st.title("🚂 Model Training")
 
@@ -612,9 +611,6 @@ elif app_mode == "Model Training":
 
     feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
 
-    # Hyperparameter tuning
-    tuning_method = st.selectbox("Hyperparameter Tuning Method", ["Grid Search", "Bayesian Optimization", "None"])
-
     if model_name == "Random Forest" and feature_columns:  # Check if Random Forest and features are selected
         min_features = 1  # Ensure at least one feature is used
         max_features = len(feature_columns) if len(feature_columns) > 0 else 1  # Use 1 if no features are selected
@@ -674,41 +670,43 @@ elif app_mode == "Model Training":
             X_train_selected = X_train_processed
             X_test_selected = X_test_processed
 
-
+            # Model Training and Hyperparameter Tuning
             if model_name == "Linear Regression":
                 model = LinearRegression()
+                model.fit(X_train_selected, y_train)
+
             elif model_name == "Logistic Regression":
                 model = LogisticRegression(max_iter=1000)
+                model.fit(X_train_selected, y_train)
             elif model_name == "Decision Tree":
                 if problem_type == "Regression":
                     model = DecisionTreeRegressor()
+                    model.fit(X_train_selected, y_train)
                 else:
                     model = DecisionTreeClassifier()
+                    model.fit(X_train_selected, y_train)
             elif model_name == "Random Forest":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    model =
-
-                else:
-                    model = RandomForestClassifier(random_state=42) #define if no param_grid
-                    model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined
+                if problem_type == "Regression":
+                    model = RandomForestRegressor(random_state=42)
+                    if 'param_grid' in locals():
+                        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error') # Example scoring
+                        grid_search.fit(X_train_selected, y_train)
+                        model = grid_search.best_estimator_
+                        st.write("Best Parameters:", grid_search.best_params_)
+                    else:
+                        model = RandomForestRegressor(random_state=42) #define if no param_grid
+                        model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
+
+                else:
+                    model = RandomForestClassifier(random_state=42)
+                    if 'param_grid' in locals():
+                        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
+                        grid_search.fit(X_train_selected, y_train)
+                        model = grid_search.best_estimator_
+                        st.write("Best Parameters:", grid_search.best_params_)
+                    else:
+                        model = RandomForestClassifier(random_state=42) #define if no param_grid
+                        model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined
             elif model_name == "Gradient Boosting":
                 from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
                 model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
@@ -727,10 +725,6 @@ elif app_mode == "Model Training":
 
             # Store model and preprocessor
             st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
-            st.session_state.preprocessor = preprocessor
-
-            # Store model and preprocessor
-            st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
             st.session_state.preprocessor = preprocessor
 
             # Model Evaluation
@@ -767,9 +761,9 @@ elif app_mode == "Model Training":
 
                 conf_matrix = confusion_matrix(y_test, y_pred)
 
-
+                #Heatmap
                 fig_conf, ax_conf = plt.subplots()
-                sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
+                sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)
                 ax_conf.set_xlabel('Predicted Labels')
                 ax_conf.set_ylabel('True Labels')
                 ax_conf.set_title('Confusion Matrix')
@@ -778,31 +772,38 @@ elif app_mode == "Model Training":
 
                 #Added section for model visualization
                 st.subheader("Model Visualization")
+                #Use conditional to make sure that everything only executes when the data set is trained and not outside of it.
+                if st.session_state.model is not None: #Make sure that everything only executes when data set is trained and not outside of it.
+                    try: #All the plotting code here.
+                        if model_name in ["Random Forest", "Gradient Boosting"]: #Used list to define models.
+
+                            #Make sure you use this inside of a conditional for classification, model, and tree based model.
 
-
-                if model_name in ["Random Forest", "Gradient Boosting"]: #Used list to define models.
-                    #Feature Importance (Tree-based Models) and model selected was good
-                    importances = model.feature_importances_ # Assumed tree-based model
-                    feat_importances = pd.Series(importances, index=X_train.columns)
-                    feat_importances = feat_importances.nlargest(20)
+                            #Feature Importance (Tree-based Models)
 
-
-
-
-                    ax_feat.set_ylabel('Features')
-                    ax_feat.set_title('Feature Importances')
-                    st.pyplot(fig_feat)
+                            importances = model.feature_importances_ # Assumed tree-based model
+                            feat_importances = pd.Series(importances, index=X_train.columns)
+                            feat_importances = feat_importances.nlargest(20)
 
+                            fig_feat, ax_feat = plt.subplots()
+                            feat_importances.plot(kind='barh', ax=ax_feat)
+                            ax_feat.set_xlabel('Relative Importance')
+                            ax_feat.set_ylabel('Features')
+                            ax_feat.set_title('Feature Importances')
+                            st.pyplot(fig_feat)
 
-
+                        #Create data that determines the learning and validation curve and what we have to add
                         train_sizes, train_scores, valid_scores = learning_curve(model, X_train_selected, y_train, cv=5, scoring='accuracy' if problem_type == "Classification" else 'neg_mean_squared_error', n_jobs=-1) #Define cross validation for run
 
-
+
+                        #Take and define what this is for from the results that has been generated
                         train_mean = np.mean(train_scores, axis=1)
                         train_std = np.std(train_scores, axis=1)
                         valid_mean = np.mean(valid_scores, axis=1)
                         valid_std = np.std(valid_scores, axis=1)
 
+                        #Plot each of the variables that has to be used.
+
                         fig_lc, ax_lc = plt.subplots()
                         ax_lc.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training ' + ('Accuracy' if problem_type == "Classification" else "Neg MSE"))
                         ax_lc.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
@@ -815,8 +816,12 @@ elif app_mode == "Model Training":
                         ax_lc.legend(loc='best')
                         st.pyplot(fig_lc)
 
-
-
+
+                    except Exception as e: #Local error
+                        st.write(f"Visuals are only available for tree based models or if models are selected prior: {e}") #Write only if error
+
+
+
 
             except Exception as e:
                 st.error(f"An error occurred: {e}")
@@ -856,7 +861,6 @@ elif app_mode == "Model Training":
             from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
             accuracy = accuracy_score(y_test, y_pred)
             st.write(f"Accuracy: {accuracy:.4f}")
-
         except Exception as e: #local error
             st.error(f"An error occurred during model evaluation: {e}")
 
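Notes — the sketches below are illustrative and not part of the commit.

The Random Forest branch now tunes hyperparameters with GridSearchCV whenever a param_grid exists in scope, then keeps the best estimator. A minimal standalone sketch of that pattern, assuming scikit-learn; the dataset and param_grid values are illustrative, not taken from app.py:

# Minimal sketch of the GridSearchCV pattern used in the Random Forest branch.
# The dataset and param_grid below are illustrative, not taken from app.py.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

X, y = make_classification(n_samples=300, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

param_grid = {"n_estimators": [100, 200], "max_depth": [None, 5]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           param_grid, cv=3, scoring="accuracy")
grid_search.fit(X_train, y_train)

model = grid_search.best_estimator_  # refit on the full training split by default
print("Best Parameters:", grid_search.best_params_)
print("Test accuracy:", model.score(X_test, y_test))

Guarding the tuning with 'param_grid' in locals(), as the commit does, silently skips the search when no grid is defined; an explicit flag or a default grid would make that path easier to follow.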
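For the confusion-matrix plot, seaborn's heatmap takes the target axes through the ax= keyword (passed positionally, the axes object would not be interpreted as the axes). A short self-contained sketch with illustrative labels:

# Confusion-matrix heatmap sketch; y_test and y_pred are illustrative.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

y_test = [0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 1, 1]

conf_matrix = confusion_matrix(y_test, y_pred)
fig_conf, ax_conf = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)
ax_conf.set_xlabel('Predicted Labels')
ax_conf.set_ylabel('True Labels')
ax_conf.set_title('Confusion Matrix')
plt.show()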
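The visualization block ranks model.feature_importances_ with pandas nlargest and draws a horizontal bar chart. A self-contained sketch of the same plot on synthetic data; the column names are illustrative:

# Feature-importance plot sketch for a tree-based model; data is synthetic.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=8, random_state=42)
X_train = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(8)])

model = RandomForestClassifier(random_state=42).fit(X_train, y)
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns).nlargest(20)

fig_feat, ax_feat = plt.subplots()
feat_importances.plot(kind='barh', ax=ax_feat)
ax_feat.set_xlabel('Relative Importance')
ax_feat.set_ylabel('Features')
ax_feat.set_title('Feature Importances')
plt.show()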
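The learning-curve section shades one standard deviation around the mean training and validation scores. A runnable sketch of that plot on synthetic data, assuming a classification problem so scoring='accuracy':

# Learning-curve plot sketch on synthetic classification data.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve

X, y = make_classification(n_samples=300, random_state=42)
train_sizes, train_scores, valid_scores = learning_curve(
    RandomForestClassifier(random_state=42), X, y, cv=5, scoring='accuracy', n_jobs=-1)

train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
valid_mean, valid_std = np.mean(valid_scores, axis=1), np.std(valid_scores, axis=1)

fig_lc, ax_lc = plt.subplots()
ax_lc.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training Accuracy')
ax_lc.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
ax_lc.plot(train_sizes, valid_mean, color='green', marker='s', markersize=5, label='Validation Accuracy')
ax_lc.fill_between(train_sizes, valid_mean + valid_std, valid_mean - valid_std, alpha=0.15, color='green')
ax_lc.set_xlabel('Training Set Size')
ax_lc.set_ylabel('Accuracy')
ax_lc.legend(loc='best')
plt.show()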