Spaces:

CosmickVisions
/

Data-Vision

Running

App Files Community

CosmickVisions committed on Feb 28

Commit

d429dc4

verified ·

1 Parent(s): 8d6760a

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -24

app.py CHANGED Viewed

@@ -594,6 +594,14 @@ elif app_mode == "Model Training":
     if st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data.copy()
         # Target Variable Selection
         target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
@@ -613,30 +621,35 @@ elif app_mode == "Model Training":
         feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
-         # Hyperparameter Tuning - Dynamic based on Model Selection
         param_grid = {}  # Initialize to empty dictionary
         if model_name == "Random Forest":
             st.subheader("Random Forest Hyperparameters")
             param_grid = {
-                'n_estimators': list(range(st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest.", key="n_estimators"),(st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest.", key = "n_estimators2")+1))),
-                'max_depth': list(range(st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.", key="max_depth1"),(st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.", key = "max_depth2")+1))),
-                'min_samples_split': list(range(st.slider("Minimum Samples Split", 2, 10, 2, help="Minimum samples required to split an internal node", key="min_samples_split1"),(st.slider("Minimum Samples Split", 2, 10, 2, help="Minimum samples required to split an internal node", key = "min_samples_split2")+1))), #New hyperparameter
-                'min_samples_leaf': list(range(st.slider("Minimum Samples Leaf", 1, 10, 1, help="Minimum samples required to be at a leaf node", key="min_samples_leaf1"),(st.slider("Minimum Samples Leaf", 1, 10, 1, help="Minimum samples required to be at a leaf node", key = "min_samples_leaf2")+1))), #New hyperparameter
             }
         elif model_name == "Gradient Boosting":
-             st.subheader("Gradient Boosting Hyperparameters")
-             param_grid = {
-                'n_estimators': list(range(st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest.", key="gb_n_estimators1"),(st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest.", key = "gb_n_estimators2")+1))),
-                'learning_rate': [st.slider("Learning Rate", 0.01, 1.0, 0.1, step=0.01, help="Learning rate", key = 'gb_learning_rate')],  # Example, add more
-                'max_depth': list(range(st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.", key="gb_max_depth1"),(st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.", key = "gb_max_depth2")+1))),
             }
         elif model_name == "Decision Tree":
             st.subheader("Decision Tree Hyperparameters")
             param_grid = {
-                'criterion': st.selectbox("Criterion", ["gini", "entropy"], help="Splitting criterion"),
-                'max_depth': list(range(st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.", key="dt_max_depth1"),(st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.", key = "dt_max_depth2")+1))),
             }
          # Train-Test Split
@@ -691,13 +704,18 @@ elif app_mode == "Model Training":
                     # Model Training and Hyperparameter Tuning
                     if model_name == "Linear Regression":
                         model = LinearRegression()
                     elif model_name == "Logistic Regression":
                         model = LogisticRegression(max_iter=1000)
                     elif model_name == "Decision Tree":
                         if problem_type == "Regression":
                             model = DecisionTreeRegressor()
                         else:
                             model = DecisionTreeClassifier()
                     elif model_name == "Random Forest":
                         if problem_type == "Regression":
                             model = RandomForestRegressor(random_state=42)
@@ -707,6 +725,7 @@ elif app_mode == "Model Training":
                                 model = grid_search.best_estimator_
                                 st.write("Best Parameters:", grid_search.best_params_)
                             else:
                                 model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
                         else:
@@ -739,6 +758,12 @@ elif app_mode == "Model Training":
                     st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
                     st.session_state.preprocessor = preprocessor
                     # Model Evaluation
                     y_pred = model.predict(X_test_selected)
                     if problem_type == "Regression":
@@ -773,9 +798,9 @@ elif app_mode == "Model Training":
                         conf_matrix = confusion_matrix(y_test, y_pred)
-                        # Assuming conf_matrix is your confusion matrix
                         fig_conf, ax_conf = plt.subplots()
-                        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)  # Corrected named argument
                         ax_conf.set_xlabel('Predicted Labels')
                         ax_conf.set_ylabel('True Labels')
                         ax_conf.set_title('Confusion Matrix')
@@ -788,7 +813,7 @@ elif app_mode == "Model Training":
                     if st.session_state.model is not None: #Make sure that everything only executes when data set is trained and not outside of it.
                          try: #All the plotting code here.
                             if model_name in ["Random Forest", "Gradient Boosting"]:#Used list to define models.
-                                   #Make sure you use this inside of a conditional for classification, model, and tree based model.
                                     #Feature Importance (Tree-based Models)
@@ -806,7 +831,7 @@ elif app_mode == "Model Training":
                             #Create data that determines the learning and validation curve and what we have to add
                             train_sizes, train_scores, valid_scores = learning_curve(model, X_train_selected, y_train, cv=5, scoring='accuracy' if problem_type =="Classification" else 'neg_mean_squared_error', n_jobs=-1) #Define cross validation for run
-                            #Take and define what this is for from the results that has been generated
                             train_mean = np.mean(train_scores, axis=1)
                             train_std = np.std(train_scores, axis=1)
                             valid_mean = np.mean(valid_scores, axis=1)
@@ -852,20 +877,19 @@ elif app_mode == "Model Training":
             except Exception as e:
                 st.error(f"Error loading model: {e}")
-       #Model Evaluation Section
-        if 'X_test' in locals() and st.session_state.model is not None:
-            try: #Error catching with new test data
-                y_pred = st.session_state.model.predict(X_test)
                 if problem_type == "Regression":
-                    mse = mean_squared_error(y_test, y_pred)
-                    r2 = r2_score(y_test, y_pred)
                     st.write(f"Mean Squared Error: {mse:.4f}")
                     st.write(f"R-squared: {r2:.4f}")
                 else:
                      from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
-                     accuracy = accuracy_score(y_test, y_pred)
                      st.write(f"Accuracy: {accuracy:.4f}")
             except Exception as e: #local error
                  st.error(f"An error occurred during model evaluation: {e}")

     if st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data.copy()
+        # Initialize session state for train/test split
+        if 'X_train_selected' not in st.session_state:
+            st.session_state.X_train_selected = None
+            st.session_state.X_test_selected = None
+            st.session_state.y_train = None
+            st.session_state.y_test = None
+            st.session_state.model = None  # Initialize model in session state
         # Target Variable Selection
         target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
         feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
+        # Hyperparameter Tuning - Dynamic based on Model Selection
         param_grid = {}  # Initialize to empty dictionary
+        #Define fixed parameter values for each model so that training works. These values are not optimized.
+        #The goal is to make sure that all visualizations and graphs work as is.
         if model_name == "Random Forest":
             st.subheader("Random Forest Hyperparameters")
             param_grid = {
+                'n_estimators': list(range(100, 101)), #Used 100 so model is trained and not empty and all visuals work
+                'max_depth': list(range(10,11)), #default value 10 so its in model
+                'min_samples_split': list(range(2,3)), #New hyperparameter default 2
+                'min_samples_leaf': list(range(1,2)), #New hyperparameter default 1
             }
         elif model_name == "Gradient Boosting":
+            st.subheader("Gradient Boosting Hyperparameters")
+            param_grid = {
+                'n_estimators': list(range(100, 101)),
+                'learning_rate': [0.1],
+                'max_depth': list(range(3,4))
             }
         elif model_name == "Decision Tree":
             st.subheader("Decision Tree Hyperparameters")
             param_grid = {
+                'criterion': ["gini"],
+                'max_depth': list(range(3,4)),
             }
          # Train-Test Split
                     # Model Training and Hyperparameter Tuning
                     if model_name == "Linear Regression":
                         model = LinearRegression()
+                        model.fit(X_train_selected, y_train)
                     elif model_name == "Logistic Regression":
                         model = LogisticRegression(max_iter=1000)
+                        model.fit(X_train_selected, y_train)
                     elif model_name == "Decision Tree":
                         if problem_type == "Regression":
                             model = DecisionTreeRegressor()
+                            model.fit(X_train_selected, y_train)
                         else:
                             model = DecisionTreeClassifier()
+                            model.fit(X_train_selected, y_train)
                     elif model_name == "Random Forest":
                         if problem_type == "Regression":
                             model = RandomForestRegressor(random_state=42)
                                 model = grid_search.best_estimator_
                                 st.write("Best Parameters:", grid_search.best_params_)
                             else:
+                                model = RandomForestRegressor(random_state=42) #define if no param_grid
                                 model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
                         else:
                     st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
                     st.session_state.preprocessor = preprocessor
+                    #Store the test data
+                    st.session_state.X_train_selected = X_train_selected
+                    st.session_state.X_test_selected = X_test_selected
+                    st.session_state.y_train = y_train
+                    st.session_state.y_test = y_test
                     # Model Evaluation
                     y_pred = model.predict(X_test_selected)
                     if problem_type == "Regression":
                         conf_matrix = confusion_matrix(y_test, y_pred)
+                        #Heatmap
                         fig_conf, ax_conf = plt.subplots()
+                        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)
                         ax_conf.set_xlabel('Predicted Labels')
                         ax_conf.set_ylabel('True Labels')
                         ax_conf.set_title('Confusion Matrix')
                     if st.session_state.model is not None: #Make sure that everything only executes when data set is trained and not outside of it.
                          try: #All the plotting code here.
                             if model_name in ["Random Forest", "Gradient Boosting"]:#Used list to define models.
+                                 #Make sure you use this inside of a conditional for classification, model, and tree based model.
                                     #Feature Importance (Tree-based Models)
                             #Create data that determines the learning and validation curve and what we have to add
                             train_sizes, train_scores, valid_scores = learning_curve(model, X_train_selected, y_train, cv=5, scoring='accuracy' if problem_type =="Classification" else 'neg_mean_squared_error', n_jobs=-1) #Define cross validation for run
+                            #Then add a plot for the learning curve and use st.pyplot
                             train_mean = np.mean(train_scores, axis=1)
                             train_std = np.std(train_scores, axis=1)
                             valid_mean = np.mean(valid_scores, axis=1)
             except Exception as e:
                 st.error(f"Error loading model: {e}")
+       #Model Evaluation Section - run on the saved model
+        if  st.session_state.model is not None and st.session_state.X_test_selected is not None: # added check to make sure it is a loaded model
+            try:
+                y_pred = st.session_state.model.predict(st.session_state.X_test_selected) # load from stored
                 if problem_type == "Regression":
+                    mse = mean_squared_error(st.session_state.y_test, y_pred)
+                    r2 = r2_score(st.session_state.y_test, y_pred)
                     st.write(f"Mean Squared Error: {mse:.4f}")
                     st.write(f"R-squared: {r2:.4f}")
                 else:
                      from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
+                     accuracy = accuracy_score(st.session_state.y_test, y_pred)
                      st.write(f"Accuracy: {accuracy:.4f}")
             except Exception as e: #local error
                  st.error(f"An error occurred during model evaluation: {e}")