Spaces:

CosmickVisions
/

Data-Vision

Running

App Files Files Community

CosmickVisions committed on Feb 28

Commit

760d90d

verified ·

1 Parent(s): 130b052

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -43

app.py CHANGED Viewed

@@ -587,6 +587,7 @@ elif app_mode == "Advanced EDA":
                     except Exception as e:
                         st.error(f"An error occurred during the T-test: {e}")
 elif app_mode == "Model Training":
     st.title("🚂 Model Training")
@@ -610,6 +611,9 @@ elif app_mode == "Model Training":
         feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
         if model_name == "Random Forest" and feature_columns:  # Check if Random Forest and features are selected
             min_features = 1 # Ensure at least one feature is used
             max_features = len(feature_columns) if len(feature_columns) > 0 else 1 # Use 1 if no features are selected
@@ -669,43 +673,41 @@ elif app_mode == "Model Training":
                         X_train_selected = X_train_processed
                         X_test_selected = X_test_processed
-                    # Model Training and Hyperparameter Tuning
                     if model_name == "Linear Regression":
                         model = LinearRegression()
-                        model.fit(X_train_selected, y_train)
                     elif model_name == "Logistic Regression":
                         model = LogisticRegression(max_iter=1000)
-                        model.fit(X_train_selected, y_train)
                     elif model_name == "Decision Tree":
                         if problem_type == "Regression":
                             model = DecisionTreeRegressor()
-                            model.fit(X_train_selected, y_train)
                         else:
                             model = DecisionTreeClassifier()
-                            model.fit(X_train_selected, y_train)
                     elif model_name == "Random Forest":
-                        if problem_type == "Regression":
-                            model = RandomForestRegressor(random_state=42)
-                            if 'param_grid' in locals():
-                                grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')  # Example scoring
-                                grid_search.fit(X_train_selected, y_train)
-                                model = grid_search.best_estimator_
-                                st.write("Best Parameters:", grid_search.best_params_)
-                            else:
-                                model = RandomForestRegressor(random_state=42) #define if no param_grid
-                                model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
-                        else:
-                            model = RandomForestClassifier(random_state=42)
-                            if 'param_grid' in locals():
-                                grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
-                                grid_search.fit(X_train_selected, y_train)
-                                model = grid_search.best_estimator_
-                                st.write("Best Parameters:", grid_search.best_params_)
-                            else:
-                                 model = RandomForestClassifier(random_state=42) #define if no param_grid
-                                 model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined
                     elif model_name == "Gradient Boosting":
                         from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
                         model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
@@ -724,6 +726,10 @@ elif app_mode == "Model Training":
                     # Store model and preprocessor
                     st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
                     st.session_state.preprocessor = preprocessor
                     # Model Evaluation
@@ -762,7 +768,7 @@ elif app_mode == "Model Training":
                         #Heatmap
                         fig_conf, ax_conf = plt.subplots()
-                        sns.heatmap(conf_matrix, ax=ax_conf, annot=True, fmt='d', cmap='Blues')
                         ax_conf.set_xlabel('Predicted Labels')
                         ax_conf.set_ylabel('True Labels')
                         ax_conf.set_title('Confusion Matrix')
@@ -813,6 +819,7 @@ elif app_mode == "Model Training":
                 except Exception as e:
                     st.error(f"An error occurred: {e}")
     else:
         st.write("Please upload and clean data first.")
@@ -848,18 +855,16 @@ elif app_mode == "Model Training":
                      from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
                      accuracy = accuracy_score(y_test, y_pred)
                      st.write(f"Accuracy: {accuracy:.4f}")
             except Exception as e: #local error
                  st.error(f"An error occurred during model evaluation: {e}")
 elif app_mode == "Predictions":
     st.title("🔮 Make Predictions")
     if st.session_state.model is not None and st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data.copy()
-        # Input data for prediction
-        st.subheader("Enter Data for Prediction")
-        input_data = {}
         try:
             numeric_transformer_columns = st.session_state.model.steps[0][1].transformers_[0][2] if hasattr(st.session_state.model.steps[0][1].transformers_[0][2], '__len__') else []
@@ -868,33 +873,58 @@ elif app_mode == "Predictions":
         except AttributeError as e:
             st.error(f"Error accessing model transformers: {e}. Please ensure a valid model is trained and loaded.")
             st.stop()
         if not set(model_columns).issubset(set(df.columns)): #Fixed comparison
             st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
             st.stop()
         for col in model_columns:
             if pd.api.types.is_numeric_dtype(df[col]):
                 input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
             else:
                 input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
-        # Prediction Button
         if st.button("Make Prediction"):
             try:
                 input_df = pd.DataFrame([input_data])
-                prediction = st.session_state.model.predict(input_df)[0]
                 st.subheader("Prediction Result")
                 st.write(f"The predicted value is: {prediction}")
-                # Additional Feedback (Example for Classification)
-                if isinstance(st.session_state.model.steps[-1][1], LogisticRegression):
                     probabilities = st.session_state.model.predict_proba(input_df)[0]
                     st.write("Predicted Probabilities:")
-                    st.write(probabilities)
             except Exception as e:
-                st.error(f"An error occurred during prediction: {e}")
         #Add batch prediction section in prediction tab
         st.subheader("Batch Predictions")
@@ -902,11 +932,36 @@ elif app_mode == "Predictions":
         if batch_file is not None:
             try:
                 batch_df = pd.read_csv(batch_file)
                 # Preprocess the batch data
-                batch_processed = st.session_state.preprocessor.transform(batch_df)
                 # Make predictions
                 batch_predictions = st.session_state.model.predict(batch_processed)
                 batch_df['Prediction'] = batch_predictions
                 st.dataframe(batch_df)
              # Download predictions

                     except Exception as e:
                         st.error(f"An error occurred during the T-test: {e}")
+#MODEL TRAINING
 elif app_mode == "Model Training":
     st.title("🚂 Model Training")
         feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
+        # Hyperparameter tuning
+        tuning_method = st.selectbox("Hyperparameter Tuning Method",["Grid Search","Bayesian Optimization","None"])
         if model_name == "Random Forest" and feature_columns:  # Check if Random Forest and features are selected
             min_features = 1 # Ensure at least one feature is used
             max_features = len(feature_columns) if len(feature_columns) > 0 else 1 # Use 1 if no features are selected
                         X_train_selected = X_train_processed
                         X_test_selected = X_test_processed
+                     # Model Training and Hyperparameter Tuning
                     if model_name == "Linear Regression":
                         model = LinearRegression()
                     elif model_name == "Logistic Regression":
                         model = LogisticRegression(max_iter=1000)
                     elif model_name == "Decision Tree":
                         if problem_type == "Regression":
                             model = DecisionTreeRegressor()
                         else:
                             model = DecisionTreeClassifier()
                     elif model_name == "Random Forest":
+                         if tuning_method == "Bayesian Optimization":
+                             st.write("Implementing this function to be added soon")
+                         elif problem_type == "Regression":
+                             model = RandomForestRegressor(random_state=42)
+                             if 'param_grid' in locals():
+                                 grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')  # Example scoring
+                                 grid_search.fit(X_train_selected, y_train)
+                                 model = grid_search.best_estimator_
+                                 st.write("Best Parameters:", grid_search.best_params_)
+                             else:
+                                 model = RandomForestRegressor(random_state=42) #define if no param_grid
+                                 model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
+                         else:
+                             model = RandomForestClassifier(random_state=42)
+                             if 'param_grid' in locals():
+                                 grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
+                                 grid_search.fit(X_train_selected, y_train)
+                                 model = grid_search.best_estimator_
+                                 st.write("Best Parameters:", grid_search.best_params_)
+                             else:
+                                  model = RandomForestClassifier(random_state=42) #define if no param_grid
+                                  model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined
                     elif model_name == "Gradient Boosting":
                         from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
                         model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
                     # Store model and preprocessor
                     st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
+                    st.session_state.preprocessor = preprocessor
+                     # Store model and preprocessor
+                    st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
                     st.session_state.preprocessor = preprocessor
                     # Model Evaluation
                         #Heatmap
                         fig_conf, ax_conf = plt.subplots()
+                        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax_conf)
                         ax_conf.set_xlabel('Predicted Labels')
                         ax_conf.set_ylabel('True Labels')
                         ax_conf.set_title('Confusion Matrix')
                 except Exception as e:
                     st.error(f"An error occurred: {e}")
     else:
         st.write("Please upload and clean data first.")
                      from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
                      accuracy = accuracy_score(y_test, y_pred)
                      st.write(f"Accuracy: {accuracy:.4f}")
             except Exception as e: #local error
                  st.error(f"An error occurred during model evaluation: {e}")
 elif app_mode == "Predictions":
     st.title("🔮 Make Predictions")
     if st.session_state.model is not None and st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data.copy()
+        model = st.session_state.model.steps[-1][1] #Define model from the state
         try:
             numeric_transformer_columns = st.session_state.model.steps[0][1].transformers_[0][2] if hasattr(st.session_state.model.steps[0][1].transformers_[0][2], '__len__') else []
         except AttributeError as e:
             st.error(f"Error accessing model transformers: {e}. Please ensure a valid model is trained and loaded.")
             st.stop()
+        model_is_classification = hasattr(model, 'predict_proba')  # Check for classification or other problem
         if not set(model_columns).issubset(set(df.columns)): #Fixed comparison
             st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
             st.stop()
+        input_data = {}
+        st.subheader("Enter Data for Prediction")
         for col in model_columns:
             if pd.api.types.is_numeric_dtype(df[col]):
                 input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
             else:
                 input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
+        # Make Prediction Button
         if st.button("Make Prediction"):
             try:
                 input_df = pd.DataFrame([input_data])
+                #Preprocess for model
+                input_processed = st.session_state.preprocessor.transform(input_df)
+                prediction = st.session_state.model.predict(input_processed)[0]
                 st.subheader("Prediction Result")
                 st.write(f"The predicted value is: {prediction}")
+                # Show shap values chart
+                show_shap_values = st.checkbox("View SHAP Explanation") #select model to show shap values
+                if show_shap_values and model_is_classification and model_name not in ["Linear Regression","Logistic Regression","SVM","Naive Bayes", "KNN"]:#Show shap values if this can perform.
+                    try:
+                        import shap #Import lib
+                        explainer = shap.TreeExplainer(st.session_state.model.steps[-1][1]) #Used tree model because these are easily visualized
+                        shap_values = explainer.shap_values(input_processed) #Get output of each values, only used in tree models
+                        st.subheader("SHAP Values")
+                        #Plot for each of the different class labels.
+                        shap.initjs()
+                        fig_shap, ax_shap = plt.subplots(1, figsize = (10,10))
+                        shap.summary_plot(shap_values, features = X_train, feature_names = feature_columns, plot_type = "bar")#plot for multi class labels
+                        st.pyplot(fig_shap) #Show the figure
+                    except Exception as e:
+                        st.write(f"Can show shap values on tree based model: {e}") #Show error
+                 # Additional Feedback (Example for Classification)
+                if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'): #If the end variable has predict_proba and is therefore a predictor
                     probabilities = st.session_state.model.predict_proba(input_df)[0]
                     st.write("Predicted Probabilities:")
+                    st.write(probabilities) #write here
             except Exception as e:
+                st.error(f"An error occurred during prediction: {e}") #Base case error
         #Add batch prediction section in prediction tab
         st.subheader("Batch Predictions")
         if batch_file is not None:
             try:
                 batch_df = pd.read_csv(batch_file)
+                 # Verify that each model column in the batch matches the dtype of the corresponding column in the cleaned dataset
+                for col in model_columns:
+                    if pd.api.types.is_numeric_dtype(df[col]):
+                        try:
+                            batch_df[col] = pd.to_numeric(batch_df[col], errors='raise')
+                        except ValueError:
+                            st.error(f"Column '{col}' must be numeric.")
+                            st.stop()
+                    else:
+                        # Non-numeric column: cast the batch values to string
+                        batch_df[col] = batch_df[col].astype(str)
+                if not set(model_columns).issubset(set(batch_df.columns)): #Fixed comparison
+                    st.error("The batch dataframe that contains different columns than the currently used training dataframe. Please upload the correct dataframe.")
+                    st.stop()
                 # Preprocess the batch data
+                batch_processed = st.session_state.preprocessor.transform(batch_df[model_columns])
                 # Make predictions
                 batch_predictions = st.session_state.model.predict(batch_processed)
                 batch_df['Prediction'] = batch_predictions
+                 #Add probability output if that function is available.
+                if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'):
+                   batch_probabilities = st.session_state.model.predict_proba(batch_processed)
+                   for i in range(batch_probabilities.shape[1]): #Loop through and give each probability
+                        batch_df[f'Probability_Class_{i}'] = batch_probabilities[:, i]
                 st.dataframe(batch_df)
              # Download predictions