CosmickVisions committed on
Commit
760d90d
·
verified ·
1 Parent(s): 130b052

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -43
app.py CHANGED
@@ -587,6 +587,7 @@ elif app_mode == "Advanced EDA":
587
  except Exception as e:
588
  st.error(f"An error occurred during the T-test: {e}")
589
 
 
590
  elif app_mode == "Model Training":
591
  st.title("🚂 Model Training")
592
 
@@ -610,6 +611,9 @@ elif app_mode == "Model Training":
610
 
611
  feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
612
 
 
 
 
613
  if model_name == "Random Forest" and feature_columns: # Check if Random Forest and features are selected
614
  min_features = 1 # Ensure at least one feature is used
615
  max_features = len(feature_columns) if len(feature_columns) > 0 else 1 # Use 1 if no features are selected
@@ -669,43 +673,41 @@ elif app_mode == "Model Training":
669
  X_train_selected = X_train_processed
670
  X_test_selected = X_test_processed
671
 
672
- # Model Training and Hyperparameter Tuning
673
  if model_name == "Linear Regression":
674
  model = LinearRegression()
675
- model.fit(X_train_selected, y_train)
676
-
677
  elif model_name == "Logistic Regression":
678
  model = LogisticRegression(max_iter=1000)
679
- model.fit(X_train_selected, y_train)
680
  elif model_name == "Decision Tree":
681
  if problem_type == "Regression":
682
  model = DecisionTreeRegressor()
683
- model.fit(X_train_selected, y_train)
684
  else:
685
  model = DecisionTreeClassifier()
686
- model.fit(X_train_selected, y_train)
687
  elif model_name == "Random Forest":
688
- if problem_type == "Regression":
689
- model = RandomForestRegressor(random_state=42)
690
- if 'param_grid' in locals():
691
- grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error') # Example scoring
692
- grid_search.fit(X_train_selected, y_train)
693
- model = grid_search.best_estimator_
694
- st.write("Best Parameters:", grid_search.best_params_)
695
- else:
696
- model = RandomForestRegressor(random_state=42) #define if no param_grid
697
- model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
698
-
699
- else:
700
- model = RandomForestClassifier(random_state=42)
701
- if 'param_grid' in locals():
702
- grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
703
- grid_search.fit(X_train_selected, y_train)
704
- model = grid_search.best_estimator_
705
- st.write("Best Parameters:", grid_search.best_params_)
706
- else:
707
- model = RandomForestClassifier(random_state=42) #define if no param_grid
708
- model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined
 
 
 
709
  elif model_name == "Gradient Boosting":
710
  from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
711
  model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
@@ -724,6 +726,10 @@ elif app_mode == "Model Training":
724
 
725
  # Store model and preprocessor
726
  st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
 
 
 
 
727
  st.session_state.preprocessor = preprocessor
728
 
729
  # Model Evaluation
@@ -762,7 +768,7 @@ elif app_mode == "Model Training":
762
 
763
  #Heatmap
764
  fig_conf, ax_conf = plt.subplots()
765
- sns.heatmap(conf_matrix, ax=ax_conf, annot=True, fmt='d', cmap='Blues')
766
  ax_conf.set_xlabel('Predicted Labels')
767
  ax_conf.set_ylabel('True Labels')
768
  ax_conf.set_title('Confusion Matrix')
@@ -813,6 +819,7 @@ elif app_mode == "Model Training":
813
 
814
  except Exception as e:
815
  st.error(f"An error occurred: {e}")
 
816
  else:
817
  st.write("Please upload and clean data first.")
818
 
@@ -848,18 +855,16 @@ elif app_mode == "Model Training":
848
  from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
849
  accuracy = accuracy_score(y_test, y_pred)
850
  st.write(f"Accuracy: {accuracy:.4f}")
 
851
  except Exception as e: #local error
852
  st.error(f"An error occurred during model evaluation: {e}")
853
-
854
  elif app_mode == "Predictions":
855
  st.title("🔮 Make Predictions")
856
 
857
  if st.session_state.model is not None and st.session_state.cleaned_data is not None:
858
  df = st.session_state.cleaned_data.copy()
859
-
860
- # Input data for prediction
861
- st.subheader("Enter Data for Prediction")
862
- input_data = {}
863
 
864
  try:
865
  numeric_transformer_columns = st.session_state.model.steps[0][1].transformers_[0][2] if hasattr(st.session_state.model.steps[0][1].transformers_[0][2], '__len__') else []
@@ -868,33 +873,58 @@ elif app_mode == "Predictions":
868
  except AttributeError as e:
869
  st.error(f"Error accessing model transformers: {e}. Please ensure a valid model is trained and loaded.")
870
  st.stop()
871
-
 
872
  if not set(model_columns).issubset(set(df.columns)): #Fixed comparison
873
  st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
874
  st.stop()
875
-
 
 
876
  for col in model_columns:
877
  if pd.api.types.is_numeric_dtype(df[col]):
878
  input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
879
  else:
880
  input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
881
 
882
- # Prediction Button
883
  if st.button("Make Prediction"):
884
  try:
885
  input_df = pd.DataFrame([input_data])
886
- prediction = st.session_state.model.predict(input_df)[0]
 
 
887
  st.subheader("Prediction Result")
888
  st.write(f"The predicted value is: {prediction}")
889
 
890
- # Additional Feedback (Example for Classification)
891
- if isinstance(st.session_state.model.steps[-1][1], LogisticRegression):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
892
  probabilities = st.session_state.model.predict_proba(input_df)[0]
893
  st.write("Predicted Probabilities:")
894
- st.write(probabilities)
895
-
896
  except Exception as e:
897
- st.error(f"An error occurred during prediction: {e}")
898
 
899
  #Add batch prediction section in prediction tab
900
  st.subheader("Batch Predictions")
@@ -902,11 +932,36 @@ elif app_mode == "Predictions":
902
  if batch_file is not None:
903
  try:
904
  batch_df = pd.read_csv(batch_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
905
  # Preprocess the batch data
906
- batch_processed = st.session_state.preprocessor.transform(batch_df)
907
  # Make predictions
908
  batch_predictions = st.session_state.model.predict(batch_processed)
909
  batch_df['Prediction'] = batch_predictions
 
 
 
 
 
 
 
 
 
910
  st.dataframe(batch_df)
911
 
912
  # Download predictions
 
587
  except Exception as e:
588
  st.error(f"An error occurred during the T-test: {e}")
589
 
590
+ #MODEL TRAINING
591
  elif app_mode == "Model Training":
592
  st.title("🚂 Model Training")
593
 
 
611
 
612
  feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
613
 
614
+ # Hyperparameter tuning
615
+ tuning_method = st.selectbox("Hyperparameter Tuning Method",["Grid Search","Bayesian Optimization","None"])
616
+
617
  if model_name == "Random Forest" and feature_columns: # Check if Random Forest and features are selected
618
  min_features = 1 # Ensure at least one feature is used
619
  max_features = len(feature_columns) if len(feature_columns) > 0 else 1 # Use 1 if no features are selected
 
673
  X_train_selected = X_train_processed
674
  X_test_selected = X_test_processed
675
 
676
+ # Model Training and Hyperparameter Tuning
677
  if model_name == "Linear Regression":
678
  model = LinearRegression()
 
 
679
  elif model_name == "Logistic Regression":
680
  model = LogisticRegression(max_iter=1000)
 
681
  elif model_name == "Decision Tree":
682
  if problem_type == "Regression":
683
  model = DecisionTreeRegressor()
 
684
  else:
685
  model = DecisionTreeClassifier()
 
686
  elif model_name == "Random Forest":
687
+ if tuning_method == "Bayesian Optimization":
688
+ st.write("Implementing this function to be added soon")
689
+ elif problem_type == "Regression":
690
+ model = RandomForestRegressor(random_state=42)
691
+ if 'param_grid' in locals():
692
+ grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error') # Example scoring
693
+ grid_search.fit(X_train_selected, y_train)
694
+ model = grid_search.best_estimator_
695
+ st.write("Best Parameters:", grid_search.best_params_)
696
+ else:
697
+ model = RandomForestRegressor(random_state=42) #define if no param_grid
698
+ model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
699
+
700
+
701
+ else:
702
+ model = RandomForestClassifier(random_state=42)
703
+ if 'param_grid' in locals():
704
+ grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
705
+ grid_search.fit(X_train_selected, y_train)
706
+ model = grid_search.best_estimator_
707
+ st.write("Best Parameters:", grid_search.best_params_)
708
+ else:
709
+ model = RandomForestClassifier(random_state=42) #define if no param_grid
710
+ model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined
711
  elif model_name == "Gradient Boosting":
712
  from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
713
  model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
 
726
 
727
  # Store model and preprocessor
728
  st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
729
+ st.session_state.preprocessor = preprocessor
730
+
731
+ # Store model and preprocessor
732
+ st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
733
  st.session_state.preprocessor = preprocessor
734
 
735
  # Model Evaluation
 
768
 
769
  #Heatmap
770
  fig_conf, ax_conf = plt.subplots()
771
+ sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)  # ax must be a keyword: a positional argument after keyword arguments is a SyntaxError
772
  ax_conf.set_xlabel('Predicted Labels')
773
  ax_conf.set_ylabel('True Labels')
774
  ax_conf.set_title('Confusion Matrix')
 
819
 
820
  except Exception as e:
821
  st.error(f"An error occurred: {e}")
822
+
823
  else:
824
  st.write("Please upload and clean data first.")
825
 
 
855
  from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
856
  accuracy = accuracy_score(y_test, y_pred)
857
  st.write(f"Accuracy: {accuracy:.4f}")
858
+
859
  except Exception as e: #local error
860
  st.error(f"An error occurred during model evaluation: {e}")
861
+
862
  elif app_mode == "Predictions":
863
  st.title("🔮 Make Predictions")
864
 
865
  if st.session_state.model is not None and st.session_state.cleaned_data is not None:
866
  df = st.session_state.cleaned_data.copy()
867
+ model = st.session_state.model.steps[-1][1] #Define model from the state
 
 
 
868
 
869
  try:
870
  numeric_transformer_columns = st.session_state.model.steps[0][1].transformers_[0][2] if hasattr(st.session_state.model.steps[0][1].transformers_[0][2], '__len__') else []
 
873
  except AttributeError as e:
874
  st.error(f"Error accessing model transformers: {e}. Please ensure a valid model is trained and loaded.")
875
  st.stop()
876
+
877
+ model_is_classification = hasattr(model, 'predict_proba') # Check for classification or other problem
878
  if not set(model_columns).issubset(set(df.columns)): #Fixed comparison
879
  st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
880
  st.stop()
881
+
882
+ input_data = {}
883
+ st.subheader("Enter Data for Prediction")
884
  for col in model_columns:
885
  if pd.api.types.is_numeric_dtype(df[col]):
886
  input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
887
  else:
888
  input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
889
 
890
+ # Make Prediction Button
891
  if st.button("Make Prediction"):
892
  try:
893
  input_df = pd.DataFrame([input_data])
894
+ #Preprocess for model
895
+ input_processed = st.session_state.preprocessor.transform(input_df)
896
+ prediction = st.session_state.model.predict(input_processed)[0]
897
  st.subheader("Prediction Result")
898
  st.write(f"The predicted value is: {prediction}")
899
 
900
+ # Show shap values chart
901
+ show_shap_values = st.checkbox("View SHAP Explanation") #select model to show shap values
902
+
903
+
904
+ if show_shap_values and model_is_classification and model_name not in ["Linear Regression","Logistic Regression","SVM","Naive Bayes", "KNN"]:#Show shap values if this can perform.
905
+
906
+ try:
907
+ import shap #Import lib
908
+ explainer = shap.TreeExplainer(st.session_state.model.steps[-1][1]) #Used tree model because these are easily visualized
909
+
910
+ shap_values = explainer.shap_values(input_processed) #Get output of each values, only used in tree models
911
+
912
+ st.subheader("SHAP Values")
913
+ #Plot for each of the different class labels.
914
+
915
+ shap.initjs()
916
+ fig_shap, ax_shap = plt.subplots(1, figsize = (10,10))
917
+ shap.summary_plot(shap_values, features = X_train, feature_names = feature_columns, plot_type = "bar")#plot for multi class labels
918
+ st.pyplot(fig_shap) #Show the figure
919
+ except Exception as e:
920
+ st.write(f"Can show shap values on tree based model: {e}") #Show error
921
+ # Additional Feedback (Example for Classification)
922
+ if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'): #If the end variable has predict_proba and is therefore a predictor
923
  probabilities = st.session_state.model.predict_proba(input_df)[0]
924
  st.write("Predicted Probabilities:")
925
+ st.write(probabilities) #write here
 
926
  except Exception as e:
927
+ st.error(f"An error occurred during prediction: {e}") #Base case error
928
 
929
  #Add batch prediction section in prediction tab
930
  st.subheader("Batch Predictions")
 
932
  if batch_file is not None:
933
  try:
934
  batch_df = pd.read_csv(batch_file)
935
+ # Check each model column against the cleaned data's dtype: numeric columns must parse as numbers
936
+ for col in model_columns:
937
+ if pd.api.types.is_numeric_dtype(df[col]):
938
+ try:
939
+ batch_df[col] = pd.to_numeric(batch_df[col], errors='raise')
940
+ except ValueError:
941
+ st.error(f"Column '{col}' must be numeric.")
942
+ st.stop()
943
+ else:
944
+ # Cast non-numeric columns to string
945
+ batch_df[col] = batch_df[col].astype(str)
946
+
947
+ if not set(model_columns).issubset(set(batch_df.columns)): #Fixed comparison
948
+ st.error("The batch dataframe that contains different columns than the currently used training dataframe. Please upload the correct dataframe.")
949
+ st.stop()
950
+
951
  # Preprocess the batch data
952
+ batch_processed = st.session_state.preprocessor.transform(batch_df[model_columns])
953
  # Make predictions
954
  batch_predictions = st.session_state.model.predict(batch_processed)
955
  batch_df['Prediction'] = batch_predictions
956
+
957
+ #Add probability output if that function is available.
958
+ if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'):
959
+ batch_probabilities = st.session_state.model.predict_proba(batch_processed)
960
+ for i in range(batch_probabilities.shape[1]): #Loop through and give each probability
961
+ batch_df[f'Probability_Class_{i}'] = batch_probabilities[:, i]
962
+
963
+
964
+
965
  st.dataframe(batch_df)
966
 
967
  # Download predictions