Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -587,6 +587,7 @@ elif app_mode == "Advanced EDA":
|
|
587 |
except Exception as e:
|
588 |
st.error(f"An error occurred during the T-test: {e}")
|
589 |
|
|
|
590 |
elif app_mode == "Model Training":
|
591 |
st.title("🚂 Model Training")
|
592 |
|
@@ -610,6 +611,9 @@ elif app_mode == "Model Training":
|
|
610 |
|
611 |
feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
|
612 |
|
|
|
|
|
|
|
613 |
if model_name == "Random Forest" and feature_columns: # Check if Random Forest and features are selected
|
614 |
min_features = 1 # Ensure at least one feature is used
|
615 |
max_features = len(feature_columns) if len(feature_columns) > 0 else 1 # Use 1 if no features are selected
|
@@ -669,43 +673,41 @@ elif app_mode == "Model Training":
|
|
669 |
X_train_selected = X_train_processed
|
670 |
X_test_selected = X_test_processed
|
671 |
|
672 |
-
|
673 |
if model_name == "Linear Regression":
|
674 |
model = LinearRegression()
|
675 |
-
model.fit(X_train_selected, y_train)
|
676 |
-
|
677 |
elif model_name == "Logistic Regression":
|
678 |
model = LogisticRegression(max_iter=1000)
|
679 |
-
model.fit(X_train_selected, y_train)
|
680 |
elif model_name == "Decision Tree":
|
681 |
if problem_type == "Regression":
|
682 |
model = DecisionTreeRegressor()
|
683 |
-
model.fit(X_train_selected, y_train)
|
684 |
else:
|
685 |
model = DecisionTreeClassifier()
|
686 |
-
model.fit(X_train_selected, y_train)
|
687 |
elif model_name == "Random Forest":
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
model =
|
708 |
-
|
|
|
|
|
|
|
709 |
elif model_name == "Gradient Boosting":
|
710 |
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
|
711 |
model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
|
@@ -724,6 +726,10 @@ elif app_mode == "Model Training":
|
|
724 |
|
725 |
# Store model and preprocessor
|
726 |
st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
|
|
|
|
|
|
|
|
|
727 |
st.session_state.preprocessor = preprocessor
|
728 |
|
729 |
# Model Evaluation
|
@@ -762,7 +768,7 @@ elif app_mode == "Model Training":
|
|
762 |
|
763 |
#Heatmap
|
764 |
fig_conf, ax_conf = plt.subplots()
|
765 |
-
sns.heatmap(conf_matrix,
|
766 |
ax_conf.set_xlabel('Predicted Labels')
|
767 |
ax_conf.set_ylabel('True Labels')
|
768 |
ax_conf.set_title('Confusion Matrix')
|
@@ -813,6 +819,7 @@ elif app_mode == "Model Training":
|
|
813 |
|
814 |
except Exception as e:
|
815 |
st.error(f"An error occurred: {e}")
|
|
|
816 |
else:
|
817 |
st.write("Please upload and clean data first.")
|
818 |
|
@@ -848,18 +855,16 @@ elif app_mode == "Model Training":
|
|
848 |
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
|
849 |
accuracy = accuracy_score(y_test, y_pred)
|
850 |
st.write(f"Accuracy: {accuracy:.4f}")
|
|
|
851 |
except Exception as e: #local error
|
852 |
st.error(f"An error occurred during model evaluation: {e}")
|
853 |
-
|
854 |
elif app_mode == "Predictions":
|
855 |
st.title("🔮 Make Predictions")
|
856 |
|
857 |
if st.session_state.model is not None and st.session_state.cleaned_data is not None:
|
858 |
df = st.session_state.cleaned_data.copy()
|
859 |
-
|
860 |
-
# Input data for prediction
|
861 |
-
st.subheader("Enter Data for Prediction")
|
862 |
-
input_data = {}
|
863 |
|
864 |
try:
|
865 |
numeric_transformer_columns = st.session_state.model.steps[0][1].transformers_[0][2] if hasattr(st.session_state.model.steps[0][1].transformers_[0][2], '__len__') else []
|
@@ -868,33 +873,58 @@ elif app_mode == "Predictions":
|
|
868 |
except AttributeError as e:
|
869 |
st.error(f"Error accessing model transformers: {e}. Please ensure a valid model is trained and loaded.")
|
870 |
st.stop()
|
871 |
-
|
|
|
872 |
if not set(model_columns).issubset(set(df.columns)): #Fixed comparison
|
873 |
st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
|
874 |
st.stop()
|
875 |
-
|
|
|
|
|
876 |
for col in model_columns:
|
877 |
if pd.api.types.is_numeric_dtype(df[col]):
|
878 |
input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
|
879 |
else:
|
880 |
input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
|
881 |
|
882 |
-
# Prediction Button
|
883 |
if st.button("Make Prediction"):
|
884 |
try:
|
885 |
input_df = pd.DataFrame([input_data])
|
886 |
-
|
|
|
|
|
887 |
st.subheader("Prediction Result")
|
888 |
st.write(f"The predicted value is: {prediction}")
|
889 |
|
890 |
-
#
|
891 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
892 |
probabilities = st.session_state.model.predict_proba(input_df)[0]
|
893 |
st.write("Predicted Probabilities:")
|
894 |
-
st.write(probabilities)
|
895 |
-
|
896 |
except Exception as e:
|
897 |
-
st.error(f"An error occurred during prediction: {e}")
|
898 |
|
899 |
#Add batch prediction section in prediction tab
|
900 |
st.subheader("Batch Predictions")
|
@@ -902,11 +932,36 @@ elif app_mode == "Predictions":
|
|
902 |
if batch_file is not None:
|
903 |
try:
|
904 |
batch_df = pd.read_csv(batch_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
905 |
# Preprocess the batch data
|
906 |
-
batch_processed = st.session_state.preprocessor.transform(batch_df)
|
907 |
# Make predictions
|
908 |
batch_predictions = st.session_state.model.predict(batch_processed)
|
909 |
batch_df['Prediction'] = batch_predictions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
910 |
st.dataframe(batch_df)
|
911 |
|
912 |
# Download predictions
|
|
|
587 |
except Exception as e:
|
588 |
st.error(f"An error occurred during the T-test: {e}")
|
589 |
|
590 |
+
#MODEL TRAINING
|
591 |
elif app_mode == "Model Training":
|
592 |
st.title("🚂 Model Training")
|
593 |
|
|
|
611 |
|
612 |
feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
|
613 |
|
614 |
+
# Hyperparameter tuning
|
615 |
+
tuning_method = st.selectbox("Hyperparameter Tuning Method",["Grid Search","Bayesian Optimization","None"])
|
616 |
+
|
617 |
if model_name == "Random Forest" and feature_columns: # Check if Random Forest and features are selected
|
618 |
min_features = 1 # Ensure at least one feature is used
|
619 |
max_features = len(feature_columns) if len(feature_columns) > 0 else 1 # Use 1 if no features are selected
|
|
|
673 |
X_train_selected = X_train_processed
|
674 |
X_test_selected = X_test_processed
|
675 |
|
676 |
+
# Model Training and Hyperparameter Tuning
|
677 |
if model_name == "Linear Regression":
|
678 |
model = LinearRegression()
|
|
|
|
|
679 |
elif model_name == "Logistic Regression":
|
680 |
model = LogisticRegression(max_iter=1000)
|
|
|
681 |
elif model_name == "Decision Tree":
|
682 |
if problem_type == "Regression":
|
683 |
model = DecisionTreeRegressor()
|
|
|
684 |
else:
|
685 |
model = DecisionTreeClassifier()
|
|
|
686 |
elif model_name == "Random Forest":
|
687 |
+
# Random Forest training with optional grid-search tuning.
# Build the estimator once, choosing the variant and the grid-search scoring
# metric from the problem type.
if problem_type == "Regression":
    model = RandomForestRegressor(random_state=42)
    rf_scoring = 'neg_mean_squared_error'  # Example scoring
else:
    model = RandomForestClassifier(random_state=42)
    rf_scoring = 'accuracy'

if tuning_method == "Bayesian Optimization":
    # Not implemented yet: tell the user, then fall through to the untuned
    # fit below so `model` is always bound when the pipeline is assembled.
    st.write("Implementing this function to be added soon")

# Only tune when the user asked for grid search AND a parameter grid was
# built earlier in the script; "None" now really means no tuning.
if tuning_method == "Grid Search" and 'param_grid' in locals():
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring=rf_scoring)
    grid_search.fit(X_train_selected, y_train)
    model = grid_search.best_estimator_
    st.write("Best Parameters:", grid_search.best_params_)
else:
    # Fit with default hyperparameters when no tuning is requested/possible.
    model.fit(X_train_selected, y_train)
|
711 |
elif model_name == "Gradient Boosting":
|
712 |
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
|
713 |
model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
|
|
|
726 |
|
727 |
# Store model and preprocessor.
# The pipeline bundles the preprocessor with the trained estimator;
# the preprocessor is also kept on its own in session state.
# (The same two assignments were previously repeated twice in a row.)
st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
st.session_state.preprocessor = preprocessor
|
734 |
|
735 |
# Model Evaluation
|
|
|
768 |
|
769 |
#Heatmap
|
770 |
fig_conf, ax_conf = plt.subplots()
|
771 |
+
# Draw the confusion matrix on the axes created above; the axes must be passed
# as the `ax` keyword (a positional argument after keywords is a SyntaxError).
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)
|
772 |
ax_conf.set_xlabel('Predicted Labels')
|
773 |
ax_conf.set_ylabel('True Labels')
|
774 |
ax_conf.set_title('Confusion Matrix')
|
|
|
819 |
|
820 |
except Exception as e:
|
821 |
st.error(f"An error occurred: {e}")
|
822 |
+
|
823 |
else:
|
824 |
st.write("Please upload and clean data first.")
|
825 |
|
|
|
855 |
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
|
856 |
accuracy = accuracy_score(y_test, y_pred)
|
857 |
st.write(f"Accuracy: {accuracy:.4f}")
|
858 |
+
|
859 |
except Exception as e: #local error
|
860 |
st.error(f"An error occurred during model evaluation: {e}")
|
861 |
+
|
862 |
elif app_mode == "Predictions":
|
863 |
st.title("🔮 Make Predictions")
|
864 |
|
865 |
if st.session_state.model is not None and st.session_state.cleaned_data is not None:
|
866 |
df = st.session_state.cleaned_data.copy()
|
867 |
+
model = st.session_state.model.steps[-1][1] #Define model from the state
|
|
|
|
|
|
|
868 |
|
869 |
try:
|
870 |
numeric_transformer_columns = st.session_state.model.steps[0][1].transformers_[0][2] if hasattr(st.session_state.model.steps[0][1].transformers_[0][2], '__len__') else []
|
|
|
873 |
except AttributeError as e:
|
874 |
st.error(f"Error accessing model transformers: {e}. Please ensure a valid model is trained and loaded.")
|
875 |
st.stop()
|
876 |
+
|
877 |
+
model_is_classification = hasattr(model, 'predict_proba') # Check for classification or other problem
|
878 |
if not set(model_columns).issubset(set(df.columns)): #Fixed comparison
|
879 |
st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
|
880 |
st.stop()
|
881 |
+
|
882 |
+
input_data = {}
|
883 |
+
st.subheader("Enter Data for Prediction")
|
884 |
for col in model_columns:
|
885 |
if pd.api.types.is_numeric_dtype(df[col]):
|
886 |
input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
|
887 |
else:
|
888 |
input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
|
889 |
|
890 |
+
# Make Prediction Button
|
891 |
if st.button("Make Prediction"):
|
892 |
try:
|
893 |
input_df = pd.DataFrame([input_data])
|
894 |
+
# Transformed copy of the input row. It is kept because the SHAP explanation
# further down feeds processed features directly to the underlying estimator.
input_processed = st.session_state.preprocessor.transform(input_df)
# st.session_state.model is the full Pipeline (preprocessor + estimator), so it
# must receive the raw row; passing input_processed would preprocess it twice.
prediction = st.session_state.model.predict(input_df)[0]
|
897 |
st.subheader("Prediction Result")
|
898 |
st.write(f"The predicted value is: {prediction}")
|
899 |
|
900 |
+
# Show shap values chart
|
901 |
+
show_shap_values = st.checkbox("View SHAP Explanation") #select model to show shap values
|
902 |
+
|
903 |
+
|
904 |
+
if show_shap_values and model_is_classification and model_name not in ["Linear Regression","Logistic Regression","SVM","Naive Bayes", "KNN"]:#Show shap values if this can perform.
|
905 |
+
|
906 |
+
try:
|
907 |
+
import shap #Import lib
|
908 |
+
explainer = shap.TreeExplainer(st.session_state.model.steps[-1][1]) #Used tree model because these are easily visualized
|
909 |
+
|
910 |
+
shap_values = explainer.shap_values(input_processed) #Get output of each values, only used in tree models
|
911 |
+
|
912 |
+
st.subheader("SHAP Values")
|
913 |
+
#Plot for each of the different class labels.
|
914 |
+
|
915 |
+
shap.initjs()
|
916 |
+
fig_shap, ax_shap = plt.subplots(1, figsize = (10,10))
|
917 |
+
shap.summary_plot(shap_values, features = X_train, feature_names = feature_columns, plot_type = "bar")#plot for multi class labels
|
918 |
+
st.pyplot(fig_shap) #Show the figure
|
919 |
+
except Exception as e:
|
920 |
+
st.write(f"Can show shap values on tree based model: {e}") #Show error
|
921 |
+
# Additional Feedback (Example for Classification)
|
922 |
+
if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'): #If the end variable has predict_proba and is therefore a predictor
|
923 |
probabilities = st.session_state.model.predict_proba(input_df)[0]
|
924 |
st.write("Predicted Probabilities:")
|
925 |
+
st.write(probabilities) #write here
|
|
|
926 |
except Exception as e:
|
927 |
+
st.error(f"An error occurred during prediction: {e}") #Base case error
|
928 |
|
929 |
#Add batch prediction section in prediction tab
|
930 |
st.subheader("Batch Predictions")
|
|
|
932 |
if batch_file is not None:
|
933 |
try:
|
934 |
batch_df = pd.read_csv(batch_file)
|
935 |
+
# Validate the uploaded batch against the columns the model was trained on.
# The column-presence check must run first: the dtype loop below indexes
# batch_df[col] and would raise KeyError on a missing column before the
# user-facing message could be shown.
if not set(model_columns).issubset(set(batch_df.columns)):
    st.error("The batch dataframe that contains different columns than the currently used training dataframe. Please upload the correct dataframe.")
    st.stop()

# Coerce every model column to the kind of dtype seen in the training data.
for col in model_columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        try:
            batch_df[col] = pd.to_numeric(batch_df[col], errors='raise')
        except (ValueError, TypeError):
            st.error(f"Column '{col}' must be numeric.")
            st.stop()
    else:
        # Non-numeric training columns are handled as strings.
        batch_df[col] = batch_df[col].astype(str)
|
950 |
+
|
951 |
# Select the raw feature columns the model expects. No manual transform here:
# st.session_state.model is the full Pipeline and applies the preprocessor
# itself, so passing already-transformed data would preprocess it twice.
batch_features = batch_df[model_columns]

# Make predictions
batch_predictions = st.session_state.model.predict(batch_features)

# Add probability output if the final estimator supports it. Computed before
# any result columns are written so the feature frame stays untouched.
batch_probabilities = None
if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'):
    batch_probabilities = st.session_state.model.predict_proba(batch_features)

batch_df['Prediction'] = batch_predictions
if batch_probabilities is not None:
    # One column per class, in the order returned by predict_proba.
    for i in range(batch_probabilities.shape[1]):
        batch_df[f'Probability_Class_{i}'] = batch_probabilities[:, i]
|
962 |
+
|
963 |
+
|
964 |
+
|
965 |
st.dataframe(batch_df)
|
966 |
|
967 |
# Download predictions
|