Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -670,13 +670,18 @@ elif app_mode == "Model Training":
|
|
670 |
# Model Training and Hyperparameter Tuning
|
671 |
if model_name == "Linear Regression":
|
672 |
model = LinearRegression()
|
|
|
|
|
673 |
elif model_name == "Logistic Regression":
|
674 |
model = LogisticRegression(max_iter=1000)
|
|
|
675 |
elif model_name == "Decision Tree":
|
676 |
if problem_type == "Regression":
|
677 |
model = DecisionTreeRegressor()
|
|
|
678 |
else:
|
679 |
model = DecisionTreeClassifier()
|
|
|
680 |
elif model_name == "Random Forest":
|
681 |
if problem_type == "Regression":
|
682 |
model = RandomForestRegressor(random_state=42)
|
@@ -700,70 +705,29 @@ elif app_mode == "Model Training":
|
|
700 |
elif model_name == "Gradient Boosting":
|
701 |
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
|
702 |
model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
|
|
|
703 |
elif model_name == "SVM":
|
704 |
model = SVR() if problem_type == "Regression" else SVC()
|
|
|
705 |
elif model_name == "Naive Bayes":
|
706 |
from sklearn.naive_bayes import GaussianNB
|
707 |
model = GaussianNB()
|
|
|
708 |
elif model_name == "KNN":
|
709 |
from sklearn.neighbors import KNeighborsClassifier
|
710 |
model = KNeighborsClassifier()
|
|
|
711 |
|
712 |
-
#
|
713 |
-
|
714 |
-
|
715 |
-
cost_sensitive = st.checkbox("Enable Cost-Sensitive Classification") #new
|
716 |
-
|
717 |
-
if cost_sensitive:
|
718 |
-
#Get class labels
|
719 |
-
classes = np.unique(y_train)
|
720 |
-
|
721 |
-
#Create a matrix, with default cost being 1
|
722 |
-
cost_matrix = np.ones((len(classes),len(classes)))
|
723 |
-
|
724 |
-
#Cost of correct predictions are 0
|
725 |
-
np.fill_diagonal(cost_matrix, 0)
|
726 |
-
|
727 |
-
#Allow for individual weight specification
|
728 |
-
st.write("Define misclassification costs:")
|
729 |
-
|
730 |
-
for i in range(len(classes)):
|
731 |
-
for j in range(len(classes)):
|
732 |
-
if i != j:
|
733 |
-
cost_matrix[i,j] = st.number_input(f"Cost of classifying {classes[i]} as {classes[j]}", value=1.0, min_value=0.0)
|
734 |
-
|
735 |
-
|
736 |
-
#Threshold adjustment options, only shows up for log regression
|
737 |
-
|
738 |
-
if model_name == "Logistic Regression" and problem_type == "Classification":
|
739 |
-
threshold = st.slider("Select Threshold", 0.0, 1.0, 0.5, 0.01, help="Adjust the classification threshold")
|
740 |
-
model = LogisticRegression(max_iter=1000)
|
741 |
-
|
742 |
-
model.fit(X_train_selected, y_train) #Fit model
|
743 |
-
|
744 |
-
#Adjust predictions according to threshold and make new variables
|
745 |
-
y_pred_prob = model.predict_proba(X_test_selected)[:,1]
|
746 |
-
y_pred = (y_pred_prob > threshold).astype(int)
|
747 |
-
|
748 |
-
|
749 |
-
else:
|
750 |
-
# Cross-validation
|
751 |
-
cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='accuracy' if problem_type == "Classification" else 'neg_mean_squared_error') #example, adjust cv
|
752 |
-
st.write(f"Cross-validation scores: {cv_scores}")
|
753 |
-
st.write(f"Mean cross-validation score: {cv_scores.mean():.4f}")
|
754 |
-
|
755 |
-
model.fit(X_train_selected, y_train)
|
756 |
-
|
757 |
-
# Store model and preprocessor
|
758 |
-
st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
|
759 |
-
st.session_state.preprocessor = preprocessor
|
760 |
-
|
761 |
-
y_pred = model.predict(X_test_selected)
|
762 |
-
|
763 |
|
|
|
|
|
|
|
764 |
|
765 |
# Model Evaluation
|
766 |
-
|
767 |
if problem_type == "Regression":
|
768 |
mse = mean_squared_error(y_test, y_pred)
|
769 |
r2 = r2_score(y_test, y_pred)
|
@@ -799,10 +763,6 @@ elif app_mode == "Model Training":
|
|
799 |
ax_conf.set_title('Confusion Matrix')
|
800 |
st.pyplot(fig_conf)
|
801 |
|
802 |
-
st.success("Model trained successfully!")
|
803 |
-
|
804 |
-
except Exception as e:
|
805 |
-
st.error(f"An error occurred: {e}")
|
806 |
else:
|
807 |
st.write("Please upload and clean data first.")
|
808 |
|
@@ -833,6 +793,11 @@ elif app_mode == "Model Training":
|
|
833 |
st.write(f"Mean Squared Error: {mse:.4f}")
|
834 |
st.write(f"R-squared: {r2:.4f}")
|
835 |
else:
|
|
|
|
|
|
|
|
|
|
|
836 |
accuracy = accuracy_score(y_test, y_pred)
|
837 |
st.write(f"Accuracy: {accuracy:.4f}")
|
838 |
|
@@ -906,67 +871,7 @@ elif app_mode == "Predictions":
|
|
906 |
else:
|
907 |
st.write("Please train a model first in the 'Model Training' section.")
|
908 |
|
909 |
-
elif app_mode == "Predictions":
|
910 |
-
st.title("🔮 Make Predictions")
|
911 |
-
|
912 |
-
if st.session_state.model is not None and st.session_state.cleaned_data is not None:
|
913 |
-
df = st.session_state.cleaned_data.copy()
|
914 |
-
|
915 |
-
# Input data for prediction
|
916 |
-
st.subheader("Enter Data for Prediction")
|
917 |
-
input_data = {}
|
918 |
-
model_columns = st.session_state.model.steps[0][1].transformers_[0][2] + st.session_state.model.steps[0][1].transformers_[1][2]
|
919 |
-
if not set(model_columns).issubset(set(df.drop(columns=[st.session_state.model.steps[-1][0]]).columns)):
|
920 |
-
st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
|
921 |
-
st.stop()
|
922 |
-
|
923 |
-
for col in model_columns:
|
924 |
-
if pd.api.types.is_numeric_dtype(df[col]):
|
925 |
-
input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
|
926 |
-
else:
|
927 |
-
input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
|
928 |
-
|
929 |
-
# Prediction Button
|
930 |
-
if st.button("Make Prediction"):
|
931 |
-
try:
|
932 |
-
input_df = pd.DataFrame([input_data])
|
933 |
-
prediction = st.session_state.model.predict(input_df)[0]
|
934 |
-
st.subheader("Prediction Result")
|
935 |
-
st.write(f"The predicted value is: {prediction}")
|
936 |
-
|
937 |
-
# Additional Feedback (Example for Classification)
|
938 |
-
if isinstance(st.session_state.model.steps[-1][1], LogisticRegression):
|
939 |
-
probabilities = st.session_state.model.predict_proba(input_df)[0]
|
940 |
-
st.write("Predicted Probabilities:")
|
941 |
-
st.write(probabilities)
|
942 |
-
|
943 |
-
except Exception as e:
|
944 |
-
st.error(f"An error occurred during prediction: {e}")
|
945 |
-
|
946 |
-
#Add batch prediction section in prediction tab
|
947 |
-
st.subheader("Batch Predictions")
|
948 |
-
batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
|
949 |
-
if batch_file is not None:
|
950 |
-
try:
|
951 |
-
batch_df = pd.read_csv(batch_file)
|
952 |
-
# Preprocess the batch data
|
953 |
-
batch_processed = st.session_state.preprocessor.transform(batch_df)
|
954 |
-
# Make predictions
|
955 |
-
batch_predictions = st.session_state.model.predict(batch_processed)
|
956 |
-
batch_df['Prediction'] = batch_predictions
|
957 |
-
st.dataframe(batch_df)
|
958 |
|
959 |
-
# Download predictions
|
960 |
-
csv = batch_df.to_csv(index=False)
|
961 |
-
b64 = base64.b64encode(csv.encode()).decode() # some strings
|
962 |
-
href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
|
963 |
-
st.markdown(href, unsafe_allow_html=True)
|
964 |
-
|
965 |
-
except Exception as e:
|
966 |
-
st.error(f"Error processing batch file: {e}")
|
967 |
-
|
968 |
-
else:
|
969 |
-
st.write("Please train a model first in the 'Model Training' section.")
|
970 |
|
971 |
elif app_mode == "Visualization Lab":
|
972 |
st.title("🔬 Advanced Data Visualization and Clustering Lab")
|
|
|
670 |
# Model Training and Hyperparameter Tuning
|
671 |
if model_name == "Linear Regression":
|
672 |
model = LinearRegression()
|
673 |
+
model.fit(X_train_selected, y_train)
|
674 |
+
|
675 |
elif model_name == "Logistic Regression":
|
676 |
model = LogisticRegression(max_iter=1000)
|
677 |
+
model.fit(X_train_selected, y_train)
|
678 |
elif model_name == "Decision Tree":
|
679 |
if problem_type == "Regression":
|
680 |
model = DecisionTreeRegressor()
|
681 |
+
model.fit(X_train_selected, y_train)
|
682 |
else:
|
683 |
model = DecisionTreeClassifier()
|
684 |
+
model.fit(X_train_selected, y_train)
|
685 |
elif model_name == "Random Forest":
|
686 |
if problem_type == "Regression":
|
687 |
model = RandomForestRegressor(random_state=42)
|
|
|
705 |
elif model_name == "Gradient Boosting":
|
706 |
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
|
707 |
model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
|
708 |
+
model.fit(X_train_selected, y_train)
|
709 |
elif model_name == "SVM":
|
710 |
model = SVR() if problem_type == "Regression" else SVC()
|
711 |
+
model.fit(X_train_selected, y_train)
|
712 |
elif model_name == "Naive Bayes":
|
713 |
from sklearn.naive_bayes import GaussianNB
|
714 |
model = GaussianNB()
|
715 |
+
model.fit(X_train_selected, y_train)
|
716 |
elif model_name == "KNN":
|
717 |
from sklearn.neighbors import KNeighborsClassifier
|
718 |
model = KNeighborsClassifier()
|
719 |
+
model.fit(X_train_selected, y_train)
|
720 |
|
721 |
+
# Store model and preprocessor
|
722 |
+
st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
|
723 |
+
st.session_state.preprocessor = preprocessor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
724 |
|
725 |
+
# Store model and preprocessor
|
726 |
+
st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
|
727 |
+
st.session_state.preprocessor = preprocessor
|
728 |
|
729 |
# Model Evaluation
|
730 |
+
y_pred = model.predict(X_test_selected)
|
731 |
if problem_type == "Regression":
|
732 |
mse = mean_squared_error(y_test, y_pred)
|
733 |
r2 = r2_score(y_test, y_pred)
|
|
|
763 |
ax_conf.set_title('Confusion Matrix')
|
764 |
st.pyplot(fig_conf)
|
765 |
|
|
|
|
|
|
|
|
|
766 |
else:
|
767 |
st.write("Please upload and clean data first.")
|
768 |
|
|
|
793 |
st.write(f"Mean Squared Error: {mse:.4f}")
|
794 |
st.write(f"R-squared: {r2:.4f}")
|
795 |
else:
|
796 |
+
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
|
797 |
+
|
798 |
+
#Weighted averaging for metrics for multiclass
|
799 |
+
average_method = "weighted" #changed from None
|
800 |
+
|
801 |
accuracy = accuracy_score(y_test, y_pred)
|
802 |
st.write(f"Accuracy: {accuracy:.4f}")
|
803 |
|
|
|
871 |
else:
|
872 |
st.write("Please train a model first in the 'Model Training' section.")
|
873 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
874 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
875 |
|
876 |
elif app_mode == "Visualization Lab":
|
877 |
st.title("🔬 Advanced Data Visualization and Clustering Lab")
|