Spaces:

CosmickVisions
/

Data-Vision

Sleeping

App Files Files Community

CosmickVisions committed on Feb 28

Commit

00b2520

verified ·

1 Parent(s): 71ef8d0

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -53

app.py CHANGED Viewed

@@ -573,62 +573,134 @@ elif app_mode == "Advanced EDA":
 elif app_mode == "Model Training":
     st.title("🚂 Model Training")
-    feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
-if model_name == "Random Forest":
-    param_grid = {
-        'n_estimators': st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest."),
-        'max_depth': st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree."),
-        'min_samples_split': st.slider("Minimum Samples Split", 2, 10, 2, help="Minimum samples required to split an internal node"), #New hyperparameter
-        'min_samples_leaf': st.slider("Minimum Samples Leaf", 1, 10, 1, help="Minimum samples required to be at a leaf node"), #New hyperparameter
-    }
-#Inside the train model button
-if st.button("Train Model"):
-     #Feature Selection
-        if feature_selection_method == "SelectKBest":
-            k = st.slider("Number of Features to Select", 1, len(feature_columns), len(feature_columns))
-            selector = SelectKBest(k=k)
-            X_train_selected = selector.fit_transform(X_train_processed, y_train)
-            X_test_selected = selector.transform(X_test_processed)
-        else:
-            X_train_selected = X_train_processed
-            X_test_selected = X_test_processed
-    # Model Training and Hyperparameter Tuning
-        if model_name == "Linear Regression":
-            model = LinearRegression()
-        elif model_name == "Logistic Regression":
-            model = LogisticRegression(max_iter=1000)
-        elif model_name == "Decision Tree":
-            if problem_type == "Regression":
-                model = DecisionTreeRegressor()
-            else:
-                model = DecisionTreeClassifier()
-        elif model_name == "Random Forest":
-            if problem_type == "Regression":
-                model = RandomForestRegressor(random_state=42)
-                grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')  # Example scoring
-                grid_search.fit(X_train_selected, y_train)
-                model = grid_search.best_estimator_
-                st.write("Best Parameters:", grid_search.best_params_)
-            else:
-                model = RandomForestClassifier(random_state=42)
-                grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
-                grid_search.fit(X_train_selected, y_train)
-                model = grid_search.best_estimator_
-                st.write("Best Parameters:", grid_search.best_params_)
-        elif model_name == "Gradient Boosting":
-            model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
-        elif model_name == "SVM":
-            model = SVR() if problem_type == "Regression" else SVC()
-        # Cross-validation
-        cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5) #example, adjust cv
-        st.write(f"Cross-validation scores: {cv_scores}")
-        st.write(f"Mean cross-validation score: {cv_scores.mean():.4f}")
-        model.fit(X_train_selected, y_train)
        # Model Saving
         model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
@@ -648,7 +720,7 @@ if st.button("Train Model"):
                 st.error(f"Error loading model: {e}")
        #Model Evaluation Section
-        y_pred = model.predict(X_test_selected)
         if problem_type == "Regression":
             mse = mean_squared_error(y_test, y_pred)

 elif app_mode == "Model Training":
     st.title("🚂 Model Training")
+    if st.session_state.cleaned_data is not None:
+        df = st.session_state.cleaned_data.copy()
+        # Target Variable Selection
+        target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
+        # Problem Type Selection
+        problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of problem.")
+        # Feature Selection
+        feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose features for training.")
+        # Model Selection
+        model_name = st.selectbox("Select Model", [
+            "Linear Regression", "Logistic Regression", "Decision Tree",
+            "Random Forest", "Gradient Boosting", "SVM"
+        ], help="Choose a model.")
+        feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
+        if model_name == "Random Forest":
+            param_grid = {
+                'n_estimators': st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest."),
+                'max_depth': st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree."),
+                'min_samples_split': st.slider("Minimum Samples Split", 2, 10, 2, help="Minimum samples required to split an internal node"), #New hyperparameter
+                'min_samples_leaf': st.slider("Minimum Samples Leaf", 1, 10, 1, help="Minimum samples required to be at a leaf node"), #New hyperparameter
+            }
+        # Train-Test Split
+        test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
+        if st.button("Train Model"):
+            with st.spinner("Training model..."):
+                try:
+                    X = df[feature_columns]
+                    y = df[target_column]
+                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
+                    # Preprocessing Pipeline
+                    numeric_features = X.select_dtypes(include=np.number).columns
+                    categorical_features = X.select_dtypes(exclude=np.number).columns
+                    numeric_transformer = Pipeline(steps=[
+                        ('imputer', SimpleImputer(strategy='median')),
+                        ('scaler', StandardScaler())
+                    ])
+                    categorical_transformer = Pipeline(steps=[
+                        ('imputer', SimpleImputer(strategy='most_frequent')),
+                        ('onehot', OneHotEncoder(handle_unknown='ignore'))
+                    ])
+                    preprocessor = ColumnTransformer(
+                        transformers=[
+                            ('num', numeric_transformer, numeric_features),
+                            ('cat', categorical_transformer, categorical_features)
+                        ])
+                    X_train_processed = preprocessor.fit_transform(X_train)
+                    X_test_processed = preprocessor.transform(X_test)
+                    #Feature Selection
+                    if feature_selection_method == "SelectKBest":
+                        k = st.slider("Number of Features to Select", 1, len(feature_columns), len(feature_columns))
+                        selector = SelectKBest(k=k)
+                        X_train_selected = selector.fit_transform(X_train_processed, y_train)
+                        X_test_selected = selector.transform(X_test_processed)
+                    else:
+                        X_train_selected = X_train_processed
+                        X_test_selected = X_test_processed
+                    # Model Training and Hyperparameter Tuning
+                    if model_name == "Linear Regression":
+                        model = LinearRegression()
+                    elif model_name == "Logistic Regression":
+                        model = LogisticRegression(max_iter=1000)
+                    elif model_name == "Decision Tree":
+                        if problem_type == "Regression":
+                            model = DecisionTreeRegressor()
+                        else:
+                            model = DecisionTreeClassifier()
+                    elif model_name == "Random Forest":
+                        if problem_type == "Regression":
+                            model = RandomForestRegressor(random_state=42)
+                            grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')  # Example scoring
+                            grid_search.fit(X_train_selected, y_train)
+                            model = grid_search.best_estimator_
+                            st.write("Best Parameters:", grid_search.best_params_)
+                        else:
+                            model = RandomForestClassifier(random_state=42)
+                            grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
+                            grid_search.fit(X_train_selected, y_train)
+                            model = grid_search.best_estimator_
+                            st.write("Best Parameters:", grid_search.best_params_)
+                    elif model_name == "Gradient Boosting":
+                        model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
+                    elif model_name == "SVM":
+                        model = SVR() if problem_type == "Regression" else SVC()
+                    # Cross-validation
+                    cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5) #example, adjust cv
+                    st.write(f"Cross-validation scores: {cv_scores}")
+                    st.write(f"Mean cross-validation score: {cv_scores.mean():.4f}")
+                    model.fit(X_train_selected, y_train)
+                    # Store model and preprocessor
+                    st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
+                    st.session_state.preprocessor = preprocessor
+                    # Model Evaluation
+                    y_pred = model.predict(X_test_selected)
+                    if problem_type == "Regression":
+                        mse = mean_squared_error(y_test, y_pred)
+                        r2 = r2_score(y_test, y_pred)
+                        st.write(f"Mean Squared Error: {mse:.4f}")
+                        st.write(f"R-squared: {r2:.4f}")
+                    else:
+                        accuracy = accuracy_score(y_test, y_pred)
+                        st.write(f"Accuracy: {accuracy:.4f}")
+                    st.success("Model trained successfully!")
+                except Exception as e:
+                    st.error(f"An error occurred: {e}")
+    else:
+        st.write("Please upload and clean data first.")
        # Model Saving
         model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
                 st.error(f"Error loading model: {e}")
        #Model Evaluation Section
+        y_pred = st.session_state.model.predict(X_test)
         if problem_type == "Regression":
             mse = mean_squared_error(y_test, y_pred)