Spaces:

CosmickVisions
/

Data-Vision

Sleeping

App Files Files Community

CosmickVisions committed on Mar 2

Commit

6699046

verified ·

1 Parent(s): 093b3f4

Update app.py

Browse files

Files changed (1) hide show

app.py +233 -232

app.py CHANGED Viewed

@@ -176,6 +176,239 @@ def generate_quality_report(df):
         report['columns'][col] = col_report
     return report
 # --------------------------
 # Sidebar Navigation
 # --------------------------
@@ -548,238 +781,6 @@ def eda():
 # Call the EDA function
 eda()
-# Function to train the model (Separated for clarity and reusability)
-def train_model(df, target, features, problem_type, test_size, model_type, model_params, use_grid_search=False):
-    """Train, optionally tune, and evaluate a model on ``df``.
-
-    Steps: validate the column names, mean-impute numeric features, one-hot
-    encode the remaining features, label-encode the target (classification
-    only), split, standard-scale (fitted on the training split only), fit,
-    cross-validate on the training split, score the hold-out split and
-    compute permutation importance.
-
-    Args:
-        df: Source DataFrame; a copy of the feature columns is processed.
-        target: Name of the target column.
-        features: Names of the feature columns.
-        problem_type: "Regression", "Classification" or "Multiclass".
-        test_size: Hold-out size, passed to ``train_test_split``.
-        model_type: Model name; valid names depend on ``problem_type``.
-        model_params: Optional mapping of estimator parameter overrides
-            (may be None). With grid search, a list/array value replaces the
-            default candidates and any other value becomes a single
-            candidate. Without grid search, non-list values are set on the
-            estimator and list values are ignored. Names the estimator does
-            not accept are skipped with a warning.
-        use_grid_search: Tune hyperparameters with 3-fold ``GridSearchCV``.
-
-    Returns:
-        ``(model, scaler, label_encoder, imputer_numerical, metrics,
-        column_order, importance)``. ``label_encoder`` is None for
-        regression, ``imputer_numerical`` is left unfitted when there are no
-        numeric features, ``importance`` is None when it could not be
-        computed, and all seven values are None when training fails (the
-        error is reported through ``st.error``).
-    """
-    try:
-        # Validate BEFORE indexing so a missing column produces the
-        # descriptive message below instead of a bare KeyError.
-        if target not in df.columns:
-            raise ValueError(f"Target variable '{target}' not found in DataFrame.")
-        for feature in features:
-            if feature not in df.columns:
-                raise ValueError(f"Feature '{feature}' not found in DataFrame.")
-        # Work on a copy: the imputation below assigns columns, which must
-        # not write into (a view of) the caller's DataFrame.
-        X = df[features].copy()
-        y = df[target]
-        # Imputation (numeric columns only), done before encoding
-        numerical_features = X.select_dtypes(include=np.number).columns
-        categorical_features = X.select_dtypes(exclude=np.number).columns
-        imputer_numerical = SimpleImputer(strategy='mean')  # Or 'median', 'most_frequent', 'constant'
-        # SimpleImputer rejects an empty feature set, so skip it when every
-        # feature is non-numeric.
-        if len(numerical_features) > 0:
-            X[numerical_features] = imputer_numerical.fit_transform(X[numerical_features])
-        # One-hot encode the non-numeric features. Their missing values are
-        # NOT imputed; with dummy_na=False such rows get all-zero dummies.
-        X = pd.get_dummies(X, columns=categorical_features, dummy_na=False)
-        # Target encoding (classification only)
-        is_classification = problem_type in ("Classification", "Multiclass")
-        label_encoder = None
-        if is_classification:
-            label_encoder = LabelEncoder()
-            y = label_encoder.fit_transform(y)
-        # Split the data
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=test_size, random_state=42
-        )
-        # Scaling AFTER splitting: fit on the training split only so no
-        # hold-out statistics leak into the model.
-        scaler = StandardScaler()  # Or try MinMaxScaler, RobustScaler, QuantileTransformer
-        X_train = scaler.fit_transform(X_train)
-        X_test = scaler.transform(X_test)
-        # Default hyperparameter grids shared by several model choices
-        forest_grid = {
-            'n_estimators': [100, 200],
-            'max_depth': [None, 5, 10],
-            'min_samples_split': [2, 5],
-        }
-        boosting_grid = {
-            'n_estimators': [100, 200],
-            'learning_rate': [0.01, 0.1],
-            'max_depth': [3, 5],
-        }
-        mlp_grid = {
-            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
-            'activation': ['relu', 'tanh'],
-            'alpha': [0.0001, 0.001],
-        }
-        # Model selection
-        if problem_type == "Regression":
-            if model_type == "Random Forest":
-                model = RandomForestRegressor(random_state=42)
-                param_grid = dict(forest_grid)
-            elif model_type == "Gradient Boosting":
-                model = GradientBoostingRegressor(random_state=42)
-                param_grid = dict(boosting_grid)
-            elif model_type == "Neural Network":
-                model = MLPRegressor(random_state=42, max_iter=500)
-                param_grid = dict(mlp_grid)
-            else:
-                raise ValueError(f"Invalid model type: {model_type}")
-        elif problem_type == "Classification":  # Binary
-            if model_type == "Random Forest":
-                model = RandomForestClassifier(random_state=42)
-                param_grid = dict(forest_grid)
-            elif model_type == "Gradient Boosting":
-                model = GradientBoostingClassifier(random_state=42)
-                param_grid = dict(boosting_grid)
-            elif model_type == "Neural Network":
-                model = MLPClassifier(random_state=42, max_iter=500)
-                param_grid = dict(mlp_grid)
-            else:
-                raise ValueError(f"Invalid model type: {model_type}")
-        elif problem_type == "Multiclass":
-            if model_type == "Logistic Regression":
-                model = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr')  # 'ovr' for one-vs-rest
-                param_grid = {'C': [0.1, 1.0, 10.0]}  # Regularization parameter
-            elif model_type == "Support Vector Machine":
-                model = SVC(random_state=42, probability=True)  # probability=True for probabilities
-                param_grid = {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']}
-            elif model_type == "Random Forest":
-                model = RandomForestClassifier(random_state=42)
-                param_grid = dict(forest_grid, criterion=['gini', 'entropy'])
-            else:
-                raise ValueError(f"Invalid model type: {model_type} for Multiclass")
-        else:
-            raise ValueError(f"Invalid problem type: {problem_type}")
-        # Keep only the user overrides this estimator actually accepts, so
-        # one stray key cannot abort the whole training run.
-        accepted_names = model.get_params()
-        overrides = {}
-        ignored_names = []
-        for name, value in (model_params or {}).items():
-            if name in accepted_names:
-                overrides[name] = value
-            else:
-                ignored_names.append(name)
-        if ignored_names:
-            st.warning(f"Ignoring parameters not supported by {model_type}: {ignored_names}")
-        scoring = 'accuracy' if is_classification else 'neg_mean_squared_error'
-        if use_grid_search:
-            # GridSearchCV needs a sequence of candidates per parameter, so
-            # wrap single values (tuples count as single values, e.g. (50, 50)).
-            for name, value in overrides.items():
-                param_grid[name] = list(value) if isinstance(value, (list, np.ndarray)) else [value]
-            grid_search = GridSearchCV(model, param_grid, cv=3, scoring=scoring, verbose=1, n_jobs=-1)
-            grid_search.fit(X_train, y_train)
-            model = grid_search.best_estimator_  # Use the best model found
-            st.write("Best hyperparameters found by Grid Search:", grid_search.best_params_)
-        else:
-            # Apply fixed overrides directly; candidate lists only make
-            # sense for a grid search and are skipped here.
-            fixed_params = {
-                name: value for name, value in overrides.items()
-                if not isinstance(value, (list, np.ndarray))
-            }
-            model.set_params(**fixed_params)
-            model.fit(X_train, y_train)
-        # Cross-validation on the training split (after tuning, if any)
-        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring=scoring)
-        st.write("Cross-validation scores:", cv_scores)
-        st.write("Mean cross-validation score:", cv_scores.mean())
-        # Evaluation on the hold-out split
-        y_pred = model.predict(X_test)
-        metrics = {}
-        if is_classification:
-            metrics['accuracy'] = accuracy_score(y_test, y_pred)
-            metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
-            metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
-        else:
-            metrics['mse'] = mean_squared_error(y_test, y_pred)
-            metrics['r2'] = r2_score(y_test, y_pred)
-        # Permutation importance is best-effort: a failure only drops the
-        # importance values, not the trained model.
-        try:
-            result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
-            importance = result.importances_mean
-        except Exception as e:
-            st.warning(f"Could not calculate feature importance: {e}")
-            importance = None
-        # Encoded column order, needed to align data at prediction time
-        column_order = X.columns
-        return model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance
-    except Exception as e:
-        st.error(f"Training failed: {str(e)}")
-        return None, None, None, None, None, None, None
-# Model Validation Function
-def validate_model(model_path, df, target, features, test_size):
-    """Evaluate a saved model bundle on a hold-out split of ``df``.
-
-    The bundle at ``model_path`` is read with ``joblib.load`` and must
-    provide the keys 'model', 'scaler', 'label_encoder',
-    'imputer_numerical', 'column_order' and 'problem_type'. The features
-    are imputed, one-hot encoded, aligned to ``column_order``, split with
-    ``random_state=42`` and scaled; metrics are computed on the test part.
-
-    Args:
-        model_path: Path (or file object) accepted by ``joblib.load``.
-        df: DataFrame holding the feature and target columns.
-        target: Name of the target column.
-        features: Names of the feature columns.
-        test_size: Hold-out size, passed to ``train_test_split``.
-
-    Returns:
-        ``(metrics, problem_type)``; ``(None, None)`` when validation fails
-        (the error is reported through ``st.error``).
-    """
-    try:
-        # NOTE(security): joblib.load unpickles arbitrary objects and can
-        # execute code; only load model files from a trusted source.
-        loaded_data = joblib.load(model_path)
-        model = loaded_data['model']
-        scaler = loaded_data['scaler']
-        label_encoder = loaded_data['label_encoder']
-        imputer_numerical = loaded_data['imputer_numerical']
-        column_order = loaded_data['column_order']
-        problem_type = loaded_data['problem_type']
-        # Copy so the imputation below never writes into the caller's df.
-        X = df[features].copy()
-        y = df[target]
-        # Imputation (skipped when there is nothing numeric to impute)
-        numerical_features = X.select_dtypes(include=np.number).columns
-        if len(numerical_features) > 0:
-            X[numerical_features] = imputer_numerical.transform(X[numerical_features])
-        # Encoding
-        X = pd.get_dummies(X, columns=X.select_dtypes(exclude=np.number).columns, dummy_na=False)
-        # Align to the training layout: dummy columns for categories absent
-        # from this data are added as zeros, unseen ones are dropped.
-        X = X.reindex(columns=column_order, fill_value=0)
-        # Encode the target BEFORE splitting so y_test is in the same label
-        # space as the model's predictions.
-        is_classification = problem_type in ("Classification", "Multiclass")
-        if is_classification:
-            y = label_encoder.transform(y)
-        # Split the data; only the test part is evaluated
-        _, X_test, _, y_test = train_test_split(
-            X, y, test_size=test_size, random_state=42
-        )
-        # Scaling with the scaler fitted at training time
-        X_test = scaler.transform(X_test)
-        y_pred = model.predict(X_test)
-        metrics = {}
-        if is_classification:
-            metrics['accuracy'] = accuracy_score(y_test, y_pred)
-            metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
-            metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
-        else:
-            metrics['mse'] = mean_squared_error(y_test, y_pred)
-            metrics['r2'] = r2_score(y_test, y_pred)
-        return metrics, problem_type
-    except Exception as e:
-        st.error(f"Validation failed: {str(e)}")
-        return None, None
 # Streamlit App
 elif app_mode == "Model Training":
     st.title("🤖 Intelligent Model Training")

         report['columns'][col] = col_report
     return report
+# Function to train the model (Separated for clarity and reusability)
+def train_model(df, target, features, problem_type, test_size, model_type, model_params, use_grid_search=False):
+    """Train, optionally tune, and evaluate a model on ``df``.
+
+    Steps: validate the column names, mean-impute numeric features, one-hot
+    encode the remaining features, label-encode the target (classification
+    only), split, standard-scale (fitted on the training split only), fit,
+    cross-validate on the training split, score the hold-out split and
+    compute permutation importance.
+
+    Args:
+        df: Source DataFrame; a copy of the feature columns is processed.
+        target: Name of the target column.
+        features: Names of the feature columns.
+        problem_type: "Regression", "Classification" or "Multiclass".
+        test_size: Hold-out size, passed to ``train_test_split``.
+        model_type: Model name; valid names depend on ``problem_type``.
+        model_params: Optional mapping of estimator parameter overrides
+            (may be None). With grid search, a list/array value replaces the
+            default candidates and any other value becomes a single
+            candidate. Without grid search, non-list values are set on the
+            estimator and list values are ignored. Names the estimator does
+            not accept are skipped with a warning.
+        use_grid_search: Tune hyperparameters with 3-fold ``GridSearchCV``.
+
+    Returns:
+        ``(model, scaler, label_encoder, imputer_numerical, metrics,
+        column_order, importance)``. ``label_encoder`` is None for
+        regression, ``imputer_numerical`` is left unfitted when there are no
+        numeric features, ``importance`` is None when it could not be
+        computed, and all seven values are None when training fails (the
+        error is reported through ``st.error``).
+    """
+    try:
+        # Validate BEFORE indexing so a missing column produces the
+        # descriptive message below instead of a bare KeyError.
+        if target not in df.columns:
+            raise ValueError(f"Target variable '{target}' not found in DataFrame.")
+        for feature in features:
+            if feature not in df.columns:
+                raise ValueError(f"Feature '{feature}' not found in DataFrame.")
+        # Work on a copy: the imputation below assigns columns, which must
+        # not write into (a view of) the caller's DataFrame.
+        X = df[features].copy()
+        y = df[target]
+        # Imputation (numeric columns only), done before encoding
+        numerical_features = X.select_dtypes(include=np.number).columns
+        categorical_features = X.select_dtypes(exclude=np.number).columns
+        imputer_numerical = SimpleImputer(strategy='mean')  # Or 'median', 'most_frequent', 'constant'
+        # SimpleImputer rejects an empty feature set, so skip it when every
+        # feature is non-numeric.
+        if len(numerical_features) > 0:
+            X[numerical_features] = imputer_numerical.fit_transform(X[numerical_features])
+        # One-hot encode the non-numeric features. Their missing values are
+        # NOT imputed; with dummy_na=False such rows get all-zero dummies.
+        X = pd.get_dummies(X, columns=categorical_features, dummy_na=False)
+        # Target encoding (classification only)
+        is_classification = problem_type in ("Classification", "Multiclass")
+        label_encoder = None
+        if is_classification:
+            label_encoder = LabelEncoder()
+            y = label_encoder.fit_transform(y)
+        # Split the data
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=42
+        )
+        # Scaling AFTER splitting: fit on the training split only so no
+        # hold-out statistics leak into the model.
+        scaler = StandardScaler()  # Or try MinMaxScaler, RobustScaler, QuantileTransformer
+        X_train = scaler.fit_transform(X_train)
+        X_test = scaler.transform(X_test)
+        # Default hyperparameter grids shared by several model choices
+        forest_grid = {
+            'n_estimators': [100, 200],
+            'max_depth': [None, 5, 10],
+            'min_samples_split': [2, 5],
+        }
+        boosting_grid = {
+            'n_estimators': [100, 200],
+            'learning_rate': [0.01, 0.1],
+            'max_depth': [3, 5],
+        }
+        mlp_grid = {
+            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
+            'activation': ['relu', 'tanh'],
+            'alpha': [0.0001, 0.001],
+        }
+        # Model selection
+        if problem_type == "Regression":
+            if model_type == "Random Forest":
+                model = RandomForestRegressor(random_state=42)
+                param_grid = dict(forest_grid)
+            elif model_type == "Gradient Boosting":
+                model = GradientBoostingRegressor(random_state=42)
+                param_grid = dict(boosting_grid)
+            elif model_type == "Neural Network":
+                model = MLPRegressor(random_state=42, max_iter=500)
+                param_grid = dict(mlp_grid)
+            else:
+                raise ValueError(f"Invalid model type: {model_type}")
+        elif problem_type == "Classification":  # Binary
+            if model_type == "Random Forest":
+                model = RandomForestClassifier(random_state=42)
+                param_grid = dict(forest_grid)
+            elif model_type == "Gradient Boosting":
+                model = GradientBoostingClassifier(random_state=42)
+                param_grid = dict(boosting_grid)
+            elif model_type == "Neural Network":
+                model = MLPClassifier(random_state=42, max_iter=500)
+                param_grid = dict(mlp_grid)
+            else:
+                raise ValueError(f"Invalid model type: {model_type}")
+        elif problem_type == "Multiclass":
+            if model_type == "Logistic Regression":
+                model = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr')  # 'ovr' for one-vs-rest
+                param_grid = {'C': [0.1, 1.0, 10.0]}  # Regularization parameter
+            elif model_type == "Support Vector Machine":
+                model = SVC(random_state=42, probability=True)  # probability=True for probabilities
+                param_grid = {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']}
+            elif model_type == "Random Forest":
+                model = RandomForestClassifier(random_state=42)
+                param_grid = dict(forest_grid, criterion=['gini', 'entropy'])
+            else:
+                raise ValueError(f"Invalid model type: {model_type} for Multiclass")
+        else:
+            raise ValueError(f"Invalid problem type: {problem_type}")
+        # Keep only the user overrides this estimator actually accepts, so
+        # one stray key cannot abort the whole training run.
+        accepted_names = model.get_params()
+        overrides = {}
+        ignored_names = []
+        for name, value in (model_params or {}).items():
+            if name in accepted_names:
+                overrides[name] = value
+            else:
+                ignored_names.append(name)
+        if ignored_names:
+            st.warning(f"Ignoring parameters not supported by {model_type}: {ignored_names}")
+        scoring = 'accuracy' if is_classification else 'neg_mean_squared_error'
+        if use_grid_search:
+            # GridSearchCV needs a sequence of candidates per parameter, so
+            # wrap single values (tuples count as single values, e.g. (50, 50)).
+            for name, value in overrides.items():
+                param_grid[name] = list(value) if isinstance(value, (list, np.ndarray)) else [value]
+            grid_search = GridSearchCV(model, param_grid, cv=3, scoring=scoring, verbose=1, n_jobs=-1)
+            grid_search.fit(X_train, y_train)
+            model = grid_search.best_estimator_  # Use the best model found
+            st.write("Best hyperparameters found by Grid Search:", grid_search.best_params_)
+        else:
+            # Apply fixed overrides directly; candidate lists only make
+            # sense for a grid search and are skipped here.
+            fixed_params = {
+                name: value for name, value in overrides.items()
+                if not isinstance(value, (list, np.ndarray))
+            }
+            model.set_params(**fixed_params)
+            model.fit(X_train, y_train)
+        # Cross-validation on the training split (after tuning, if any)
+        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring=scoring)
+        st.write("Cross-validation scores:", cv_scores)
+        st.write("Mean cross-validation score:", cv_scores.mean())
+        # Evaluation on the hold-out split
+        y_pred = model.predict(X_test)
+        metrics = {}
+        if is_classification:
+            metrics['accuracy'] = accuracy_score(y_test, y_pred)
+            metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
+            metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
+        else:
+            metrics['mse'] = mean_squared_error(y_test, y_pred)
+            metrics['r2'] = r2_score(y_test, y_pred)
+        # Permutation importance is best-effort: a failure only drops the
+        # importance values, not the trained model.
+        try:
+            result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
+            importance = result.importances_mean
+        except Exception as e:
+            st.warning(f"Could not calculate feature importance: {e}")
+            importance = None
+        # Encoded column order, needed to align data at prediction time
+        column_order = X.columns
+        return model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance
+    except Exception as e:
+        st.error(f"Training failed: {str(e)}")
+        return None, None, None, None, None, None, None
+# Model Validation Function
+def validate_model(model_path, df, target, features, test_size):
+    """Evaluate a saved model bundle on a hold-out split of ``df``.
+
+    The bundle at ``model_path`` is read with ``joblib.load`` and must
+    provide the keys 'model', 'scaler', 'label_encoder',
+    'imputer_numerical', 'column_order' and 'problem_type'. The features
+    are imputed, one-hot encoded, aligned to ``column_order``, split with
+    ``random_state=42`` and scaled; metrics are computed on the test part.
+
+    Args:
+        model_path: Path (or file object) accepted by ``joblib.load``.
+        df: DataFrame holding the feature and target columns.
+        target: Name of the target column.
+        features: Names of the feature columns.
+        test_size: Hold-out size, passed to ``train_test_split``.
+
+    Returns:
+        ``(metrics, problem_type)``; ``(None, None)`` when validation fails
+        (the error is reported through ``st.error``).
+    """
+    try:
+        # NOTE(security): joblib.load unpickles arbitrary objects and can
+        # execute code; only load model files from a trusted source.
+        loaded_data = joblib.load(model_path)
+        model = loaded_data['model']
+        scaler = loaded_data['scaler']
+        label_encoder = loaded_data['label_encoder']
+        imputer_numerical = loaded_data['imputer_numerical']
+        column_order = loaded_data['column_order']
+        problem_type = loaded_data['problem_type']
+        # Copy so the imputation below never writes into the caller's df.
+        X = df[features].copy()
+        y = df[target]
+        # Imputation (skipped when there is nothing numeric to impute)
+        numerical_features = X.select_dtypes(include=np.number).columns
+        if len(numerical_features) > 0:
+            X[numerical_features] = imputer_numerical.transform(X[numerical_features])
+        # Encoding
+        X = pd.get_dummies(X, columns=X.select_dtypes(exclude=np.number).columns, dummy_na=False)
+        # Align to the training layout: dummy columns for categories absent
+        # from this data are added as zeros, unseen ones are dropped.
+        X = X.reindex(columns=column_order, fill_value=0)
+        # Encode the target BEFORE splitting so y_test is in the same label
+        # space as the model's predictions.
+        is_classification = problem_type in ("Classification", "Multiclass")
+        if is_classification:
+            y = label_encoder.transform(y)
+        # Split the data; only the test part is evaluated
+        _, X_test, _, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=42
+        )
+        # Scaling with the scaler fitted at training time
+        X_test = scaler.transform(X_test)
+        y_pred = model.predict(X_test)
+        metrics = {}
+        if is_classification:
+            metrics['accuracy'] = accuracy_score(y_test, y_pred)
+            metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
+            metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
+        else:
+            metrics['mse'] = mean_squared_error(y_test, y_pred)
+            metrics['r2'] = r2_score(y_test, y_pred)
+        return metrics, problem_type
+    except Exception as e:
+        st.error(f"Validation failed: {str(e)}")
+        return None, None
 # --------------------------
 # Sidebar Navigation
 # --------------------------
 # Call the EDA function
 eda()
 # Streamlit App
 elif app_mode == "Model Training":
     st.title("🤖 Intelligent Model Training")