CosmickVisions committed on
Commit
6699046
·
verified ·
1 Parent(s): 093b3f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +233 -232
app.py CHANGED
@@ -176,6 +176,239 @@ def generate_quality_report(df):
176
  report['columns'][col] = col_report
177
  return report
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  # --------------------------
180
  # Sidebar Navigation
181
  # --------------------------
@@ -548,238 +781,6 @@ def eda():
548
  # Call the EDA function
549
  eda()
550
 
551
- # Function to train the model (Separated for clarity and reusability)
552
- def train_model(df, target, features, problem_type, test_size, model_type, model_params, use_grid_search=False):
553
- """Trains a model with hyperparameter tuning, cross-validation, and customizable model architecture."""
554
-
555
- try:
556
- X = df[features]
557
- y = df[target]
558
-
559
- # Input Validation
560
- if target not in df.columns:
561
- raise ValueError(f"Target variable '{target}' not found in DataFrame.")
562
- for feature in features:
563
- if feature not in df.columns:
564
- raise ValueError(f"Feature '{feature}' not found in DataFrame.")
565
-
566
- # Preprocessing Pipeline: Handles missing values, encoding, scaling
567
- # Imputation: Handle missing values BEFORE encoding (numerical only for SimpleImputer)
568
- numerical_features = X.select_dtypes(include=np.number).columns
569
- categorical_features = X.select_dtypes(exclude=np.number).columns
570
-
571
- imputer_numerical = SimpleImputer(strategy='mean') # Or 'median', 'most_frequent', 'constant'
572
- X[numerical_features] = imputer_numerical.fit_transform(X[numerical_features])
573
-
574
- # Encoding (One-Hot Encode Categorical Features)
575
- X = pd.get_dummies(X, columns=categorical_features, dummy_na=False) # dummy_na = False. We imputed already.
576
-
577
- # Target Encoding (if classification)
578
- label_encoder = None #Initialize label_encoder
579
- if problem_type == "Classification" or problem_type == "Multiclass":
580
- label_encoder = LabelEncoder()
581
- y = label_encoder.fit_transform(y)
582
-
583
-
584
- # Split the data
585
- X_train, X_test, y_train, y_test = train_test_split(
586
- X, y, test_size=test_size, random_state=42
587
- )
588
-
589
- # Scaling (AFTER splitting!)
590
- scaler = StandardScaler() # Or try MinMaxScaler, RobustScaler, QuantileTransformer
591
- X_train = scaler.fit_transform(X_train) #Fit to the training data ONLY
592
- X_test = scaler.transform(X_test) #Transform the test data using the fitted scaler
593
-
594
- # Model Selection and Hyperparameter Tuning
595
- if problem_type == "Regression":
596
- if model_type == "Random Forest":
597
- model = RandomForestRegressor(random_state=42)
598
- param_grid = {
599
- 'n_estimators': [100, 200],
600
- 'max_depth': [None, 5, 10],
601
- 'min_samples_split': [2, 5]
602
- }
603
- elif model_type == "Gradient Boosting":
604
- model = GradientBoostingRegressor(random_state=42)
605
- param_grid = {
606
- 'n_estimators': [100, 200],
607
- 'learning_rate': [0.01, 0.1],
608
- 'max_depth': [3, 5]
609
- }
610
- elif model_type == "Neural Network":
611
- model = MLPRegressor(random_state=42, max_iter=500) #set max_iter to 500
612
- param_grid = {
613
- 'hidden_layer_sizes': [(50,), (100,), (50, 50)], #example sizes for depth
614
- 'activation': ['relu', 'tanh'],
615
- 'alpha': [0.0001, 0.001]
616
- }
617
- else:
618
- raise ValueError(f"Invalid model type: {model_type}")
619
-
620
-
621
- elif problem_type == "Classification": #Binary
622
- if model_type == "Random Forest":
623
- model = RandomForestClassifier(random_state=42)
624
- param_grid = {
625
- 'n_estimators': [100, 200],
626
- 'max_depth': [None, 5, 10],
627
- 'min_samples_split': [2, 5]
628
- }
629
- elif model_type == "Gradient Boosting":
630
- model = GradientBoostingClassifier(random_state=42)
631
- param_grid = {
632
- 'n_estimators': [100, 200],
633
- 'learning_rate': [0.01, 0.1],
634
- 'max_depth': [3, 5]
635
- }
636
- elif model_type == "Neural Network":
637
- model = MLPClassifier(random_state=42, max_iter=500) #set max_iter to 500
638
- param_grid = {
639
- 'hidden_layer_sizes': [(50,), (100,), (50, 50)], #example sizes for depth
640
- 'activation': ['relu', 'tanh'],
641
- 'alpha': [0.0001, 0.001]
642
- }
643
-
644
- else:
645
- raise ValueError(f"Invalid model type: {model_type}")
646
- elif problem_type == "Multiclass": #Multiclass
647
-
648
- if model_type == "Logistic Regression":
649
- model = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr') # 'ovr' for one-vs-rest
650
- param_grid = {'C': [0.1, 1.0, 10.0]} # Regularization parameter
651
-
652
- elif model_type == "Support Vector Machine":
653
- model = SVC(random_state=42, probability=True) # probability=True for probabilities
654
- param_grid = {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']}
655
-
656
- elif model_type == "Random Forest":
657
- model = RandomForestClassifier(random_state=42)
658
- param_grid = {
659
- 'n_estimators': [100, 200],
660
- 'max_depth': [None, 5, 10],
661
- 'min_samples_split': [2, 5],
662
- 'criterion': ['gini', 'entropy'] #criterion for decision
663
- }
664
-
665
- else:
666
- raise ValueError(f"Invalid model type: {model_type} for Multiclass")
667
- else:
668
- raise ValueError(f"Invalid problem type: {problem_type}")
669
-
670
- # Update param_grid with user-defined parameters
671
- param_grid.update(model_params) #This is key to use the model_params provided by user
672
-
673
- if use_grid_search:
674
- grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error', verbose=1, n_jobs=-1)
675
- grid_search.fit(X_train, y_train)
676
- model = grid_search.best_estimator_ # Use the best model found
677
- st.write("Best hyperparameters found by Grid Search:", grid_search.best_params_) #Print best parameters
678
-
679
- else:
680
- model.fit(X_train, y_train)
681
-
682
- # Cross-Validation (after hyperparameter tuning, if applicable)
683
- cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error')
684
- st.write("Cross-validation scores:", cv_scores)
685
- st.write("Mean cross-validation score:", cv_scores.mean())
686
-
687
- # Evaluation
688
- y_pred = model.predict(X_test)
689
- metrics = {} #Store metrics in a dictionary
690
-
691
- if problem_type == "Classification":
692
- metrics['accuracy'] = accuracy_score(y_test, y_pred)
693
- metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
694
- metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True) #Get report as dictionary
695
-
696
- elif problem_type == "Multiclass":
697
-
698
- metrics['accuracy'] = accuracy_score(y_test, y_pred)
699
- metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
700
- metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True) #Get report as dictionary
701
- else:
702
- metrics['mse'] = mean_squared_error(y_test, y_pred)
703
- metrics['r2'] = r2_score(y_test, y_pred)
704
-
705
- # Feature Importance (Permutation Importance for potentially better handling of correlated features)
706
- try:
707
- result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42) #Permutation Feature Importance
708
- importance = result.importances_mean
709
-
710
- except Exception as e:
711
- st.warning(f"Could not calculate feature importance: {e}")
712
- importance = None
713
-
714
- # Store the column order for prediction purposes
715
- column_order = X.columns
716
-
717
- return model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance
718
-
719
- except Exception as e:
720
- st.error(f"Training failed: {str(e)}")
721
- return None, None, None, None, None, None, None
722
- # Model Validation Function
723
- def validate_model(model_path, df, target, features, test_size):
724
- """Loads a model, preprocesses data, and evaluates the model on a validation set."""
725
- try:
726
- loaded_data = joblib.load(model_path)
727
- model = loaded_data['model']
728
- scaler = loaded_data['scaler']
729
- label_encoder = loaded_data['label_encoder']
730
- imputer_numerical = loaded_data['imputer_numerical']
731
- column_order = loaded_data['column_order']
732
- problem_type = loaded_data['problem_type']
733
-
734
- X = df[features]
735
- y = df[target]
736
-
737
- # Imputation
738
- numerical_features = X.select_dtypes(include=np.number).columns
739
- X[numerical_features] = imputer_numerical.transform(X[numerical_features])
740
-
741
- # Encoding
742
- X = pd.get_dummies(X, columns=X.select_dtypes(exclude=np.number).columns, dummy_na=False)
743
-
744
- # Ensure correct column order
745
- X = X[column_order] #Reorder the columns
746
-
747
- # Split the data
748
- X_train, X_test, y_train, y_test = train_test_split(
749
- X, y, test_size=test_size, random_state=42
750
- )
751
-
752
- # Scaling
753
- X_train = scaler.transform(X_train)
754
- X_test = scaler.transform(X_test)
755
-
756
- # Target Encoding (if classification) - Use the same encoder used during training
757
- if problem_type == "Classification" or problem_type == "Multiclass":
758
- y = label_encoder.transform(y)
759
-
760
- y_pred = model.predict(X_test)
761
-
762
- metrics = {}
763
- if problem_type == "Classification":
764
- metrics['accuracy'] = accuracy_score(y_test, y_pred)
765
- metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
766
- metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
767
-
768
- elif problem_type == "Multiclass":
769
-
770
- metrics['accuracy'] = accuracy_score(y_test, y_pred)
771
- metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
772
- metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
773
- else:
774
- metrics['mse'] = mean_squared_error(y_test, y_pred)
775
- metrics['r2'] = r2_score(y_test, y_pred)
776
-
777
- return metrics, problem_type
778
-
779
- except Exception as e:
780
- st.error(f"Validation failed: {str(e)}")
781
- return None, None
782
-
783
  # Streamlit App
784
  elif app_mode == "Model Training":
785
  st.title("🤖 Intelligent Model Training")
 
176
  report['columns'][col] = col_report
177
  return report
178
 
# Function to train the model (Separated for clarity and reusability)
def _build_model_and_grid(problem_type, model_type):
    """Return ``(estimator, default_param_grid)`` for a problem/model pair.

    Raises:
        ValueError: if ``problem_type`` or ``model_type`` is not supported.
    """
    forest_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5],
    }
    boosting_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
    }
    mlp_grid = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001],
    }

    if problem_type == "Regression":
        if model_type == "Random Forest":
            return RandomForestRegressor(random_state=42), forest_grid
        if model_type == "Gradient Boosting":
            return GradientBoostingRegressor(random_state=42), boosting_grid
        if model_type == "Neural Network":
            return MLPRegressor(random_state=42, max_iter=500), mlp_grid
        raise ValueError(f"Invalid model type: {model_type}")

    if problem_type == "Classification":  # Binary
        if model_type == "Random Forest":
            return RandomForestClassifier(random_state=42), forest_grid
        if model_type == "Gradient Boosting":
            return GradientBoostingClassifier(random_state=42), boosting_grid
        if model_type == "Neural Network":
            return MLPClassifier(random_state=42, max_iter=500), mlp_grid
        raise ValueError(f"Invalid model type: {model_type}")

    if problem_type == "Multiclass":
        if model_type == "Logistic Regression":
            # 'ovr' = one-vs-rest
            model = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr')
            return model, {'C': [0.1, 1.0, 10.0]}
        if model_type == "Support Vector Machine":
            # probability=True so the fitted model can return class probabilities
            model = SVC(random_state=42, probability=True)
            return model, {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']}
        if model_type == "Random Forest":
            grid = dict(forest_grid, criterion=['gini', 'entropy'])
            return RandomForestClassifier(random_state=42), grid
        raise ValueError(f"Invalid model type: {model_type} for Multiclass")

    raise ValueError(f"Invalid problem type: {problem_type}")


def _apply_user_params(model, param_grid, model_params, use_grid_search):
    """Fold user-supplied hyperparameters into the search grid or the estimator.

    With grid search, each user entry overrides the default grid entry; a
    single value is wrapped in a list because GridSearchCV only accepts
    sequences of candidates. Without grid search, single values are set
    directly on the estimator (lists of candidates cannot be applied to one
    fit and are left unused). Names the estimator does not accept are
    reported with a warning and skipped.
    """
    accepted = model.get_params()
    for name, value in (model_params or {}).items():
        if name not in accepted:
            st.warning(f"Ignoring unsupported parameter '{name}' for {type(model).__name__}.")
            continue
        # A tuple is one value for hidden_layer_sizes, a candidate list otherwise.
        is_candidates = isinstance(value, (list, np.ndarray)) or (
            isinstance(value, tuple) and name != 'hidden_layer_sizes'
        )
        if use_grid_search:
            param_grid[name] = value if is_candidates else [value]
        elif not is_candidates:
            model.set_params(**{name: value})
    return param_grid


def train_model(df, target, features, problem_type, test_size, model_type, model_params, use_grid_search=False):
    """Train a model with optional grid search, cross-validation and evaluation.

    Args:
        df: DataFrame holding the feature and target columns.
        target: Name of the target column.
        features: List of feature column names.
        problem_type: "Regression", "Classification" or "Multiclass".
        test_size: Hold-out fraction passed to ``train_test_split``.
        model_type: Model name valid for the chosen ``problem_type``.
        model_params: Dict of user hyperparameters (may be empty or None).
        use_grid_search: When True, tune with ``GridSearchCV`` (3 folds).

    Returns:
        ``(model, scaler, label_encoder, imputer_numerical, metrics,
        column_order, importance)``. ``label_encoder`` is None for
        regression and ``importance`` is None when permutation importance
        fails. On any error the failure is shown with ``st.error`` and a
        tuple of seven ``None`` values is returned.
    """
    try:
        # Validate BEFORE indexing; otherwise df[features] raises a bare
        # KeyError and these specific messages can never be reached.
        if target not in df.columns:
            raise ValueError(f"Target variable '{target}' not found in DataFrame.")
        for feature in features:
            if feature not in df.columns:
                raise ValueError(f"Feature '{feature}' not found in DataFrame.")

        # Work on a copy so imputation never writes into a slice of df.
        X = df[features].copy()
        y = df[target]

        numerical_features = X.select_dtypes(include=np.number).columns
        categorical_features = X.select_dtypes(exclude=np.number).columns

        # Mean-impute numeric columns. SimpleImputer rejects a frame with
        # zero columns, so it stays unfitted when there is nothing numeric.
        imputer_numerical = SimpleImputer(strategy='mean')
        if len(numerical_features) > 0:
            X[numerical_features] = imputer_numerical.fit_transform(X[numerical_features])

        # One-hot encode categoricals. Missing categories are not imputed;
        # with dummy_na=False they become all-zero indicator rows.
        X = pd.get_dummies(X, columns=categorical_features, dummy_na=False)

        is_classification = problem_type in ("Classification", "Multiclass")
        scoring = 'accuracy' if is_classification else 'neg_mean_squared_error'

        label_encoder = None
        if is_classification:
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Fit the scaler on the training split only to avoid leakage.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model, param_grid = _build_model_and_grid(problem_type, model_type)
        param_grid = _apply_user_params(model, param_grid, model_params, use_grid_search)

        if use_grid_search:
            grid_search = GridSearchCV(model, param_grid, cv=3, scoring=scoring, verbose=1, n_jobs=-1)
            grid_search.fit(X_train, y_train)
            model = grid_search.best_estimator_
            st.write("Best hyperparameters found by Grid Search:", grid_search.best_params_)
        else:
            model.fit(X_train, y_train)

        # Cross-validate the chosen configuration on the training split.
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring=scoring)
        st.write("Cross-validation scores:", cv_scores)
        st.write("Mean cross-validation score:", cv_scores.mean())

        # Evaluate on the held-out split.
        y_pred = model.predict(X_test)
        metrics = {}
        if is_classification:
            metrics['accuracy'] = accuracy_score(y_test, y_pred)
            metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
            metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
        else:
            metrics['mse'] = mean_squared_error(y_test, y_pred)
            metrics['r2'] = r2_score(y_test, y_pred)

        # Permutation importance is best-effort: a failure only warns.
        try:
            result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
            importance = result.importances_mean
        except Exception as e:
            st.warning(f"Could not calculate feature importance: {e}")
            importance = None

        # Encoded column order, needed to align data at prediction time.
        column_order = X.columns

        return model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance

    except Exception as e:
        st.error(f"Training failed: {str(e)}")
        return None, None, None, None, None, None, None
350
+
# Model Validation Function
def validate_model(model_path, df, target, features, test_size):
    """Load a saved model bundle and evaluate it on a hold-out split of ``df``.

    The bundle at ``model_path`` is read with ``joblib.load`` and must hold
    the keys 'model', 'scaler', 'label_encoder', 'imputer_numerical',
    'column_order' and 'problem_type'.

    Args:
        model_path: Path of the saved bundle.
        df: DataFrame holding the feature and target columns.
        target: Name of the target column.
        features: List of feature column names.
        test_size: Hold-out fraction passed to ``train_test_split``.

    Returns:
        ``(metrics, problem_type)``. ``metrics`` has 'accuracy',
        'confusion_matrix' and 'classification_report' for classification
        problems, or 'mse' and 'r2' otherwise. On any error the failure is
        shown with ``st.error`` and ``(None, None)`` is returned.
    """
    try:
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load bundles from a trusted source.
        loaded_data = joblib.load(model_path)
        model = loaded_data['model']
        scaler = loaded_data['scaler']
        label_encoder = loaded_data['label_encoder']
        imputer_numerical = loaded_data['imputer_numerical']
        column_order = loaded_data['column_order']
        problem_type = loaded_data['problem_type']

        # Work on a copy so imputation never writes into a slice of df.
        X = df[features].copy()
        y = df[target]

        # Impute numeric columns with the imputer from the bundle.
        numerical_features = X.select_dtypes(include=np.number).columns
        if len(numerical_features) > 0:
            X[numerical_features] = imputer_numerical.transform(X[numerical_features])

        # One-hot encode the non-numeric columns.
        categorical_features = X.select_dtypes(exclude=np.number).columns
        X = pd.get_dummies(X, columns=categorical_features, dummy_na=False)

        # Align to the saved column layout. reindex adds any indicator
        # column missing from this data as zeros and drops unknown ones,
        # where plain X[column_order] would raise a KeyError.
        X = X.reindex(columns=column_order, fill_value=0)

        # Encode the target BEFORE splitting so y_test uses the same integer
        # labels the model predicts; encoding afterwards left y_test raw.
        is_classification = problem_type in ("Classification", "Multiclass")
        if is_classification:
            y = label_encoder.transform(y)

        # Only the hold-out part is evaluated; the training part is unused.
        _, X_test, _, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        X_test = scaler.transform(X_test)
        y_pred = model.predict(X_test)

        metrics = {}
        if is_classification:
            metrics['accuracy'] = accuracy_score(y_test, y_pred)
            metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
            metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
        else:
            metrics['mse'] = mean_squared_error(y_test, y_pred)
            metrics['r2'] = r2_score(y_test, y_pred)

        return metrics, problem_type

    except Exception as e:
        st.error(f"Validation failed: {str(e)}")
        return None, None
411
+
412
  # --------------------------
413
  # Sidebar Navigation
414
  # --------------------------
 
781
  # Call the EDA function
782
  eda()
783
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
  # Streamlit App
785
  elif app_mode == "Model Training":
786
  st.title("🤖 Intelligent Model Training")