prthgo committed on
Commit
9c48475
·
1 Parent(s): 6335d24

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -1018
app.py DELETED
@@ -1,1018 +0,0 @@
1
- import streamlit as st
2
- import numpy as np
3
- import pandas as pd
4
- import io
5
- import matplotlib.pyplot as plt
6
- from matplotlib.ticker import PercentFormatter
7
- import seaborn as sns
8
- from sklearn.preprocessing import (
9
- OneHotEncoder,
10
- OrdinalEncoder,
11
- StandardScaler,
12
- MinMaxScaler,
13
- )
14
- from sklearn.model_selection import train_test_split
15
- from imblearn.under_sampling import RandomUnderSampler
16
- from imblearn.over_sampling import RandomOverSampler, SMOTE
17
- from sklearn.linear_model import Ridge, Lasso, LogisticRegression
18
- from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
19
- from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
20
- from sklearn.svm import SVR, SVC
21
- from sklearn.naive_bayes import MultinomialNB
22
- from xgboost import XGBRFRegressor, XGBRFClassifier
23
- from lightgbm import LGBMRegressor, LGBMClassifier
24
- from sklearn.metrics import (
25
- mean_absolute_error,
26
- mean_squared_error,
27
- mean_squared_error,
28
- r2_score,
29
- )
30
- from sklearn.metrics import (
31
- accuracy_score,
32
- f1_score,
33
- roc_auc_score,
34
- confusion_matrix,
35
- )
36
- import pickle
37
-
38
# Browser tab title and icon for the app.
st.set_page_config(page_title="Tabular Data Analysis and Auto ML", page_icon="🤖")

# Global seaborn theme used by every chart in this file.
sns.set_style("white")
sns.set_context("poster", font_scale=0.7)

# Shared colour list; the plotting helpers index or slice into it.
palette = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563",
    "#918450", "#f85e00", "#a41623", "#9a031e", "#d6d6d6",
    "#ffee32", "#ffd100", "#333533", "#202020",
]
57
-
58
-
59
def main():
    """Run the Streamlit app.

    Draws the sidebar (CSV uploader, "Process" button, page selector), stores
    the loaded DataFrame in ``st.session_state["data"]`` and renders the page
    selected in the sidebar. Nothing but the welcome banner is shown until a
    file has been processed.
    """
    pages = {
        "Basic EDA": _render_basic_eda,
        "Univariate Analysis": _render_univariate,
        "Bivariate Analysis": _render_bivariate,
        "Preprocess": _render_preprocess,
        "Training and Evaluation": _render_training,
    }

    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
    process = st.sidebar.button("Process")
    option = st.sidebar.radio("Select an Option: ", tuple(pages))

    placeholder = st.empty()
    placeholder.markdown(
        "<h1 style='text-align: center;'>Welcome to Tabular Data Analysis and Auto ML🤖</h1>",
        unsafe_allow_html=True,
    )

    if file is not None and process:
        st.session_state["data"] = load_csv(file)

    if "data" not in st.session_state:
        return

    placeholder.empty()
    pages[option](st.session_state["data"])


def _page_title(title):
    """Render a centred <h1> page heading."""
    st.markdown(
        f"<h1 style='text-align: center;'>{title}</h1>", unsafe_allow_html=True
    )


def _render_basic_eda(data):
    """Show shape, duplicates, head, dtypes, missing data, counts and describe()."""
    _page_title("Basic EDA")

    st.subheader("Data Overview")
    st.write(data_overview(data))
    st.write(duplicate(data))
    st.dataframe(data.head())

    st.subheader("Data Types and Unique Value Counts")
    display_data_info(data)

    st.subheader("Missing Data")
    missing_data(data)

    st.subheader("Value Counts")
    value_counts(data)

    st.subheader("Descriptive Statistics")
    st.write(data.describe().T)


def _render_univariate(data):
    """Let the user pick a single-column chart and the column to draw."""
    _page_title("Univariate Analysis")

    # chart label -> (needs numeric column?, drawing function)
    charts = {
        "Count Plot": (False, countplot),
        "Pie Chart": (False, piechart),
        "Histogram": (True, histogram),
        "Violin Plot": (True, violinplot),
        "Scatter Plot": (True, scatterplot),
    }
    plot = st.radio("Select a chart: ", tuple(charts))
    needs_numeric, draw = charts[plot]

    if needs_numeric:
        candidates = list(data.select_dtypes(include=["int", "float"]))
    else:
        candidates = list(data.select_dtypes("O"))

    # The leading "" entry keeps the selectbox empty until the user chooses.
    column = st.selectbox("Select a column", [""] + candidates)
    if column:
        draw(data, column)


def _render_bivariate(data):
    """Let the user pick a two-column chart (or a Pareto chart) and draw it."""
    _page_title("Bivariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
    )

    if plot == "Pareto Chart":
        column = st.selectbox(
            "Select a columns",
            [""] + list(data.select_dtypes(include="object")),
        )
        if column:
            paretoplot(data, column)
        return

    if plot == "Scatter Plot":
        # No "" placeholder here: in a multiselect it would be a selectable
        # (and invalid) column name.
        candidates = list(data.select_dtypes(include=["int", "float"]))
        draw = biscatterplot
    elif plot == "Bar Plot":
        candidates = list(data.columns)
        draw = bibarplot
    else:
        candidates = list(data.columns)
        draw = biboxplot

    columns = st.multiselect("Select two columns", candidates)
    if columns:
        draw(data, columns)


def _render_preprocess(data):
    """Run the chosen preprocessing step, persist the result, offer a CSV download."""
    _page_title("Data Preprocessing")

    operation = st.radio(
        "Select preprocessing step: ",
        (
            "Drop Columns",
            "Handling Missing Values",
            "Encode Categorical Features",
        ),
    )

    if operation == "Drop Columns":
        data = _drop_columns_step(data)
    elif operation == "Handling Missing Values":
        data = _missing_values_step(data)
    elif operation == "Encode Categorical Features":
        data = _encode_step(data)

    # Steps such as dropna()/get_dummies() return a new frame; store it so the
    # next rerun (and the training page) sees the preprocessed data.
    st.session_state["data"] = data

    st.download_button(
        label="Download Preprocessed Data",
        key="preprocessed_data",
        on_click=None,
        data=data.to_csv(index=False).encode(),
        file_name="preprocessed_data.csv",
        mime="text/csv",
    )


def _drop_columns_step(data):
    """Drop the user-selected columns when the button is pressed."""
    columns = st.multiselect("Select Columns to drop: ", (data.columns))
    if st.button("Drop Columns"):
        data = data.drop(columns=columns)
        st.success("Dropped selected columns✅✅✅")
    return data


def _missing_values_step(data):
    """Drop or impute missing values in numerical and categorical columns."""
    num_missing = st.selectbox(
        "Select a Approach (Numerical columns only): ",
        ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
    ).lower()
    cat_missing = st.selectbox(
        "Select a Approach (Categorical columns only): ",
        ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
    ).lower()

    if not st.button("Handle Missing Values"):
        return data

    if num_missing:
        num_cols = data.select_dtypes(include=["int64", "float64"]).columns
        numeric = data[num_cols]
        if num_missing == "drop":
            data = data.dropna(subset=num_cols)
        elif num_missing == "mean":
            data[num_cols] = numeric.fillna(numeric.mean())
        elif num_missing == "median":
            data[num_cols] = numeric.fillna(numeric.median())
        elif num_missing == "backward fill":
            data[num_cols] = numeric.bfill()
        elif num_missing == "forward fill":
            data[num_cols] = numeric.ffill()
        st.success(
            "Imputed missing values in numerical columns with selected approach."
        )

    if cat_missing:
        cat_cols = data.select_dtypes(exclude=["int", "float"]).columns
        if cat_missing == "drop":
            data = data.dropna(subset=cat_cols)
        elif cat_missing == "most frequent values":
            modes = data[cat_cols].mode()
            # mode() is empty when there are no such columns or no rows;
            # indexing row 0 would raise in that case.
            if not modes.empty:
                data[cat_cols] = data[cat_cols].fillna(modes.iloc[0])
        elif cat_missing == "replace with 'unknown'":
            data[cat_cols] = data[cat_cols].fillna("Unknown")
        st.success(
            "Imputed missing values in categorical columns with selected approach."
        )

    return data


def _encode_step(data):
    """Ordinal-encode the chosen columns and one-hot encode the other object columns."""
    oe_columns = st.multiselect(
        "Choose Columns for Ordinal Encoding",
        list(data.select_dtypes(include="object")),
    )
    st.info("Other columns will be One Hot Encoded.")

    if not st.button("Encode Columns"):
        return data

    bool_columns = data.select_dtypes(include=bool).columns
    data[bool_columns] = data[bool_columns].astype(int)

    if oe_columns:
        data[oe_columns] = OrdinalEncoder().fit_transform(
            data[oe_columns].astype("str")
        )

    remaining_cat_cols = [
        col for col in data.select_dtypes(include="object") if col not in oe_columns
    ]
    if remaining_cat_cols:
        data = pd.get_dummies(data, columns=remaining_cat_cols, drop_first=False)
        # get_dummies may emit boolean indicator columns; keep them as 0/1.
        bool_columns = data.select_dtypes(include=bool).columns
        data[bool_columns] = data[bool_columns].astype(int)

    st.success("Encoded categorical columns")
    return data


def _render_training(data):
    """Split the data 80/20, optionally balance and scale, then train and evaluate."""
    _page_title("Training and Evaluation")
    algo = st.selectbox("Choose Algorithm Type:", ("", "Regression", "Classification"))
    if not algo:
        return

    target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
    try:
        X = data.drop(target, axis=1)
        Y = data[target]
    except (KeyError, ValueError) as e:
        # Without a valid target there is nothing to split or train on.
        st.write(str(e))
        return

    st.write(
        "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )

    if algo == "Classification":
        X_train, y_train = _balance_training_set(data, target, X_train, y_train)

    X_train, X_test = _scale_features(X_train, X_test)

    if algo == "Regression":
        _train_regressor(X_train, X_test, y_train, y_test)
    else:
        _train_classifier(X_train, X_test, y_train, y_test)


def _balance_training_set(data, target, X_train, y_train):
    """Optionally resample the training split with the sampler the user picks."""
    balance = st.selectbox("Do you want to balance dataset?", ("", "Yes", "No"))
    if balance != "Yes":
        return X_train, y_train

    piechart(data, target)

    samplers = {
        "Random Under Sampling": RandomUnderSampler,
        "Random Over Sampling": RandomOverSampler,
        "SMOTE": SMOTE,
    }
    sample = st.selectbox("Which approach you want to use?", ("", *samplers))
    if sample in samplers:
        sampler = samplers[sample](random_state=42)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
    return X_train, y_train


def _scale_features(X_train, X_test):
    """Optionally scale features; the scaler is fitted on the training split only."""
    scalers = {"Standard Scaler": StandardScaler, "Min Max Scaler": MinMaxScaler}
    scale = st.selectbox(
        "Choose how do you want to scale features:", ("", *scalers)
    )
    if scale in scalers:
        scaler = scalers[scale]()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    return X_train, X_test


def _offer_model_download(model, file_name):
    """Offer the pickled model for download; on click also write it next to the app."""
    payload = pickle.dumps(model)
    clicked = st.download_button(
        label="Download Trained Model",
        key="trained_model",
        on_click=None,
        data=payload,
        file_name=file_name,
        mime="application/octet-stream",
    )
    if clicked:
        # Kept from the original behaviour: a server-side copy of the model.
        with open(file_name, "wb") as model_file:
            model_file.write(payload)


def _train_regressor(X_train, X_test, y_train, y_test):
    """Train the selected regressor and report MAE, MSE, RMSE and R²."""
    # label -> (estimator class, constructor kwargs, download file name)
    regressors = {
        "Ridge Regression": (Ridge, {"alpha": 1.0}, "ridge_regression_model.pkl"),
        "Decision Tree Regressor": (
            DecisionTreeRegressor,
            {"max_depth": 10},
            "decision_tree_regression_model.pkl",
        ),
        "Random Forest Regressor": (
            RandomForestRegressor,
            {"max_depth": 10, "n_estimators": 100},
            "random_forest_regression_model.pkl",
        ),
        "SVR": (SVR, {"C": 1.0, "epsilon": 0.2}, "svr_model.pkl"),
        "XGBRF Regressor": (
            XGBRFRegressor,
            {"reg_lambda": 1},
            "xgbrf_regression_model.pkl",
        ),
        "LGBM Regressor": (
            LGBMRegressor,
            {"reg_lambda": 1},
            "lgbm_regression_model.pkl",
        ),
    }
    model = st.selectbox(
        "Choose Regression Model for training: ", ("", *regressors)
    )
    if model not in regressors:
        return

    estimator_cls, params, file_name = regressors[model]
    reg = estimator_cls(**params)
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred); R² is not symmetric, so the
    # order matters.
    mse = mean_squared_error(y_test, pred)
    st.write(
        "Mean Absolute Error (MAE): {:.4f}".format(mean_absolute_error(y_test, pred))
    )
    st.write("Mean Squared Error (MSE): {:.4f}".format(mse))
    # sqrt(MSE) instead of the `squared=False` flag, which newer scikit-learn
    # releases no longer accept.
    st.write("Root Mean Squared Error (RMSE): {:.4f}".format(np.sqrt(mse)))
    st.write("R-squared (R²): {:.4f}".format(r2_score(y_test, pred)))

    _offer_model_download(reg, file_name)


def _train_classifier(X_train, X_test, y_train, y_test):
    """Train the selected classifier and report accuracy, F1 and a confusion matrix."""
    # label -> (estimator class, constructor kwargs, plot title, file name)
    classifiers = {
        "Logistic Regression": (
            LogisticRegression,
            {"penalty": "l2"},
            "Logistic Regression Confusion Matrix ",
            "logistic_regression_model.pkl",
        ),
        "Decision Tree Classifier": (
            DecisionTreeClassifier,
            {"max_depth": 5},
            "DecisionTree Classifier Confusion Matrix ",
            "decision_tree_classifier_model.pkl",
        ),
        "Random Forest Classifier": (
            RandomForestClassifier,
            {"n_estimators": 100, "max_depth": 5},
            "RandomForest Classifier Confusion Matrix ",
            "random_forest_classifier_model.pkl",
        ),
        "SVC": (SVC, {"C": 1.5}, "SVC Confusion Matrix ", "svc_model.pkl"),
        "XGBRF Classifier": (
            XGBRFClassifier,
            {"reg_lambda": 1.0},
            "XGBRF Classifier Confusion Matrix ",
            "xgbrf_classifier_model.pkl",
        ),
        "LGBM Classifier": (
            LGBMClassifier,
            {"reg_lambda": 1.0},
            "LGBM Classifier Confusion Matrix ",
            "lgbm_classifier_model.pkl",
        ),
    }
    model = st.selectbox(
        "Choose Classification Model for training: ", ("", *classifiers)
    )
    if model not in classifiers:
        return

    estimator_cls, params, matrix_title, file_name = classifiers[model]
    clf = estimator_cls(**params)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    st.write("Accuracy Score: {:.4f}".format(accuracy_score(y_test, pred)))
    try:
        st.write("F1 Score: {:.4f}".format(f1_score(y_test, pred)))
    except ValueError:
        # Default binary averaging is rejected for multiclass targets.
        st.write(
            "Macro F1 Score: {:.4f}".format(f1_score(y_test, pred, average="macro"))
        )

    # plot_confusion_matrix expects (y_true, y_pred, title).
    plot_confusion_matrix(y_test, pred, matrix_title)

    _offer_model_download(clf, file_name)
815
-
816
-
817
def load_csv(file):
    """Parse *file* (a path or file-like object) as CSV and return the DataFrame."""
    return pd.read_csv(file)
820
-
821
-
822
def data_overview(data):
    """Write the row count to the page and return the column-count line.

    The caller is expected to display the returned string itself.
    """
    row_count, column_count = data.shape
    st.write(f"Number of Rows: {row_count}")
    return f"Number of Columns: {column_count}"
826
-
827
-
828
def missing_data(data):
    """Render a table of columns with missing values.

    For every column that has at least one NaN the table lists the count and
    the share of rows (two decimals, suffixed with "%"). When no column has
    missing values a short message is shown instead of an empty table.

    Returns whatever ``st.markdown`` returns.
    """
    missing_values = data.isna().sum()
    missing_values = missing_values[missing_values > 0]

    if missing_values.empty:
        # An empty, header-only table tells the user nothing.
        return st.markdown("There are no missing values in the DataFrame.")

    missing_value_per = (missing_values / data.shape[0]) * 100
    missing_value_per = missing_value_per.round(2).astype(str) + "%"
    missing_df = pd.DataFrame(
        {"Missing Values": missing_values, "Percentage": missing_value_per}
    )
    missing_df_html = missing_df.to_html(
        classes="table table-striped", justify="center"
    )
    return st.markdown(missing_df_html, unsafe_allow_html=True)
840
-
841
-
842
def display_data_info(data):
    """Render an HTML table of each column's dtype and number of unique values."""
    type_table = data.dtypes.to_frame("Data Type").reset_index()
    type_table.columns = ["Column", "Data Type"]

    unique_table = data.nunique().to_frame("Unique Counts").reset_index()
    unique_table.columns = ["Column", "Unique Counts"]

    html = pd.merge(type_table, unique_table, on="Column").to_html(
        classes="table table-striped", justify="center"
    )
    return st.markdown(html, unsafe_allow_html=True)
854
-
855
-
856
def value_counts(data):
    """Show a column picker and, once a column is chosen, its value counts."""
    # The leading "" entry keeps the picker empty until the user chooses.
    choices = [""] + list(data.columns)
    column = st.selectbox("Select a Column", choices)
    if not column:
        return
    st.write(data[column].value_counts())
860
-
861
-
862
def duplicate(data):
    """Report duplicate rows and drop them from *data* in place.

    When duplicates exist, their count is written to the page, every repeat
    after the first occurrence is removed from *data* itself, and an empty
    string is returned. Otherwise a "no duplicates" message is returned for
    the caller to display.
    """
    # Build the duplicate mask once; it is a full pass over the frame.
    duplicate_count = int(data.duplicated().sum())
    if duplicate_count == 0:
        return "There are no duplicate rows in the DataFrame."

    st.write(
        f"There is/are {duplicate_count} duplicate rows in the DataFrame. Duplicated values will be dropped."
    )
    data.drop_duplicates(keep="first", inplace=True)
    return ""
872
-
873
-
874
def countplot(data, col):
    """Draw a horizontal count plot of ``data[col]`` on the page."""
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.countplot(
            y=data[col],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Countplot of {col} Column")
        st.pyplot(fig)
    finally:
        # Close the figure so open figures do not pile up across reruns.
        plt.close(fig)
879
-
880
-
881
def piechart(data, col):
    """Draw a pie chart of the value counts of ``data[col]`` on the page."""
    counts = data[col].value_counts()
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.pie(
            counts,
            labels=counts.index,
            autopct="%1.1f%%",
            colors=palette,
            shadow=False,
            wedgeprops=dict(edgecolor="#1c1c1c"),
        )
        ax.set_title(f"Pie Chart of {col} Column")
        st.pyplot(fig)
    finally:
        # Close the figure so open figures do not pile up across reruns.
        plt.close(fig)
894
-
895
-
896
def histogram(data, col):
    """Draw a histogram with a KDE curve for ``data[col]`` on the page."""
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(
            data[col],
            kde=True,
            color=palette[4],
            fill=True,
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Histogram of {col} Column")
        st.pyplot(fig)
    finally:
        # Close the figure so open figures do not pile up across reruns.
        plt.close(fig)
908
-
909
-
910
def violinplot(data, col):
    """Draw a violin plot of ``data[col]`` on the page."""
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.violinplot(data[col], color=palette[8], ax=ax)
        ax.set_title(f"Violin Plot of {col} Column")
        st.pyplot(fig)
    finally:
        # Close the figure so open figures do not pile up across reruns.
        plt.close(fig)
915
-
916
-
917
def scatterplot(data, col):
    """Draw a scatter plot of the single column ``data[col]`` on the page."""
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.scatterplot(data[col], color=palette[3], ax=ax)
        ax.set_title(f"Scatter Plot of {col} Column")
        st.pyplot(fig)
    finally:
        # Close the figure so open figures do not pile up across reruns.
        plt.close(fig)
922
-
923
-
924
def biscatterplot(data, cols):
    """Draw a scatter plot of ``cols[0]`` (x) against ``cols[1]`` (y).

    *cols* is the list of column names picked by the user; only the first two
    are used. With fewer than two a prompt is shown instead of a plot. Any
    plotting error is written to the page rather than raised.
    """
    if len(cols) < 2:
        st.info("Select two columns to draw the scatter plot.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.scatterplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Scatter Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        # UI boundary: show the problem on the page instead of crashing.
        st.write(str(e))
    finally:
        # Close the figure so open figures do not pile up across reruns.
        plt.close(fig)
939
-
940
-
941
def bibarplot(data, cols):
    """Draw a bar plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    *cols* is the list of column names picked by the user; only the first two
    are used. With fewer than two a prompt is shown instead of a plot. Any
    plotting error is written to the page rather than raised.
    """
    if len(cols) < 2:
        st.info("Select two columns to draw the bar plot.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.barplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Bar Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        # UI boundary: show the problem on the page instead of crashing.
        st.write(str(e))
    finally:
        # Close the figure so open figures do not pile up across reruns.
        plt.close(fig)
956
-
957
-
958
def biboxplot(data, cols):
    """Draw a box plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    *cols* is the list of column names picked by the user; only the first two
    are used. With fewer than two a prompt is shown instead of a plot. Any
    plotting error is written to the page rather than raised.
    """
    if len(cols) < 2:
        st.info("Select two columns to draw the box plot.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.boxplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Box Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        # UI boundary: show the problem on the page instead of crashing.
        st.write(str(e))
    finally:
        # Close the figure so open figures do not pile up across reruns.
        plt.close(fig)
966
-
967
-
968
def paretoplot(data, categorical_col):
    """Draw a Pareto chart for ``data[categorical_col]``.

    Bars show each category's frequency in descending order; a second y-axis
    carries the cumulative percentage line. Any error is written to the page
    rather than raised.
    """
    fig = None
    try:
        # value_counts() already returns categories sorted by descending
        # frequency, so the cumulative share lines up with the bars without
        # re-sorting (a re-sort could reorder tied categories).
        counts = data[categorical_col].value_counts()
        cumulative_percentage = (counts / counts.sum()).cumsum()
        pareto_df = pd.DataFrame(
            {
                "Categories": counts.index,
                "Frequency": counts.values,
                "Cumulative Percentage": cumulative_percentage.values * 100,
            }
        )

        fig, ax1 = plt.subplots(figsize=(10, 8))
        ax1.bar(
            pareto_df["Categories"],
            pareto_df["Frequency"],
            color=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        ax2 = ax1.twinx()
        ax2.yaxis.set_major_formatter(PercentFormatter())
        ax2.plot(
            pareto_df["Categories"],
            pareto_df["Cumulative Percentage"],
            color=palette[3],
            marker="D",
            ms=10,
        )
        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel("Frequency", color=palette[0])
        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
        st.pyplot(fig)
    except Exception as e:
        # Report the failure like the other plot helpers instead of hiding it.
        st.write(str(e))
    finally:
        if fig is not None:
            # Close the figure so open figures do not pile up across reruns.
            plt.close(fig)
1005
-
1006
-
1007
def plot_confusion_matrix(y_true, y_pred, title):
    """Draw the confusion matrix of *y_true* vs *y_pred* as an annotated heatmap.

    Rows are labelled "True Label" and columns "Predicted Label", so callers
    must pass the ground truth first and the predictions second.
    """
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        ax.set_title(title)
        st.pyplot(fig)
    finally:
        # Close the figure so open figures do not pile up across reruns.
        plt.close(fig)
1015
-
1016
-
1017
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()