CosmickVisions committed on
Commit
961a3b2
·
verified ·
1 Parent(s): 6299132

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +323 -282
app.py CHANGED
@@ -22,6 +22,8 @@ from io import BytesIO
22
  import base64
23
  import time
24
  from sklearn.cluster import KMeans
 
 
25
 
26
  # Configurations
27
  st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
@@ -243,209 +245,239 @@ if app_mode == "Data Upload":
243
  show_loader("Generating EDA Report")
244
  pr = generate_profile(df)
245
  st_profile_report(pr)
 
246
  elif app_mode == "Smart Cleaning":
247
- st.title("🧼 Intelligent Data Cleaning")
248
-
249
- if st.session_state.raw_data is not None:
250
- df = st.session_state.cleaned_data
251
-
252
- # Initialize history if not exists
253
- if 'data_history' not in st.session_state:
254
- st.session_state.data_history = [df.copy()]
255
-
256
- # Cleaning Toolkit
257
- col1, col2 = st.columns([1, 3])
258
- with col1:
259
- st.subheader("Cleaning Actions")
260
-
261
- # Add Reset and Undo buttons
262
- col1a, col1b = st.columns(2)
263
- with col1a:
264
- if st.button("Reset to Original", help="Revert all changes to the uploaded data."):
265
- st.session_state.cleaned_data = st.session_state.raw_data.copy()
266
- st.session_state.data_history = [st.session_state.raw_data.copy()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  st.experimental_rerun()
268
- with col1b:
269
- if len(st.session_state.data_history) > 1:
270
- if st.button("Undo Last Action", help="Revert to the previous state."):
271
- st.session_state.data_history.pop()
272
- st.session_state.cleaned_data = st.session_state.data_history[-1].copy()
273
- st.experimental_rerun()
274
-
275
- clean_action = st.selectbox("Choose Operation", [
276
- "Handle Missing Values",
277
- "Remove Duplicates",
278
- "Remove Column",
279
- "Normalize Data",
280
- "Encode Categories",
281
- "Outlier Removal",
282
- "Auto Clean",
283
- "Neural Network Prep"
284
- ], help="Select the data cleaning operation to perform.")
285
-
286
- # Initialize Auto Clean Variables
287
- auto_missing = False
288
- auto_normalize = False
289
- auto_encode = False
290
- missing_strategy_num = "Median"
291
- missing_strategy_cat = "Most Frequent"
292
-
293
- if clean_action == "Handle Missing Values": #Corrected indentation
294
- st.markdown("**Configure how missing values will be handled.**", unsafe_allow_html=True)
295
- all_impute_cols = ["All Columns"] + df.columns.tolist()
296
- impute_cols = st.multiselect("Columns to Impute", all_impute_cols, default=["All Columns"], help="Select the columns with missing values to impute. Choose 'All Columns' to apply to all columns with missing values.")
297
- if "All Columns" in impute_cols:
298
- impute_cols = df.columns.tolist()
299
-
300
- method = st.selectbox("Imputation Method", [
301
- "KNN Imputation",
302
- "Median Fill",
303
- "Mean Fill",
304
- "Drop Missing"
305
- ], help="Choose the method to use for imputing missing values.")
306
- elif clean_action == "Neural Network Prep":
307
- st.markdown("**Neural Network Specific Preparation**", unsafe_allow_html=True)
308
-
309
- # Make dynamic to check if the models can allow it
310
- validModels=["RNN", "CNN"]
311
 
312
- model_Choice_text = st.radio("What's a use case for Models?",
313
- options= validModels)
314
-
315
- # display a string or some other feedback
316
- st.info('Select a machine learning task below!')
317
-
318
- ## to check which text based mode
319
-
320
- validColumnNumerical_cols = df.select_dtypes(include=['int','float']).columns.tolist()
321
- numcol_cols = st.multiselect("Text use Colimns: or sequence for model usage :D, to generate the code - to understand how each one plays out D", options =validColumnNumerical_cols )
322
-
323
- #### Make different selections here now
324
- ####
325
- st.code('Code example is generated.')
326
- """ Make each configuration do an function or callback" just one press and more to learn""" ### Show code. You do need check what variables and show output that goes on
327
- #### then, what did output happen if you pick A or B variable selection.
328
-
329
- seq_length = st.number_input("Sequence Length (for RNN)", 10, 100, 30, help =" Length to do that. make them more power ")
330
-
331
- method = st.selectbox("Imputation Method", ["KNN Imputation", "Median Fill", "Mean Fill", "Drop Missing"])
332
- if method == "KNN Imputation":
333
- knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5, help="Number of neighbors for KNN Imputation.") #Parameter
334
-
335
- elif clean_action == "Normalize Data":
336
- st.markdown("**Choose a scaling method and columns to normalize.**")
337
- scaler_type = st.selectbox("Scaler Type", ["RobustScaler", "StandardScaler"], help="Select the type of scaler to use.")
338
- all_normalize_cols = ["All Numerical"] + df.select_dtypes(include=np.number).columns.tolist()
339
- normalize_cols = st.multiselect("Columns to Normalize", all_normalize_cols, default=["All Numerical"], help="Select the numerical columns to normalize. Choose 'All Numerical' to apply to all numerical columns.")
340
- if "All Numerical" in normalize_cols:
341
- normalize_cols = df.select_dtypes(include=np.number).columns.tolist()
342
-
343
- elif clean_action == "Encode Categories":
344
- st.markdown("**Select categorical columns to encode.**")
345
- all_encode_cols = ["All Categorical"] + df.select_dtypes(include='object').columns.tolist()
346
- encode_cols = st.multiselect("Columns to Encode", all_encode_cols, default=["All Categorical"], help="Select the categorical columns to encode. Choose 'All Categorical' to apply to all object type columns.")
347
- if "All Categorical" in encode_cols:
348
- encode_cols = df.select_dtypes(include='object').columns.tolist()
349
- encoding_method = st.selectbox("Encoding Method", ["OneHotEncoder"], help="Choose the encoding method.")
350
-
351
- elif clean_action == "Outlier Removal":
352
- st.markdown("**Configure outlier removal settings.**")
353
- all_outlier_cols = ["All Numerical"] + df.select_dtypes(include=np.number).columns.tolist()
354
- outlier_cols = st.multiselect("Columns to Remove Outliers From", all_outlier_cols, default=["All Numerical"], help="Select the columns to remove outliers from. Choose 'All Numerical' to apply to all numerical columns.")
355
- if "All Numerical" in outlier_cols:
356
- outlier_cols = df.select_dtypes(include=np.number).columns.tolist()
357
- outlier_method = st.selectbox("Outlier Removal Method", ["IQR", "Z-score"], help="Choose the outlier removal method.")
358
- if outlier_method == "IQR":
359
- iqr_threshold = st.slider("IQR Threshold", 1.0, 3.0, 1.5, help="Adjust the IQR threshold.")
360
- else:
361
- zscore_threshold = st.slider("Z-score Threshold", 2.0, 4.0, 3.0, help="Adjust the Z-score threshold.")
362
-
363
- elif clean_action == "Remove Column":
364
- st.markdown("**Choose Columns to Remove**")
365
- all_cols = df.columns.tolist()
366
- remove_cols = st.multiselect("Columns to Remove", all_cols, help="Select the columns to remove.")
367
-
368
- elif clean_action == "Auto Clean":
369
- st.markdown("**Automatically Impute Missing Values, Encode Categorical Variables, and Normalize Numeric Variables**", help = "These action happens automically when selected.")
370
- with st.expander("⚙️ Auto Processing Settings"):
371
- st.markdown("**Check to enable setting automatic data cleaning.**", help = "You must manually change configurations in the following setttings below.")
372
- auto_missing = st.checkbox("Auto Handle Missing Values", True, help = "Auto handle all mission values with selected configurations")
373
- auto_normalize = st.checkbox("Auto Normalize Numerical Features", True, help = "Check to automatically normalize all numerical features")
374
- auto_encode = st.checkbox("Auto Encode Categorical Features", True, help="Check to automatically Encode all catigorical columns")
375
-
376
- if auto_missing:
377
- missing_strategy_num = st.selectbox("Numerical Imputation", ["Median", "Mean"], help="Choose the numeric strategy for Auto Clean")
378
- missing_strategy_cat = st.selectbox("Categorical Imputation", ["Most Frequent", "Constant"], help="Choose strategy for auto cleaning on categorical attributes")
379
-
380
- with col2:
381
- if st.button("Apply Transformation"):
382
- with st.spinner("Applying changes..."):
383
- current_df = df.copy() # important
384
- if 'data_history' not in st.session_state:
385
- st.session_state.data_history = [df.copy()]
386
- # Store the current state in history BEFORE processing
387
- st.session_state.data_history.append(current_df)
388
-
389
- # Auto Processing
390
- if auto_missing and clean_action != "Auto Clean":
391
- num_cols = current_df.select_dtypes(include=np.number).columns
392
- cat_cols = current_df.select_dtypes(include='object').columns
393
-
394
- if missing_strategy_num == "Median":
395
- current_df[num_cols] = current_df[num_cols].fillna(current_df[num_cols].median())
396
- else:
397
- current_df[num_cols] = current_df[num_cols].fillna(current_df[num_cols].mean())
398
-
399
- if missing_strategy_cat == "Most Frequent":
400
- current_df[cat_cols] = current_df[cat_cols].fillna(current_df[cat_cols].mode().iloc[0])
401
- else:
402
- current_df[cat_cols] = current_df[cat_cols].fillna("Missing")
403
-
404
- if auto_normalize and clean_action != "Auto Clean":
405
- num_cols = current_df.select_dtypes(include=np.number).columns
406
- scaler = StandardScaler()
407
- current_df[num_cols] = scaler.fit_transform(current_df[num_cols])
408
-
409
- if auto_encode and clean_action != "Auto Clean":
410
- cat_cols = current_df.select_dtypes(include='object').columns
411
- if len(cat_cols) > 0:
412
- encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
413
- encoded_data = encoder.fit_transform(current_df[cat_cols])
414
- encoded_df = pd.DataFrame(encoded_data,columns=encoder.get_feature_names_out(cat_cols))
415
- current_df = pd.concat([current_df.drop(columns=cat_cols), encoded_df], axis=1)
416
-
417
- # Manual Processing
418
- if clean_action == "Handle Missing Values":
419
- if method == "KNN Imputation":
420
- imputer = KNNImputer(n_neighbors=knn_neighbors)
421
- current_df[impute_cols] = imputer.fit_transform(current_df[impute_cols])
422
- elif method == "Median Fill":
423
- current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].median())
424
- elif method == "Mean Fill":
425
- current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].mean())
426
- else:
427
- current_df = current_df.dropna(subset=impute_cols)
428
 
429
- elif clean_action == "Remove Columns":
430
- if remove_cols:
431
- current_df = current_df.drop(columns=remove_cols)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
 
433
- st.session_state.cleaned_data = current_df
434
- st.success("Transformation applied!")
 
435
 
436
- # Data Comparison
437
- st.subheader("Data Version Comparison")
438
- col_orig, col_clean = st.columns(2)
439
-
440
- with col_orig:
441
- st.markdown("**Original Data**")
442
- if st.session_state.raw_data is not None:
443
- st.dataframe(st.session_state.raw_data.head(5), use_container_width=True)
444
- else:
445
- st.write("No original data uploaded yet.")
446
- with col_clean:
447
- st.markdown("**Cleaned Data**")
448
- st.dataframe(df.head(5), use_container_width=True)
 
 
 
449
 
450
  elif app_mode == "Advanced EDA":
451
  st.title("🔍 Advanced Exploratory Analysis")
@@ -789,106 +821,115 @@ elif app_mode == "Predictions":
789
  else:
790
  st.write("Please train a model first in the 'Model Training' section.")
791
 
792
- elif app_mode == "Visualization Lab":
793
- st.title("🔬 Advanced Visualizations")
794
-
795
- if st.session_state.cleaned_data is not None:
796
- df = st.session_state.cleaned_data.copy()
797
-
798
- # Visualization Type Selection
799
- visualization_type = st.selectbox("Select Visualization Type", [
800
- "Pair Plot", "Parallel Coordinates Plot", "Andrews Curves", "Pie Chart",
801
- "Area Chart", "Density Contour", "Sunburst Chart", "Funnel Chart"
802
- ])
803
 
804
- if visualization_type == "Pair Plot":
805
- cols_for_pairplot = st.multiselect("Select Columns for Pair Plot", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
806
- if cols_for_pairplot:
807
- fig = px.scatter_matrix(df, dimensions=cols_for_pairplot)
808
- st.plotly_chart(fig, use_container_width=True)
809
 
810
- elif visualization_type == "Parallel Coordinates Plot":
811
- cols_for_parallel = st.multiselect("Select Columns for Parallel Coordinates", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
812
- if cols_for_parallel:
813
- fig = px.parallel_coordinates(df[cols_for_parallel], color=df[cols_for_parallel[0]] if cols_for_parallel else None)
814
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
815
 
816
- elif visualization_type == "Andrews Curves":
817
- cols_for_andrews = st.multiselect("Select Columns for Andrews Curves", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
818
- if cols_for_andrews:
819
- fig = px.andrews_curves(df[cols_for_andrews + [df.columns[0]]], class_column=df.columns[0])
820
- st.plotly_chart(fig, use_container_width=True)
821
 
822
- elif visualization_type == "Pie Chart":
823
- col_for_pie = st.selectbox("Select Column for Pie Chart", df.columns)
824
- fig = px.pie(df, names=col_for_pie)
 
 
 
 
 
 
 
 
825
  st.plotly_chart(fig, use_container_width=True)
826
 
827
- elif visualization_type == "Area Chart":
828
- cols_for_area = st.multiselect("Select Columns for Area Chart", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
829
- if cols_for_area:
830
- fig = px.area(df[cols_for_area])
831
- st.plotly_chart(fig, use_container_width=True)
832
-
833
- elif visualization_type == "Density Contour":
834
- x_col = st.selectbox("Select X Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
835
- y_col = st.selectbox("Select Y Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
836
- fig = px.density_contour(df, x=x_col, y=y_col)
837
  st.plotly_chart(fig, use_container_width=True)
838
 
839
- elif visualization_type == "Sunburst Chart":
840
- path_cols = st.multiselect("Select Path Columns for Sunburst Chart", df.columns)
841
- if path_cols:
842
- fig = px.sunburst(df, path=path_cols)
843
- st.plotly_chart(fig, use_container_width=True)
 
844
 
845
- elif visualization_type == "Funnel Chart":
846
- x_col_funnel = st.selectbox("Select X Column for Funnel Chart", df.columns)
847
- y_col_funnel = st.selectbox("Select Y Column for Funnel Chart", df.columns)
848
- fig = px.funnel(df, x=x_col_funnel, y=y_col_funnel)
 
 
 
 
 
 
 
849
  st.plotly_chart(fig, use_container_width=True)
850
 
851
- elif app_mode == "Visualization Lab" and st.session_state.cleaned_data is not None:
852
- st.subheader("Clustering Analysis")
853
- df = st.session_state.cleaned_data.copy()
 
 
 
 
 
 
 
 
 
 
854
 
855
- # Select columns for clustering
856
- numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
 
 
 
 
857
 
858
- if not numerical_cols:
859
- st.warning("No numerical columns found for clustering.")
860
- else:
861
- cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols)
862
 
863
- if cluster_cols:
864
- try:
865
- # Scale the data
866
- scaler = StandardScaler()
867
- scaled_data = scaler.fit_transform(df[cluster_cols])
868
-
869
- # Number of clusters
870
- n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.")
871
-
872
- # Apply K-Means clustering
873
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
874
- clusters = kmeans.fit_predict(scaled_data)
875
-
876
- # Add cluster labels to the DataFrame
877
- df['Cluster'] = clusters
878
-
879
- # Visualize clusters
880
- if len(cluster_cols) == 2:
881
- fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
882
- st.plotly_chart(fig, use_container_width=True)
883
- elif len(cluster_cols) == 3:
884
- fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
885
- st.plotly_chart(fig, use_container_width=True)
886
- else:
887
- st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
888
 
889
- st.success("Clustering applied successfully!")
890
- except Exception as e:
891
- st.error(f"An error occurred during clustering: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
892
 
893
  elif app_mode == "Neural Network Studio":
894
  st.title("🧠 Neural Network Studio")
 
22
  import base64
23
  import time
24
  from sklearn.cluster import KMeans
25
+ import keras
26
+
27
 
28
  # Configurations
29
  st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
 
245
  show_loader("Generating EDA Report")
246
  pr = generate_profile(df)
247
  st_profile_report(pr)
248
+
249
  elif app_mode == "Smart Cleaning":
250
+ st.subheader("Data Cleaning and Preprocessing")
251
+
252
+ if st.checkbox("Clean Data using Neural Network (Imputation)"):
253
+ numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
254
+ for col in numerical_cols:
255
+ if df[col].isnull().sum() > 0:
256
+ st.write(f"Imputing missing values in {col} using a Neural Network...")
257
+ train_df = df.dropna(subset=[col]).copy()
258
+ test_df = df[df[col].isnull()].drop(col, axis=1).copy()
259
+ train_X = train_df.drop(col, axis=1).select_dtypes(include=np.number)
260
+ train_y = train_df[col]
261
+
262
+ if not train_X.empty:
263
+ # Enhanced Model Selection (Simple Additions)
264
+ model_type = st.selectbox(f"Model for {col}", ["Simple Feedforward", "Slightly Deeper"])
265
+ if model_type == "Simple Feedforward":
266
+ model = keras.Sequential([
267
+ keras.layers.Dense(64, activation='relu', input_shape=(train_X.shape[1],)),
268
+ keras.layers.Dense(32, activation='relu'),
269
+ keras.layers.Dense(1)
270
+ ])
271
+ else:
272
+ model = keras.Sequential([
273
+ keras.layers.Dense(128, activation='relu', input_shape=(train_X.shape[1],)),
274
+ keras.layers.Dense(64, activation='relu'),
275
+ keras.layers.Dense(32, activation='relu'),
276
+ keras.layers.Dense(1)
277
+ ])
278
+
279
+ model.compile(optimizer='adam', loss='mse')
280
+ model.fit(train_X, train_y, epochs=50, verbose=0)
281
+ imputed_values = model.predict(test_df.select_dtypes(include=np.number))
282
+ df.loc[df[col].isnull(), col] = imputed_values.flatten()
283
+ st.success(f"Imputation in {col} completed.")
284
+ else:
285
+ st.warning(f"Skipping imputation for {col} due to insufficient data.")
286
+
287
+ if st.checkbox("Standardize Numerical Columns"):
288
+ numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
289
+ scaler = StandardScaler()
290
+ df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
291
+ st.success("Numerical columns standardized.")
292
+
293
+ if st.checkbox("Encode Categorical Columns"):
294
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
295
+ for col in categorical_cols:
296
+ le = LabelEncoder()
297
+ df[col] = le.fit_transform(df[col])
298
+ st.success("Categorical columns encoded.")
299
+
300
+ st.session_state.cleaned_data = df #Update cleaned data after cleaning operations.
301
+
302
+ # Cleaning Toolkit
303
+ col1, col2 = st.columns([1, 3])
304
+ with col1:
305
+ st.subheader("Cleaning Actions")
306
+
307
+ # Add Reset and Undo buttons
308
+ col1a, col1b = st.columns(2)
309
+ with col1a:
310
+ if st.button("Reset to Original", help="Revert all changes to the uploaded data."):
311
+ st.session_state.cleaned_data = st.session_state.raw_data.copy()
312
+ st.session_state.data_history = [st.session_state.raw_data.copy()]
313
+ st.experimental_rerun()
314
+ with col1b:
315
+ if len(st.session_state.data_history) > 1:
316
+ if st.button("Undo Last Action", help="Revert to the previous state."):
317
+ st.session_state.data_history.pop()
318
+ st.session_state.cleaned_data = st.session_state.data_history[-1].copy()
319
  st.experimental_rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
+ clean_action = st.selectbox("Choose Operation", [
322
+ "Handle Missing Values",
323
+ "Remove Duplicates",
324
+ "Remove Column",
325
+ "Normalize Data",
326
+ "Encode Categories",
327
+ "Outlier Removal",
328
+ "Auto Clean",
329
+ "Neural Network Prep"
330
+ ], help="Select the data cleaning operation to perform.")
331
+
332
+ # Initialize Auto Clean Variables
333
+ auto_missing = False
334
+ auto_normalize = False
335
+ auto_encode = False
336
+ missing_strategy_num = "Median"
337
+ missing_strategy_cat = "Most Frequent"
338
+
339
+ if clean_action == "Handle Missing Values":
340
+ st.markdown("**Configure how missing values will be handled.**", unsafe_allow_html=True)
341
+ all_impute_cols = ["All Columns"] + df.columns.tolist()
342
+ impute_cols = st.multiselect("Columns to Impute", all_impute_cols, default=["All Columns"], help="Select the columns with missing values to impute. Choose 'All Columns' to apply to all columns with missing values.")
343
+ if "All Columns" in impute_cols:
344
+ impute_cols = df.columns.tolist()
345
+
346
+ method = st.selectbox("Imputation Method", [
347
+ "KNN Imputation",
348
+ "Median Fill",
349
+ "Mean Fill",
350
+ "Drop Missing"
351
+ ], help="Choose the method to use for imputing missing values.")
352
+ if method == "KNN Imputation":
353
+ knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5, help="Number of neighbors for KNN Imputation.") #Parameter
354
+
355
+ elif clean_action == "Neural Network Prep":
356
+ st.markdown("**Neural Network Specific Preparation**", unsafe_allow_html=True)
357
+ validModels=["RNN", "CNN"]
358
+ model_Choice_text = st.radio("What's a use case for Models?", options= validModels)
359
+ st.info('Select a machine learning task below!')
360
+ validColumnNumerical_cols = df.select_dtypes(include=['int','float']).columns.tolist()
361
+ numcol_cols = st.multiselect("Text use Colimns: or sequence for model usage :D, to generate the code - to understand how each one plays out D", options =validColumnNumerical_cols )
362
+ st.code('Code example is generated.')
363
+ """ Make each configuration do an function or callback" just one press and more to learn"""
364
+ seq_length = st.number_input("Sequence Length (for RNN)", 10, 100, 30, help =" Length to do that. make them more power ")
365
+ method = st.selectbox("Imputation Method", ["KNN Imputation", "Median Fill", "Mean Fill", "Drop Missing"])
366
+
367
+ elif clean_action == "Normalize Data":
368
+ st.markdown("**Choose a scaling method and columns to normalize.**")
369
+ scaler_type = st.selectbox("Scaler Type", ["RobustScaler", "StandardScaler"], help="Select the type of scaler to use.")
370
+ all_normalize_cols = ["All Numerical"] + df.select_dtypes(include=np.number).columns.tolist()
371
+ normalize_cols = st.multiselect("Columns to Normalize", all_normalize_cols, default=["All Numerical"], help="Select the numerical columns to normalize. Choose 'All Numerical' to apply to all numerical columns.")
372
+ if "All Numerical" in normalize_cols:
373
+ normalize_cols = df.select_dtypes(include=np.number).columns.tolist()
374
+
375
+ elif clean_action == "Encode Categories":
376
+ st.markdown("**Select categorical columns to encode.**")
377
+ all_encode_cols = ["All Categorical"] + df.select_dtypes(include='object').columns.tolist()
378
+ encode_cols = st.multiselect("Columns to Encode", all_encode_cols, default=["All Categorical"], help="Select the categorical columns to encode. Choose 'All Categorical' to apply to all object type columns.")
379
+ if "All Categorical" in encode_cols:
380
+ encode_cols = df.select_dtypes(include='object').columns.tolist()
381
+ encoding_method = st.selectbox("Encoding Method", ["OneHotEncoder"], help="Choose the encoding method.")
382
+
383
+ elif clean_action == "Outlier Removal":
384
+ st.markdown("**Configure outlier removal settings.**")
385
+ all_outlier_cols = ["All Numerical"] + df.select_dtypes(include=np.number).columns.tolist()
386
+ outlier_cols = st.multiselect("Columns to Remove Outliers From", all_outlier_cols, default=["All Numerical"], help="Select the columns to remove outliers from. Choose 'All Numerical' to apply to all numerical columns.")
387
+ if "All Numerical" in outlier_cols:
388
+ outlier_cols = df.select_dtypes(include=np.number).columns.tolist()
389
+ outlier_method = st.selectbox("Outlier Removal Method", ["IQR", "Z-score"], help="Choose the outlier removal method.")
390
+ if outlier_method == "IQR":
391
+ iqr_threshold = st.slider("IQR Threshold", 1.0, 3.0, 1.5, help="Adjust the IQR threshold.")
392
+ else:
393
+ zscore_threshold = st.slider("Z-score Threshold", 2.0, 4.0, 3.0, help="Adjust the Z-score threshold.")
394
+
395
+ elif clean_action == "Remove Column":
396
+ st.markdown("**Choose Columns to Remove**")
397
+ all_cols = df.columns.tolist()
398
+ remove_cols = st.multiselect("Columns to Remove", all_cols, help="Select the columns to remove.")
399
+
400
+ elif clean_action == "Auto Clean":
401
+ st.markdown("**Automatically Impute Missing Values, Encode Categorical Variables, and Normalize Numeric Variables**", help = "These action happens automically when selected.")
402
+ with st.expander("⚙️ Auto Processing Settings"):
403
+ st.markdown("**Check to enable setting automatic data cleaning.**", help = "You must manually change configurations in the following setttings below.")
404
+ auto_missing = st.checkbox("Auto Handle Missing Values", True, help = "Auto handle all mission values with selected configurations")
405
+ auto_normalize = st.checkbox("Auto Normalize Numerical Features", True, help = "Check to automatically normalize all numerical features")
406
+ auto_encode = st.checkbox("Auto Encode Categorical Features", True, help="Check to automatically Encode all catigorical columns")
407
+
408
+ if auto_missing:
409
+ missing_strategy_num = st.selectbox("Numerical Imputation", ["Median", "Mean"], help="Choose the numeric strategy for Auto Clean")
410
+ missing_strategy_cat = st.selectbox("Categorical Imputation", ["Most Frequent", "Constant"], help="Choose strategy for auto cleaning on categorical attributes")
411
+
412
+ with col2:
413
+ if st.button("Apply Transformation"):
414
+ with st.spinner("Applying changes..."):
415
+ current_df = df.copy() # important
416
+ if 'data_history' not in st.session_state:
417
+ st.session_state.data_history = [df.copy()]
418
+ # Store the current state in history BEFORE processing
419
+ st.session_state.data_history.append(current_df)
420
+
421
+ # Auto Processing
422
+ if auto_missing and clean_action != "Auto Clean":
423
+ num_cols = current_df.select_dtypes(include=np.number).columns
424
+ cat_cols = current_df.select_dtypes(include='object').columns
425
+
426
+ if missing_strategy_num == "Median":
427
+ current_df[num_cols] = current_df[num_cols].fillna(current_df[num_cols].median())
428
+ else:
429
+ current_df[num_cols] = current_df[num_cols].fillna(current_df[num_cols].mean())
 
 
 
 
 
 
 
430
 
431
+ if missing_strategy_cat == "Most Frequent":
432
+ current_df[cat_cols] = current_df[cat_cols].fillna(current_df[cat_cols].mode().iloc[0])
433
+ else:
434
+ current_df[cat_cols] = current_df[cat_cols].fillna("Missing")
435
+
436
+ if auto_normalize and clean_action != "Auto Clean":
437
+ num_cols = current_df.select_dtypes(include=np.number).columns
438
+ scaler = StandardScaler()
439
+ current_df[num_cols] = scaler.fit_transform(current_df[num_cols])
440
+
441
+ if auto_encode and clean_action != "Auto Clean":
442
+ cat_cols = current_df.select_dtypes(include='object').columns
443
+ if len(cat_cols) > 0:
444
+ encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
445
+ encoded_data = encoder.fit_transform(current_df[cat_cols])
446
+ encoded_df = pd.DataFrame(encoded_data,columns=encoder.get_feature_names_out(cat_cols))
447
+ current_df = pd.concat([current_df.drop(columns=cat_cols), encoded_df], axis=1)
448
+
449
+ # Manual Processing
450
+ if clean_action == "Handle Missing Values":
451
+ if method == "KNN Imputation":
452
+ imputer = KNNImputer(n_neighbors=knn_neighbors)
453
+ current_df[impute_cols] = imputer.fit_transform(current_df[impute_cols])
454
+ elif method == "Median Fill":
455
+ current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].median())
456
+ elif method == "Mean Fill":
457
+ current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].mean())
458
+ else:
459
+ current_df = current_df.dropna(subset=impute_cols)
460
 
461
+ elif clean_action == "Remove Column":
462
+ if remove_cols:
463
+ current_df = current_df.drop(columns=remove_cols)
464
 
465
+ st.session_state.cleaned_data = current_df
466
+ st.success("Transformation applied!")
467
+
468
+ # Data Comparison
469
+ st.subheader("Data Version Comparison")
470
+ col_orig, col_clean = st.columns(2)
471
+
472
+ with col_orig:
473
+ st.markdown("**Original Data**")
474
+ if st.session_state.raw_data is not None:
475
+ st.dataframe(st.session_state.raw_data.head(5), use_container_width=True)
476
+ else:
477
+ st.write("No original data uploaded yet.")
478
+ with col_clean:
479
+ st.markdown("**Cleaned Data**")
480
+ st.dataframe(df.head(5), use_container_width=True)
481
 
482
  elif app_mode == "Advanced EDA":
483
  st.title("🔍 Advanced Exploratory Analysis")
 
821
  else:
822
  st.write("Please train a model first in the 'Model Training' section.")
823
 
824
+ st.title("🔬 Advanced Data Visualization and Clustering Lab")
 
 
 
 
 
 
 
 
 
 
825
 
826
+ # Initialize session state for cleaned data
827
+ if 'cleaned_data' not in st.session_state:
828
+ st.session_state.cleaned_data = None
 
 
829
 
830
+ # Sample data upload (replace with your data loading logic)
831
+ uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
832
+ if uploaded_file is not None:
833
+ try:
834
+ df = pd.read_csv(uploaded_file)
835
+ st.session_state.cleaned_data = df
836
+ st.success("Data loaded successfully!")
837
+ except Exception as e:
838
+ st.error(f"Error loading data: {e}")
839
 
840
+ if st.session_state.cleaned_data is not None:
841
+ df = st.session_state.cleaned_data.copy()
 
 
 
842
 
843
+ # Visualization Type Selection
844
+ visualization_type = st.selectbox("Select Visualization Type", [
845
+ "Pair Plot", "Parallel Coordinates Plot", "Andrews Curves", "Pie Chart",
846
+ "Area Chart", "Density Contour", "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
847
+ ])
848
+
849
+ if visualization_type == "Pair Plot":
850
+ st.subheader("Pair Plot")
851
+ cols_for_pairplot = st.multiselect("Select Columns for Pair Plot", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
852
+ if cols_for_pairplot:
853
+ fig = px.scatter_matrix(df, dimensions=cols_for_pairplot)
854
  st.plotly_chart(fig, use_container_width=True)
855
 
856
+ elif visualization_type == "Parallel Coordinates Plot":
857
+ st.subheader("Parallel Coordinates Plot")
858
+ cols_for_parallel = st.multiselect("Select Columns for Parallel Coordinates", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
859
+ if cols_for_parallel:
860
+ fig = px.parallel_coordinates(df[cols_for_parallel], color=df[cols_for_parallel[0]] if cols_for_parallel else None)
 
 
 
 
 
861
  st.plotly_chart(fig, use_container_width=True)
862
 
863
+ elif visualization_type == "Andrews Curves":
864
+ st.subheader("Andrews Curves")
865
+ cols_for_andrews = st.multiselect("Select Columns for Andrews Curves", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
866
+ if cols_for_andrews:
867
+ fig = px.andrews_curves(df[cols_for_andrews + [df.columns[0]]], class_column=df.columns[0])
868
+ st.plotly_chart(fig, use_container_width=True)
869
 
870
+ elif visualization_type == "Pie Chart":
871
+ st.subheader("Pie Chart")
872
+ col_for_pie = st.selectbox("Select Column for Pie Chart", df.columns)
873
+ fig = px.pie(df, names=col_for_pie)
874
+ st.plotly_chart(fig, use_container_width=True)
875
+
876
+ elif visualization_type == "Area Chart":
877
+ st.subheader("Area Chart")
878
+ cols_for_area = st.multiselect("Select Columns for Area Chart", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
879
+ if cols_for_area:
880
+ fig = px.area(df[cols_for_area])
881
  st.plotly_chart(fig, use_container_width=True)
882
 
883
+ elif visualization_type == "Density Contour":
884
+ st.subheader("Density Contour")
885
+ x_col = st.selectbox("Select X Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
886
+ y_col = st.selectbox("Select Y Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
887
+ fig = px.density_contour(df, x=x_col, y=y_col)
888
+ st.plotly_chart(fig, use_container_width=True)
889
+
890
+ elif visualization_type == "Sunburst Chart":
891
+ st.subheader("Sunburst Chart")
892
+ path_cols = st.multiselect("Select Path Columns for Sunburst Chart", df.columns)
893
+ if path_cols:
894
+ fig = px.sunburst(df, path=path_cols)
895
+ st.plotly_chart(fig, use_container_width=True)
896
 
897
+ elif visualization_type == "Funnel Chart":
898
+ st.subheader("Funnel Chart")
899
+ x_col = st.selectbox("Select X Column for Funnel Chart (Values)", df.select_dtypes(include=np.number).columns.tolist())
900
+ y_col = st.selectbox("Select Y Column for Funnel Chart (Categories)", df.columns)
901
+ fig = px.funnel(df, x=x_col, y=y_col)
902
+ st.plotly_chart(fig, use_container_width=True)
903
 
904
+ elif visualization_type == "Clustering Analysis":
905
+ st.subheader("Clustering Analysis")
906
+ numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
 
907
 
908
+ if not numerical_cols:
909
+ st.warning("No numerical columns found for clustering.")
910
+ else:
911
+ cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
912
 
913
+ if cluster_cols:
914
+ try:
915
+ scaler = StandardScaler()
916
+ scaled_data = scaler.fit_transform(df[cluster_cols])
917
+ n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.")
918
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42)
919
+ clusters = kmeans.fit_predict(scaled_data)
920
+ df['Cluster'] = clusters
921
+
922
+ if len(cluster_cols) == 2:
923
+ fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
924
+ st.plotly_chart(fig, use_container_width=True)
925
+ elif len(cluster_cols) == 3:
926
+ fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
927
+ st.plotly_chart(fig, use_container_width=True)
928
+ else:
929
+ st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
930
+ st.success("Clustering applied successfully!")
931
+ except Exception as e:
932
+ st.error(f"An error occurred during clustering: {e}")
933
 
934
  elif app_mode == "Neural Network Studio":
935
  st.title("🧠 Neural Network Studio")