Update app.py
app.py CHANGED
@@ -22,6 +22,8 @@ from io import BytesIO
 import base64
 import time
 from sklearn.cluster import KMeans
+import keras
+
 
 # Configurations
 st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
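The bare `import keras` makes the Space depend on a Keras installation. If the environment does not already provide one, a line along these lines in requirements.txt would be needed (an assumption: the dependency file is not part of this commit, and with Keras 3 a backend must also be present):

tensorflow>=2.16  # hypothetical pin; ships keras and serves as its backend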
@@ -243,209 +245,239 @@ if app_mode == "Data Upload":
     show_loader("Generating EDA Report")
     pr = generate_profile(df)
     st_profile_report(pr)
+
 elif app_mode == "Smart Cleaning":
-    st.
-    if st.
-    st.experimental_rerun()
-    with col1b:
-        if len(st.session_state.data_history) > 1:
-            if st.button("Undo Last Action", help="Revert to the previous state."):
-                st.session_state.data_history.pop()
-                st.session_state.cleaned_data = st.session_state.data_history[-1].copy()
-                st.experimental_rerun()
-
-    clean_action = st.selectbox("Choose Operation", [
-        "Handle Missing Values",
-        "Remove Duplicates",
-        "Remove Column",
-        "Normalize Data",
-        "Encode Categories",
-        "Outlier Removal",
-        "Auto Clean",
-        "Neural Network Prep"
-    ], help="Select the data cleaning operation to perform.")
-
-    # Initialize Auto Clean Variables
-    auto_missing = False
-    auto_normalize = False
-    auto_encode = False
-    missing_strategy_num = "Median"
-    missing_strategy_cat = "Most Frequent"
-
-    if clean_action == "Handle Missing Values":  # Corrected indentation
-        st.markdown("**Configure how missing values will be handled.**", unsafe_allow_html=True)
-        all_impute_cols = ["All Columns"] + df.columns.tolist()
-        impute_cols = st.multiselect("Columns to Impute", all_impute_cols, default=["All Columns"], help="Select the columns with missing values to impute. Choose 'All Columns' to apply to all columns with missing values.")
-        if "All Columns" in impute_cols:
-            impute_cols = df.columns.tolist()
-
-        method = st.selectbox("Imputation Method", [
-            "KNN Imputation",
-            "Median Fill",
-            "Mean Fill",
-            "Drop Missing"
-        ], help="Choose the method to use for imputing missing values.")
-    elif clean_action == "Neural Network Prep":
-        st.markdown("**Neural Network Specific Preparation**", unsafe_allow_html=True)
-
-        # Make dynamic to check if the models can allow it
-        validModels = ["RNN", "CNN"]
-    st.
-            current_df[impute_cols] = imputer.fit_transform(current_df[impute_cols])
-        elif method == "Median Fill":
-            current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].median())
-        elif method == "Mean Fill":
-            current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].mean())
-        else:
-            current_df = current_df.dropna(subset=impute_cols)
-    st.
+    st.subheader("Data Cleaning and Preprocessing")
+
+    if st.checkbox("Clean Data using Neural Network (Imputation)"):
+        numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
+        for col in numerical_cols:
+            if df[col].isnull().sum() > 0:
+                st.write(f"Imputing missing values in {col} using a Neural Network...")
+                train_df = df.dropna(subset=[col]).copy()
+                test_df = df[df[col].isnull()].drop(col, axis=1).copy()
+                train_X = train_df.drop(col, axis=1).select_dtypes(include=np.number)
+                train_y = train_df[col]
+
+                if not train_X.empty:
+                    # Enhanced Model Selection (Simple Additions)
+                    model_type = st.selectbox(f"Model for {col}", ["Simple Feedforward", "Slightly Deeper"])
+                    if model_type == "Simple Feedforward":
+                        model = keras.Sequential([
+                            keras.layers.Dense(64, activation='relu', input_shape=(train_X.shape[1],)),
+                            keras.layers.Dense(32, activation='relu'),
+                            keras.layers.Dense(1)
+                        ])
+                    else:
+                        model = keras.Sequential([
+                            keras.layers.Dense(128, activation='relu', input_shape=(train_X.shape[1],)),
+                            keras.layers.Dense(64, activation='relu'),
+                            keras.layers.Dense(32, activation='relu'),
+                            keras.layers.Dense(1)
+                        ])
+
+                    model.compile(optimizer='adam', loss='mse')
+                    model.fit(train_X, train_y, epochs=50, verbose=0)
+                    imputed_values = model.predict(test_df.select_dtypes(include=np.number))
+                    df.loc[df[col].isnull(), col] = imputed_values.flatten()
+                    st.success(f"Imputation in {col} completed.")
+                else:
+                    st.warning(f"Skipping imputation for {col} due to insufficient data.")
+
+    if st.checkbox("Standardize Numerical Columns"):
+        numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
+        scaler = StandardScaler()
+        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
+        st.success("Numerical columns standardized.")
+
+    if st.checkbox("Encode Categorical Columns"):
+        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+        for col in categorical_cols:
+            le = LabelEncoder()
+            df[col] = le.fit_transform(df[col])
+        st.success("Categorical columns encoded.")
+
+    st.session_state.cleaned_data = df  # Update cleaned data after cleaning operations.
+
+    # Cleaning Toolkit
+    col1, col2 = st.columns([1, 3])
+    with col1:
+        st.subheader("Cleaning Actions")
+
+        # Add Reset and Undo buttons
+        col1a, col1b = st.columns(2)
+        with col1a:
+            if st.button("Reset to Original", help="Revert all changes to the uploaded data."):
+                st.session_state.cleaned_data = st.session_state.raw_data.copy()
+                st.session_state.data_history = [st.session_state.raw_data.copy()]
+                st.experimental_rerun()
+        with col1b:
+            if len(st.session_state.data_history) > 1:
+                if st.button("Undo Last Action", help="Revert to the previous state."):
+                    st.session_state.data_history.pop()
+                    st.session_state.cleaned_data = st.session_state.data_history[-1].copy()
+                    st.experimental_rerun()
+
+        clean_action = st.selectbox("Choose Operation", [
+            "Handle Missing Values",
+            "Remove Duplicates",
+            "Remove Column",
+            "Normalize Data",
+            "Encode Categories",
+            "Outlier Removal",
+            "Auto Clean",
+            "Neural Network Prep"
+        ], help="Select the data cleaning operation to perform.")
+
+        # Initialize Auto Clean Variables
+        auto_missing = False
+        auto_normalize = False
+        auto_encode = False
+        missing_strategy_num = "Median"
+        missing_strategy_cat = "Most Frequent"
+
+        if clean_action == "Handle Missing Values":
+            st.markdown("**Configure how missing values will be handled.**", unsafe_allow_html=True)
+            all_impute_cols = ["All Columns"] + df.columns.tolist()
+            impute_cols = st.multiselect("Columns to Impute", all_impute_cols, default=["All Columns"], help="Select the columns with missing values to impute. Choose 'All Columns' to apply to all columns with missing values.")
+            if "All Columns" in impute_cols:
+                impute_cols = df.columns.tolist()
+
+            method = st.selectbox("Imputation Method", [
+                "KNN Imputation",
+                "Median Fill",
+                "Mean Fill",
+                "Drop Missing"
+            ], help="Choose the method to use for imputing missing values.")
+            if method == "KNN Imputation":
+                knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5, help="Number of neighbors for KNN Imputation.")  # Parameter
+
+        elif clean_action == "Neural Network Prep":
+            st.markdown("**Neural Network Specific Preparation**", unsafe_allow_html=True)
+            validModels = ["RNN", "CNN"]
+            model_Choice_text = st.radio("Which model is this preparation for?", options=validModels)
+            st.info('Select a machine learning task below!')
+            validColumnNumerical_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()
+            numcol_cols = st.multiselect("Numerical columns to use as the model's input sequence", options=validColumnNumerical_cols)
+            st.code('Code example is generated.')
+            # TODO: wire each configuration to a function or callback.
+            seq_length = st.number_input("Sequence Length (for RNN)", 10, 100, 30, help="Length of each input sequence.")
+            method = st.selectbox("Imputation Method", ["KNN Imputation", "Median Fill", "Mean Fill", "Drop Missing"])
+
+        elif clean_action == "Normalize Data":
+            st.markdown("**Choose a scaling method and columns to normalize.**")
+            scaler_type = st.selectbox("Scaler Type", ["RobustScaler", "StandardScaler"], help="Select the type of scaler to use.")
+            all_normalize_cols = ["All Numerical"] + df.select_dtypes(include=np.number).columns.tolist()
+            normalize_cols = st.multiselect("Columns to Normalize", all_normalize_cols, default=["All Numerical"], help="Select the numerical columns to normalize. Choose 'All Numerical' to apply to all numerical columns.")
+            if "All Numerical" in normalize_cols:
+                normalize_cols = df.select_dtypes(include=np.number).columns.tolist()
+
+        elif clean_action == "Encode Categories":
+            st.markdown("**Select categorical columns to encode.**")
+            all_encode_cols = ["All Categorical"] + df.select_dtypes(include='object').columns.tolist()
+            encode_cols = st.multiselect("Columns to Encode", all_encode_cols, default=["All Categorical"], help="Select the categorical columns to encode. Choose 'All Categorical' to apply to all object type columns.")
+            if "All Categorical" in encode_cols:
+                encode_cols = df.select_dtypes(include='object').columns.tolist()
+            encoding_method = st.selectbox("Encoding Method", ["OneHotEncoder"], help="Choose the encoding method.")
+
+        elif clean_action == "Outlier Removal":
+            st.markdown("**Configure outlier removal settings.**")
+            all_outlier_cols = ["All Numerical"] + df.select_dtypes(include=np.number).columns.tolist()
+            outlier_cols = st.multiselect("Columns to Remove Outliers From", all_outlier_cols, default=["All Numerical"], help="Select the columns to remove outliers from. Choose 'All Numerical' to apply to all numerical columns.")
+            if "All Numerical" in outlier_cols:
+                outlier_cols = df.select_dtypes(include=np.number).columns.tolist()
+            outlier_method = st.selectbox("Outlier Removal Method", ["IQR", "Z-score"], help="Choose the outlier removal method.")
+            if outlier_method == "IQR":
+                iqr_threshold = st.slider("IQR Threshold", 1.0, 3.0, 1.5, help="Adjust the IQR threshold.")
+            else:
+                zscore_threshold = st.slider("Z-score Threshold", 2.0, 4.0, 3.0, help="Adjust the Z-score threshold.")
+
+        elif clean_action == "Remove Column":
+            st.markdown("**Choose Columns to Remove**")
+            all_cols = df.columns.tolist()
+            remove_cols = st.multiselect("Columns to Remove", all_cols, help="Select the columns to remove.")
+
+        elif clean_action == "Auto Clean":
+            st.markdown("**Automatically Impute Missing Values, Encode Categorical Variables, and Normalize Numeric Variables**", help="These actions happen automatically when selected.")
+            with st.expander("⚙️ Auto Processing Settings"):
+                st.markdown("**Check to enable automatic data cleaning.**", help="You can adjust the configurations in the settings below.")
+                auto_missing = st.checkbox("Auto Handle Missing Values", True, help="Automatically handle all missing values with the selected configuration.")
+                auto_normalize = st.checkbox("Auto Normalize Numerical Features", True, help="Check to automatically normalize all numerical features.")
+                auto_encode = st.checkbox("Auto Encode Categorical Features", True, help="Check to automatically encode all categorical columns.")
+
+                if auto_missing:
+                    missing_strategy_num = st.selectbox("Numerical Imputation", ["Median", "Mean"], help="Choose the numeric strategy for Auto Clean.")
+                    missing_strategy_cat = st.selectbox("Categorical Imputation", ["Most Frequent", "Constant"], help="Choose the strategy for auto cleaning of categorical attributes.")
+
+    with col2:
+        if st.button("Apply Transformation"):
+            with st.spinner("Applying changes..."):
+                current_df = df.copy()  # important
+                if 'data_history' not in st.session_state:
+                    st.session_state.data_history = [df.copy()]
+                # Store the current state in history BEFORE processing
+                st.session_state.data_history.append(current_df)
+
+                # Auto Processing
+                if auto_missing and clean_action == "Auto Clean":
+                    num_cols = current_df.select_dtypes(include=np.number).columns
+                    cat_cols = current_df.select_dtypes(include='object').columns
+
+                    if missing_strategy_num == "Median":
+                        current_df[num_cols] = current_df[num_cols].fillna(current_df[num_cols].median())
+                    else:
+                        current_df[num_cols] = current_df[num_cols].fillna(current_df[num_cols].mean())
+
+                    if missing_strategy_cat == "Most Frequent":
+                        current_df[cat_cols] = current_df[cat_cols].fillna(current_df[cat_cols].mode().iloc[0])
+                    else:
+                        current_df[cat_cols] = current_df[cat_cols].fillna("Missing")
+
+                if auto_normalize and clean_action == "Auto Clean":
+                    num_cols = current_df.select_dtypes(include=np.number).columns
+                    scaler = StandardScaler()
+                    current_df[num_cols] = scaler.fit_transform(current_df[num_cols])
+
+                if auto_encode and clean_action == "Auto Clean":
+                    cat_cols = current_df.select_dtypes(include='object').columns
+                    if len(cat_cols) > 0:
+                        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
+                        encoded_data = encoder.fit_transform(current_df[cat_cols])
+                        encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(cat_cols))
+                        current_df = pd.concat([current_df.drop(columns=cat_cols), encoded_df], axis=1)
+
+                # Manual Processing
+                if clean_action == "Handle Missing Values":
+                    if method == "KNN Imputation":
+                        imputer = KNNImputer(n_neighbors=knn_neighbors)
+                        current_df[impute_cols] = imputer.fit_transform(current_df[impute_cols])
+                    elif method == "Median Fill":
+                        current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].median())
+                    elif method == "Mean Fill":
+                        current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].mean())
+                    else:
+                        current_df = current_df.dropna(subset=impute_cols)
+
+                elif clean_action == "Remove Column":
+                    if remove_cols:
+                        current_df = current_df.drop(columns=remove_cols)
+
+                st.session_state.cleaned_data = current_df
+                st.success("Transformation applied!")
+
+    # Data Comparison
+    st.subheader("Data Version Comparison")
+    col_orig, col_clean = st.columns(2)
+
+    with col_orig:
+        st.markdown("**Original Data**")
+        if st.session_state.raw_data is not None:
+            st.dataframe(st.session_state.raw_data.head(5), use_container_width=True)
+        else:
+            st.write("No original data uploaded yet.")
+    with col_clean:
+        st.markdown("**Cleaned Data**")
+        st.dataframe(df.head(5), use_container_width=True)
 
 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Analysis")
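For reviewers who want to sanity-check the new per-column neural-network imputation outside Streamlit, here is a minimal standalone sketch of the same pattern. The toy DataFrame and column names are hypothetical; it assumes keras (with a TensorFlow backend), pandas, and numpy are installed:

import numpy as np
import pandas as pd
import keras

# Hypothetical toy data: 'target' depends on 'a' and 'b'; 10% of its values are missing.
rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=200), "b": rng.normal(size=200)})
df["target"] = 2 * df["a"] - df["b"] + rng.normal(scale=0.1, size=200)
df.loc[df.sample(frac=0.1, random_state=0).index, "target"] = np.nan

col = "target"
train_df = df.dropna(subset=[col])          # rows where the target is observed
train_X = train_df.drop(columns=[col])
train_y = train_df[col]
test_X = df.loc[df[col].isna()].drop(columns=[col])  # rows to impute

# Same "Simple Feedforward" shape as the diff: 64 -> 32 -> 1, MSE loss.
model = keras.Sequential([
    keras.layers.Dense(64, activation="relu", input_shape=(train_X.shape[1],)),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1),
])
model.compile(optimizer="adam", loss="mse")
model.fit(train_X, train_y, epochs=50, verbose=0)

# Write predictions back into the missing slots, as the app does.
df.loc[df[col].isna(), col] = model.predict(test_X, verbose=0).flatten()
print(df[col].isna().sum())  # 0

One caveat the sketch sidesteps: in the app, rows whose predictor columns are themselves missing feed NaNs into fit and predict, so this path behaves best after the other imputation options have run.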
@@ -789,106 +821,115 @@ elif app_mode == "Predictions":
     else:
         st.write("Please train a model first in the 'Model Training' section.")
 
-st.title("🔬 Advanced Visualizations")
-
-if st.session_state.cleaned_data is not None:
-    df = st.session_state.cleaned_data.copy()
-
-    # Visualization Type Selection
-    visualization_type = st.selectbox("Select Visualization Type", [
-        "Pair Plot", "Parallel Coordinates Plot", "Andrews Curves", "Pie Chart",
-        "Area Chart", "Density Contour", "Sunburst Chart", "Funnel Chart"
-    ])
-            fig = px.scatter_matrix(df, dimensions=cols_for_pairplot)
-            st.plotly_chart(fig, use_container_width=True)
-        if cols_for_andrews:
-            fig = px.andrews_curves(df[cols_for_andrews + [df.columns[0]]], class_column=df.columns[0])
-            st.plotly_chart(fig, use_container_width=True)
-        st.plotly_chart(fig, use_container_width=True)
-    elif visualization_type == "Density Contour":
-        x_col = st.selectbox("Select X Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
-        y_col = st.selectbox("Select Y Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
-        fig = px.density_contour(df, x=x_col, y=y_col)
-        st.plotly_chart(fig, use_container_width=True)
-    elif
-    st.
-    cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols)
-    if
-    scaled_data = scaler.fit_transform(df[cluster_cols])
-
-    # Number of clusters
-    n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.")
-
-    # Apply K-Means clustering
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-    clusters = kmeans.fit_predict(scaled_data)
-
-    # Add cluster labels to the DataFrame
-    df['Cluster'] = clusters
-
-    # Visualize clusters
-    if len(cluster_cols) == 2:
-        fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
-        st.plotly_chart(fig, use_container_width=True)
-    elif len(cluster_cols) == 3:
-        fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
-        st.plotly_chart(fig, use_container_width=True)
-    else:
-        st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
+st.title("🔬 Advanced Data Visualization and Clustering Lab")
+
+# Initialize session state for cleaned data
+if 'cleaned_data' not in st.session_state:
+    st.session_state.cleaned_data = None
+
+# Sample data upload (replace with your data loading logic)
+uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
+if uploaded_file is not None:
+    try:
+        df = pd.read_csv(uploaded_file)
+        st.session_state.cleaned_data = df
+        st.success("Data loaded successfully!")
+    except Exception as e:
+        st.error(f"Error loading data: {e}")
+
+if st.session_state.cleaned_data is not None:
+    df = st.session_state.cleaned_data.copy()
+
+    # Visualization Type Selection
+    visualization_type = st.selectbox("Select Visualization Type", [
+        "Pair Plot", "Parallel Coordinates Plot", "Andrews Curves", "Pie Chart",
+        "Area Chart", "Density Contour", "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
+    ])
+
+    if visualization_type == "Pair Plot":
+        st.subheader("Pair Plot")
+        cols_for_pairplot = st.multiselect("Select Columns for Pair Plot", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
+        if cols_for_pairplot:
+            fig = px.scatter_matrix(df, dimensions=cols_for_pairplot)
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Parallel Coordinates Plot":
+        st.subheader("Parallel Coordinates Plot")
+        cols_for_parallel = st.multiselect("Select Columns for Parallel Coordinates", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
+        if cols_for_parallel:
+            fig = px.parallel_coordinates(df[cols_for_parallel], color=df[cols_for_parallel[0]] if cols_for_parallel else None)
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Andrews Curves":
+        st.subheader("Andrews Curves")
+        cols_for_andrews = st.multiselect("Select Columns for Andrews Curves", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
+        if cols_for_andrews:
+            fig = px.andrews_curves(df[cols_for_andrews + [df.columns[0]]], class_column=df.columns[0])
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Pie Chart":
+        st.subheader("Pie Chart")
+        col_for_pie = st.selectbox("Select Column for Pie Chart", df.columns)
+        fig = px.pie(df, names=col_for_pie)
+        st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Area Chart":
+        st.subheader("Area Chart")
+        cols_for_area = st.multiselect("Select Columns for Area Chart", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
+        if cols_for_area:
+            fig = px.area(df[cols_for_area])
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Density Contour":
+        st.subheader("Density Contour")
+        x_col = st.selectbox("Select X Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
+        y_col = st.selectbox("Select Y Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
+        fig = px.density_contour(df, x=x_col, y=y_col)
+        st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Sunburst Chart":
+        st.subheader("Sunburst Chart")
+        path_cols = st.multiselect("Select Path Columns for Sunburst Chart", df.columns)
+        if path_cols:
+            fig = px.sunburst(df, path=path_cols)
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Funnel Chart":
+        st.subheader("Funnel Chart")
+        x_col = st.selectbox("Select X Column for Funnel Chart (Values)", df.select_dtypes(include=np.number).columns.tolist())
+        y_col = st.selectbox("Select Y Column for Funnel Chart (Categories)", df.columns)
+        fig = px.funnel(df, x=x_col, y=y_col)
+        st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Clustering Analysis":
+        st.subheader("Clustering Analysis")
+        numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
+
+        if not numerical_cols:
+            st.warning("No numerical columns found for clustering.")
+        else:
+            cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols)
+
+            if cluster_cols:
+                try:
+                    scaler = StandardScaler()
+                    scaled_data = scaler.fit_transform(df[cluster_cols])
+                    n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.")
+                    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+                    clusters = kmeans.fit_predict(scaled_data)
+                    df['Cluster'] = clusters
+
+                    if len(cluster_cols) == 2:
+                        fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
+                        st.plotly_chart(fig, use_container_width=True)
+                    elif len(cluster_cols) == 3:
+                        fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
+                        st.plotly_chart(fig, use_container_width=True)
+                    else:
+                        st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
+                    st.success("Clustering applied successfully!")
+                except Exception as e:
+                    st.error(f"An error occurred during clustering: {e}")
 
 elif app_mode == "Neural Network Studio":
     st.title("🧠 Neural Network Studio")
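The new clustering branch is likewise easy to exercise in isolation. A minimal sketch with hypothetical two-blob toy data, assuming scikit-learn and plotly are installed (n_init is pinned explicitly here only to avoid the default-change warning some scikit-learn versions emit; the app relies on the default):

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Hypothetical toy data: two loose blobs in 2-D.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(300, 2)), columns=["x", "y"])
df.iloc[:150] += 4.0

# Scale, cluster, and label: the same steps the app performs.
scaled = StandardScaler().fit_transform(df[["x", "y"]])
df["Cluster"] = KMeans(n_clusters=2, random_state=42, n_init=10).fit_predict(scaled)

fig = px.scatter(df, x="x", y="y", color="Cluster", title="K-Means Clustering")
fig.show()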
|