CosmickVisions committed
Commit 68a3b7e · verified · Parent: d139bf8

Update app.py

Files changed (1): app.py (+331 −286)

app.py CHANGED
@@ -7,6 +7,11 @@ from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
 from sklearn.svm import SVR, SVC
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
 from sklearn.impute import KNNImputer, SimpleImputer
 from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
@@ -22,6 +27,7 @@ from io import BytesIO
 import base64
 import time
 from sklearn.cluster import KMeans

 # Configurations
 st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
@@ -100,20 +106,38 @@ def show_loader(message="Loading..."):
         unsafe_allow_html=True
     )

-# Cache decorators
-@st.cache_data(ttl=3600)
 def load_data(uploaded_file):
     """Load and cache dataset, with file type validation."""
     if uploaded_file is not None:
         file_extension = uploaded_file.name.split(".")[-1].lower()

-        if file_extension == "csv":
-            return pd.read_csv(uploaded_file)
-        elif file_extension in ["xlsx", "xls"]:
-            return pd.read_excel(uploaded_file)
-        else:
-            st.error("Unsupported file type. Please upload a CSV or Excel file.")
             return None
     else:
         return None
@@ -160,17 +184,6 @@ app_mode = st.sidebar.radio("Navigate", [
     "Neural Network Studio"  # New option
 ])

-# --- Progress Bar ----
-def animated_progress_bar(progress_var, message="Processing..."):
-    """Displays an animated progress bar with a message."""
-    progress_bar = st.progress(0)
-    status_text = st.empty()  # Empty element to update the status message
-
-    for i in range(progress_var):
-        status_text.text(f"{message} ({i+1}/{progress_var})")
-        progress_bar.progress((i+1)/progress_var)  # increment the bar
-        time.sleep(0.01)
-
 # --- Main App Logic ---
 if app_mode == "Data Upload":
     st.title("📤 Data Upload & Initial Analysis")
@@ -193,16 +206,19 @@ if app_mode == "Data Upload":
         unsafe_allow_html=True,
     )

-    uploaded_file = st.file_uploader("Choose a CSV or Excel file", type=["csv", "xlsx"], help="Upload your dataset here. Supported formats: CSV, XLSX")
-
     if uploaded_file:
         df = load_data(uploaded_file)
-        if df is not None:  # only proceed if load_data returned a valid dataframe
             st.session_state.raw_data = df
             st.session_state.cleaned_data = df.copy()
-
             st.subheader("Data Overview")
-
             # Data Overview Cards with more context
             col1, col2, col3 = st.columns(3)
             with col1:
@@ -212,180 +228,100 @@ if app_mode == "Data Upload":
             with col3:
                 num_missing = df.isna().sum().sum()
                 st.metric("Total Missing Values", num_missing, help="Total number of missing entries across the entire dataset.")
-
             # Display Data Types
             st.write("Column Data Types:")
             dtype_counts = df.dtypes.value_counts().to_dict()
             for dtype, count in dtype_counts.items():
                 st.write(f"- {dtype}: {count} column(s)")
-
             # Sample Data Table with improved display
             st.subheader("Sample Data")
             num_rows_preview = st.slider("Number of Rows to Preview", 5, 20, 10, help="Adjust the number of rows displayed in the sample data.")
-            st.dataframe(df.head(num_rows_preview), use_container_width=True)  # full container usage
-
-            # Column Statistics Expander
             with st.expander("📊 Column Statistics"):
                 for col in df.columns:
                     st.subheader(f"Column: {col}")
                     st.write(f"Data type: {df[col].dtype}")
-
                     if pd.api.types.is_numeric_dtype(df[col]):
                         st.write("Summary Statistics:")
                         st.write(df[col].describe())
                     else:
                         st.write("Value Counts:")
                         st.write(df[col].value_counts())
-
             # Automated EDA Report
             with st.expander("🚀 Automated Data Report"):
                 if st.button("Generate Smart Report"):
                     show_loader("Generating EDA Report")
                     pr = generate_profile(df)
                     st_profile_report(pr)
-
 elif app_mode == "Smart Cleaning":
     st.title("🧼 Intelligent Data Cleaning")
-
-    if st.session_state.raw_data is not None:
-        df = st.session_state.cleaned_data
-
-        # Initialize history if not exists
-        if 'data_history' not in st.session_state:
-            st.session_state.data_history = [df.copy()]
-
-        # Cleaning Toolkit
-        col1, col2 = st.columns([1, 3])
-        with col1:
-            st.subheader("Cleaning Actions")
-
-            # Reset and Undo buttons
-            col1a, col1b = st.columns(2)
-            with col1a:
-                if st.button("Reset to Original"):
-                    st.session_state.cleaned_data = st.session_state.raw_data.copy()
-                    st.session_state.data_history = [st.session_state.raw_data.copy()]
-                    st.experimental_rerun()
-            with col1b:
-                if len(st.session_state.data_history) > 1:
-                    if st.button("Undo Last Action"):
-                        st.session_state.data_history.pop()
-                        st.session_state.cleaned_data = st.session_state.data_history[-1].copy()
-                        st.experimental_rerun()
-
-            # Cleaning Operations
-            clean_action = st.selectbox("Choose Operation", [
-                "Handle Missing Values",
-                "Remove Duplicates",
-                "Remove Columns",
-                "Normalize Data",
-                "Encode Categories",
-                "Outlier Removal",
-                "Neural Network Prep"
-            ])
-
-            # Dynamic Configuration
-            if clean_action == "Handle Missing Values":
-                method = st.selectbox("Imputation Method", [
-                    "KNN Imputation",
-                    "Median Fill",
-                    "Mean Fill",
-                    "Drop Missing"
-                ])
-                if method == "KNN Imputation":
-                    knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5)
-
-            elif clean_action == "Neural Network Prep":
-                st.markdown("**Neural Network Specific Preparation**")
-                model_choice = st.radio("Model Type", ["RNN", "CNN"])
-                seq_length = st.number_input("Sequence Length (for RNN)", 10, 100, 30)
-                st.info("Prepares data for neural network training.")
-
-            elif clean_action == "Normalize Data":
-                scaler_type = st.selectbox("Scaler Type", ["RobustScaler", "StandardScaler"])
-
-            elif clean_action == "Encode Categories":
-                encoding_method = st.selectbox("Encoding Method", ["OneHotEncoder"])
-
-            elif clean_action == "Outlier Removal":
-                outlier_method = st.selectbox("Outlier Removal Method", ["IQR", "Z-score"])
-                if outlier_method == "IQR":
-                    iqr_threshold = st.slider("IQR Threshold", 1.0, 3.0, 1.5)
-                else:
-                    zscore_threshold = st.slider("Z-score Threshold", 2.0, 4.0, 3.0)
-
-            elif clean_action == "Remove Columns":
-                remove_cols = st.multiselect("Columns to Remove", df.columns)
-
-        with col2:
-            if st.button("Apply Transformation"):
-                with st.spinner("Applying changes..."):
-                    current_df = df.copy()
-                    st.session_state.data_history.append(current_df)
-
-                    # Handle Missing Values
-                    if clean_action == "Handle Missing Values":
-                        if method == "KNN Imputation":
-                            imputer = KNNImputer(n_neighbors=knn_neighbors)
-                            current_df = pd.DataFrame(imputer.fit_transform(current_df), columns=current_df.columns)
-                        elif method == "Median Fill":
-                            current_df = current_df.fillna(current_df.median())
-                        elif method == "Mean Fill":
-                            current_df = current_df.fillna(current_df.mean())
-                        else:
-                            current_df = current_df.dropna()
-
-                    # Remove Columns
-                    elif clean_action == "Remove Columns":
-                        if remove_cols:
-                            current_df = current_df.drop(columns=remove_cols)
-
-                    # Normalize Data
-                    elif clean_action == "Normalize Data":
-                        scaler = RobustScaler() if scaler_type == "RobustScaler" else StandardScaler()
-                        num_cols = current_df.select_dtypes(include=np.number).columns
-                        current_df[num_cols] = scaler.fit_transform(current_df[num_cols])
-
-                    # Encode Categories
-                    elif clean_action == "Encode Categories":
-                        cat_cols = current_df.select_dtypes(include='object').columns
-                        if len(cat_cols) > 0:
-                            encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
-                            encoded_data = encoder.fit_transform(current_df[cat_cols])
-                            encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(cat_cols))
-                            current_df = pd.concat([current_df.drop(columns=cat_cols), encoded_df], axis=1)
-
-                    # Outlier Removal
-                    elif clean_action == "Outlier Removal":
-                        num_cols = current_df.select_dtypes(include=np.number).columns
-                        for col in num_cols:
-                            if outlier_method == "IQR":
-                                Q1 = current_df[col].quantile(0.25)
-                                Q3 = current_df[col].quantile(0.75)
-                                IQR = Q3 - Q1
-                                lower_bound = Q1 - iqr_threshold * IQR
-                                upper_bound = Q3 + iqr_threshold * IQR
-                                current_df = current_df[(current_df[col] >= lower_bound) & (current_df[col] <= upper_bound)]
-                            else:
-                                z_scores = np.abs((current_df[col] - current_df[col].mean()) / current_df[col].std())
-                                current_df = current_df[z_scores <= zscore_threshold]
-
-                    # Neural Network Prep
-                    elif clean_action == "Neural Network Prep":
-                        st.info("Data prepared for neural network training.")
-
-                    st.session_state.cleaned_data = current_df
-                    st.success("Transformation applied!")
-
-        # Data Comparison
-        st.subheader("Data Version Comparison")
-        col_orig, col_clean = st.columns(2)
-        with col_orig:
-            st.markdown("**Original Data**")
-            st.dataframe(st.session_state.raw_data.head(5), use_container_width=True)
-        with col_clean:
-            st.markdown("**Cleaned Data**")
-            st.dataframe(df.head(5), use_container_width=True)

 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Analysis")
@@ -586,109 +522,114 @@ elif app_mode == "Advanced EDA":
             st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
             st.error(f"An error occurred while generating the plot: {e}")

 elif app_mode == "Model Training":
     st.title("🚂 Model Training")

-    if st.session_state.cleaned_data is not None:
-        df = st.session_state.cleaned_data.copy()
-
-        # Target Variable Selection
-        target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
-
-        # Problem Type Selection
-        problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of problem.")
-
-        # Feature Selection
-        feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose features for training.")
-
-        # Model Selection
-        model_name = st.selectbox("Select Model", [
-            "Linear Regression", "Logistic Regression", "Decision Tree",
-            "Random Forest", "Gradient Boosting", "SVM"
-        ], help="Choose a model.")
-
-        # Hyperparameter Tuning (Example - Add more as needed)
-        if model_name == "Random Forest":
-            n_estimators = st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest.")
-            max_depth = st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.")
-
-        # Train-Test Split
-        test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
-
-        if st.button("Train Model"):
-            with st.spinner("Training model..."):
-                try:
-                    X = df[feature_columns]
-                    y = df[target_column]
-                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-
-                    # Preprocessing Pipeline
-                    numeric_features = X.select_dtypes(include=np.number).columns
-                    categorical_features = X.select_dtypes(exclude=np.number).columns
-
-                    numeric_transformer = Pipeline(steps=[
-                        ('imputer', SimpleImputer(strategy='median')),
-                        ('scaler', StandardScaler())
-                    ])
-                    categorical_transformer = Pipeline(steps=[
-                        ('imputer', SimpleImputer(strategy='most_frequent')),
-                        ('onehot', OneHotEncoder(handle_unknown='ignore'))
-                    ])
-
-                    preprocessor = ColumnTransformer(
-                        transformers=[
-                            ('num', numeric_transformer, numeric_features),
-                            ('cat', categorical_transformer, categorical_features)
-                        ])
-
-                    X_train_processed = preprocessor.fit_transform(X_train)
-                    X_test_processed = preprocessor.transform(X_test)

-                    # Model Training
-                    if model_name == "Linear Regression":
-                        model = LinearRegression()
-                    elif model_name == "Logistic Regression":
-                        model = LogisticRegression(max_iter=1000)
-                    elif model_name == "Decision Tree":
-                        if problem_type == "Regression":
-                            model = DecisionTreeRegressor()
-                        else:
-                            model = DecisionTreeClassifier()
-                    elif model_name == "Random Forest":
-                        if problem_type == "Regression":
-                            model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
-                        else:
-                            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
-                    elif model_name == "Gradient Boosting":
-                        model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
-                    elif model_name == "SVM":
-                        model = SVR() if problem_type == "Regression" else SVC()
-
-                    model.fit(X_train_processed, y_train)
-
-                    # Store model and preprocessor
-                    st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
-                    st.session_state.preprocessor = preprocessor
-
-                    # Model Evaluation
-                    y_pred = model.predict(X_test_processed)
-                    if problem_type == "Regression":
-                        mse = mean_squared_error(y_test, y_pred)
-                        r2 = r2_score(y_test, y_pred)
-                        st.write(f"Mean Squared Error: {mse:.4f}")
-                        st.write(f"R-squared: {r2:.4f}")
-                    else:
-                        accuracy = accuracy_score(y_test, y_pred)
-                        st.write(f"Accuracy: {accuracy:.4f}")
-
-                    st.success("Model trained successfully!")

-                except Exception as e:
-                    st.error(f"An error occurred: {e}")
-    else:
-        st.write("Please upload and clean data first.")

 elif app_mode == "Predictions":
     st.title("🔮 Make Predictions")
@@ -729,6 +670,29 @@ elif app_mode == "Predictions":
     else:
         st.write("Please train a model first in the 'Model Training' section.")

 elif app_mode == "Visualization Lab":
     st.title("🔬 Advanced Data Visualization and Clustering Lab")
@@ -839,6 +803,46 @@ if st.session_state.cleaned_data is not None:
             st.success("Clustering applied successfully!")
         except Exception as e:
             st.error(f"An error occurred during clustering: {e}")

 elif app_mode == "Neural Network Studio":
     st.title("🧠 Neural Network Studio")
@@ -882,7 +886,7 @@ elif app_mode == "Neural Network Studio":
         test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the data to use for testing.")

         # Model Training Button
-        if st.button("Train Neural Network Model"):
             with st.spinner("Training neural network model..."):
                 try:
                     # Split data
@@ -900,8 +904,8 @@ elif app_mode == "Neural Network Studio":
                     ('onehot', OneHotEncoder(handle_unknown='ignore'))
                 ])

-                numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
-                categorical_features = X_train.select_dtypes(include=['object']).columns

                 preprocessor = ColumnTransformer(
                     transformers=[
@@ -913,18 +917,29 @@ elif app_mode == "Neural Network Studio":
                 X_test_processed = preprocessor.transform(X_test)

                 # Neural Network Model Selection and Training
                 if model_type == "Simple Neural Network":
                     model = keras.Sequential()
                     model.add(layers.Input(shape=(X_train_processed.shape[1],)))
                     for _ in range(hidden_layers):
-                        model.add(layers.Dense(neurons_per_layer, activation='relu'))
-                    model.add(layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)), activation='linear' if problem_type == "Regression" else 'softmax'))

-                    model.compile(optimizer='adam',
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

-                    model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)

                     y_pred = model.predict(X_test_processed)
                     if problem_type == "Classification":
@@ -935,17 +950,23 @@ elif app_mode == "Neural Network Studio":
                     X_test_cnn = np.expand_dims(X_test_processed, axis=2)

                     model = keras.Sequential()
-                    model.add(layers.Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)))
-                    model.add(layers.MaxPooling1D(pool_size=2))
                     model.add(layers.Flatten())
                     model.add(layers.Dense(50, activation='relu'))
-                    model.add(layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)), activation='linear' if problem_type == "Regression" else 'softmax'))

-                    model.compile(optimizer='adam',
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

-                    model.fit(X_train_cnn, y_train, epochs=epochs_cnn, batch_size=batch_size_cnn, validation_split=0.2, verbose=0)

                     y_pred = model.predict(X_test_cnn)
                     if problem_type == "Classification":
@@ -953,18 +974,28 @@ elif app_mode == "Neural Network Studio":

                 elif model_type == "Recurrent Neural Network (RNN)":
                     try:
-                        X_train_rnn = np.reshape(X_train_processed, (X_train_processed.shape[0], sequence_length, X_train_processed.shape[1] // sequence_length))
-                        X_test_rnn = np.reshape(X_test_processed, (X_test_processed.shape[0], sequence_length, X_test_processed.shape[1] // sequence_length))

                         model = keras.Sequential()
-                        model.add(layers.SimpleRNN(50, activation='relu', input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
-                        model.add(layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)), activation='linear' if problem_type == "Regression" else 'softmax'))
-
-                        model.compile(optimizer='adam',
                                       loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                       metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

-                        model.fit(X_train_rnn, y_train, epochs=epochs_rnn, batch_size=batch_size_rnn, validation_split=0.2, verbose=0)

                         y_pred = model.predict(X_test_rnn)
                         if problem_type == "Classification":
@@ -995,7 +1026,21 @@ elif app_mode == "Neural Network Studio":
                     st.write("Classification Report:")
                     st.text(classification_report(y_test, y_pred))

                 st.success("Model trained successfully!")

             except Exception as e:
                 st.error(f"An error occurred during training: {e}")
app.py AFTER CHANGES

 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
 from sklearn.svm import SVR, SVC
+from sklearn.decomposition import PCA  # Import at top
+from sklearn.metrics import silhouette_score  # Import at top
+from sklearn.cluster import DBSCAN  # Import at top
+from sklearn.feature_selection import SelectKBest  # Import at top
+import joblib  # Import at top
+import mimetypes  # required by load_data's MIME check below
+import re  # required by the Clean Text helper below
+from sklearn.model_selection import GridSearchCV, cross_val_score  # required by the tuning code below
+from tensorflow.keras.callbacks import EarlyStopping  # required by the Neural Network Studio code below
+import matplotlib.pyplot as plt  # required by the training-history plot below
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
 from sklearn.impute import KNNImputer, SimpleImputer
 from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
 import base64
 import time
 from sklearn.cluster import KMeans
+import scipy.stats as stats

 # Configurations
 st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
         unsafe_allow_html=True
     )

+@st.cache_data(ttl=3600)  # note: allow_output_mutation belonged to the legacy st.cache and is not a st.cache_data kwarg
 def load_data(uploaded_file):
     """Load and cache dataset, with file type validation."""
     if uploaded_file is not None:
         file_extension = uploaded_file.name.split(".")[-1].lower()
+        mime_type = mimetypes.guess_type(uploaded_file.name)[0]
+
+        max_file_size_mb = 50  # Set a maximum file size (adjust as needed)
+        file_size_mb = uploaded_file.size / (1024 * 1024)
+        if file_size_mb > max_file_size_mb:
+            st.error(f"File size exceeds the limit of {max_file_size_mb} MB.")
             return None
+
+        try:  # Wrap file reading in a try...except
+            if file_extension == "csv" or mime_type == 'text/csv':
+                df = pd.read_csv(uploaded_file)
+                return df
+            elif file_extension in ["xlsx", "xls"] or mime_type in ['application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']:
+                df = pd.read_excel(uploaded_file)
+                return df
+            else:
+                st.error("Unsupported file type. Please upload a CSV or Excel file.")
+                return None
+        except FileNotFoundError:
+            st.error("File not found. Please check the file path.")
+        except pd.errors.ParserError:  # catch pandas-specific parsing errors
+            st.error("Error parsing the file. Make sure it's a valid CSV or Excel file.")
+        except Exception as e:
+            st.error(f"An unexpected error occurred: {type(e).__name__} - {str(e)}")
+        return None  # handle other potential exceptions
+
     else:
         return None
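Review note: the guard logic above can be smoke-tested outside Streamlit with a stand-in object that mimics the only attributes load_data touches (name, size, and file-like reads). This is a sketch under that assumption; FakeUpload is a hypothetical test helper, not part of this commit:

    import io

    class FakeUpload(io.BytesIO):
        # Mimics Streamlit's UploadedFile closely enough for load_data:
        # it is file-like and exposes .name and .size.
        def __init__(self, name, payload: bytes):
            super().__init__(payload)
            self.name = name
            self.size = len(payload)

    df = load_data(FakeUpload("tiny.csv", b"a,b\n1,2\n3,4\n"))   # parsed via pd.read_csv
    assert load_data(FakeUpload("notes.txt", b"hello")) is None  # unsupported type -> None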

     "Neural Network Studio"  # New option
 ])

 # --- Main App Logic ---
 if app_mode == "Data Upload":
     st.title("📤 Data Upload & Initial Analysis")
         unsafe_allow_html=True,
     )

+    uploaded_file = st.file_uploader(
+        "Choose a CSV or Excel file", type=["csv", "xlsx"],
+        help="Upload your dataset here. Supported formats: CSV, XLSX"
+    )
+
     if uploaded_file:
         df = load_data(uploaded_file)
+        if df is not None:
+            # only proceed if load_data returned a valid dataframe
             st.session_state.raw_data = df
             st.session_state.cleaned_data = df.copy()
+
             st.subheader("Data Overview")
             # Data Overview Cards with more context
             col1, col2, col3 = st.columns(3)
             with col1:
             with col3:
                 num_missing = df.isna().sum().sum()
                 st.metric("Total Missing Values", num_missing, help="Total number of missing entries across the entire dataset.")
+
             # Display Data Types
             st.write("Column Data Types:")
             dtype_counts = df.dtypes.value_counts().to_dict()
             for dtype, count in dtype_counts.items():
                 st.write(f"- {dtype}: {count} column(s)")
+
             # Sample Data Table with improved display
             st.subheader("Sample Data")
             num_rows_preview = st.slider("Number of Rows to Preview", 5, 20, 10, help="Adjust the number of rows displayed in the sample data.")
+            st.dataframe(df.head(num_rows_preview), use_container_width=True)
+
+            # Column Statistics
             with st.expander("📊 Column Statistics"):
                 for col in df.columns:
                     st.subheader(f"Column: {col}")
                     st.write(f"Data type: {df[col].dtype}")
                     if pd.api.types.is_numeric_dtype(df[col]):
                         st.write("Summary Statistics:")
                         st.write(df[col].describe())
                     else:
                         st.write("Value Counts:")
                         st.write(df[col].value_counts())
+
             # Automated EDA Report
             with st.expander("🚀 Automated Data Report"):
                 if st.button("Generate Smart Report"):
                     show_loader("Generating EDA Report")
                     pr = generate_profile(df)
                     st_profile_report(pr)
+
 elif app_mode == "Smart Cleaning":
     st.title("🧼 Intelligent Data Cleaning")
+            elif clean_action == "Handle Missing Values":
+                columns_with_missing = df.columns[df.isnull().any()].tolist()
+                column_to_impute = st.selectbox("Column to Impute", ["All Columns"] + columns_with_missing)  # choose column
+
+                method = st.selectbox("Imputation Method", [
+                    "KNN Imputation",
+                    "Median Fill",
+                    "Mean Fill",
+                    "Drop Missing",
+                    "Constant Value Fill"  # new
+                ])
+                if method == "KNN Imputation":
+                    knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5)
+                elif method == "Constant Value Fill":
+                    constant_value = st.text_input("Constant Value")
+
+            elif clean_action == "Clean Text":
+                text_column = st.selectbox("Text Column", df.select_dtypes(include='object').columns)
+                cleaning_operation = st.selectbox("Cleaning Operation", ["Remove Special Characters", "Lowercase", "Uppercase", "Remove Extra Spaces"])
+                chars_to_remove = r'[^a-zA-Z0-9\s]'  # default pattern so the apply() below always has a value
+                if cleaning_operation == "Remove Special Characters":
+                    chars_to_remove = st.text_input("Characters to Remove", chars_to_remove)
+
+            # Inside the Apply Transformation button section
+                    elif clean_action == "Handle Missing Values":
+                        if method == "KNN Imputation":
+                            imputer = KNNImputer(n_neighbors=knn_neighbors)
+                            if column_to_impute == "All Columns":
+                                current_df = pd.DataFrame(imputer.fit_transform(current_df), columns=current_df.columns)
+                            else:
+                                current_df[[column_to_impute]] = imputer.fit_transform(current_df[[column_to_impute]])
+                        elif method == "Median Fill":
+                            if column_to_impute == "All Columns":
+                                current_df = current_df.fillna(current_df.median())
+                            else:
+                                current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].median())
+                        elif method == "Mean Fill":
+                            if column_to_impute == "All Columns":
+                                current_df = current_df.fillna(current_df.mean())
+                            else:
+                                current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].mean())
+                        elif method == "Constant Value Fill":
+                            if column_to_impute == "All Columns":
+                                current_df = current_df.fillna(constant_value)
+                            else:
+                                current_df[column_to_impute] = current_df[column_to_impute].fillna(constant_value)
+                        else:
+                            current_df = current_df.dropna()
+
+                    elif clean_action == "Clean Text":
+                        def clean_text(text, operation, chars_to_remove=r'[^a-zA-Z0-9\s]'):
+                            if operation == "Remove Special Characters":
+                                text = re.sub(chars_to_remove, '', str(text))  # re is imported at top
+                            elif operation == "Lowercase":
+                                text = str(text).lower()
+                            elif operation == "Uppercase":
+                                text = str(text).upper()
+                            elif operation == "Remove Extra Spaces":
+                                text = " ".join(str(text).split())
+                            return text
+
+                        current_df[text_column] = current_df[text_column].apply(lambda x: clean_text(x, cleaning_operation, chars_to_remove))
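Review note: st.text_input always returns a string, so Constant Value Fill as written will silently upcast numeric columns to object dtype. A hedged sketch of a coercion guard (coerce_fill_value is a hypothetical helper, not in this commit):

    import pandas as pd

    def coerce_fill_value(series: pd.Series, raw: str):
        # Cast the text-input constant to the column's dtype where possible.
        if pd.api.types.is_numeric_dtype(series):
            try:
                return float(raw)
            except ValueError:
                pass  # fall through and fill with the raw string
        return raw

    s = pd.Series([1.0, None, 3.0])
    filled = s.fillna(coerce_fill_value(s, "2"))  # stays float64 instead of object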
 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Analysis")

             st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
             st.error(f"An error occurred while generating the plot: {e}")
+        with st.expander("🧪 Hypothesis Testing"):
+            test_type = st.selectbox("Select Test Type", ["T-test", "Chi-Squared Test"])
+
+            if test_type == "T-test":
+                col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
+                col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
+                if st.button("Run T-test"):
+                    # Example: split data by category and perform a two-sample t-test
+                    groups = df.groupby(col2)[col1].apply(list)
+                    if len(groups) == 2:
+                        t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
+                        st.write(f"T-statistic: {t_stat:.4f}")
+                        st.write(f"P-value: {p_value:.4f}")
+                        if p_value < 0.05:
+                            st.write("Reject the null hypothesis.")
+                        else:
+                            st.write("Fail to reject the null hypothesis.")
+                    else:
+                        st.write("Select a categorical column with exactly two categories.")

 elif app_mode == "Model Training":
     st.title("🚂 Model Training")

+        feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])

+        if model_name == "Random Forest":
+            param_grid = {
+                # GridSearchCV expects lists of candidate values, so each slider choice is wrapped in a list
+                'n_estimators': [st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest.")],
+                'max_depth': [st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.")],
+                'min_samples_split': [st.slider("Minimum Samples Split", 2, 10, 2, help="Minimum samples required to split an internal node")],  # new hyperparameter
+                'min_samples_leaf': [st.slider("Minimum Samples Leaf", 1, 10, 1, help="Minimum samples required to be at a leaf node")],  # new hyperparameter
+            }

+        # Inside the Train Model button
+        if st.button("Train Model"):
+            # Feature Selection
+            if feature_selection_method == "SelectKBest":
+                k = st.slider("Number of Features to Select", 1, len(feature_columns), len(feature_columns))  # note: widgets created inside a button branch reset on the next rerun (see the sketch below)
+                selector = SelectKBest(k=k)
+                X_train_selected = selector.fit_transform(X_train_processed, y_train)
+                X_test_selected = selector.transform(X_test_processed)
+            else:
+                X_train_selected = X_train_processed
+                X_test_selected = X_test_processed
+            # Model Training and Hyperparameter Tuning
+            if model_name == "Linear Regression":
+                model = LinearRegression()
+            elif model_name == "Logistic Regression":
+                model = LogisticRegression(max_iter=1000)
+            elif model_name == "Decision Tree":
+                if problem_type == "Regression":
+                    model = DecisionTreeRegressor()
+                else:
+                    model = DecisionTreeClassifier()
+            elif model_name == "Random Forest":
+                if problem_type == "Regression":
+                    model = RandomForestRegressor(random_state=42)
+                    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')  # example scoring
+                    grid_search.fit(X_train_selected, y_train)
+                    model = grid_search.best_estimator_
+                    st.write("Best Parameters:", grid_search.best_params_)
+                else:
+                    model = RandomForestClassifier(random_state=42)
+                    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
+                    grid_search.fit(X_train_selected, y_train)
+                    model = grid_search.best_estimator_
+                    st.write("Best Parameters:", grid_search.best_params_)
+
+            elif model_name == "Gradient Boosting":
+                model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
+            elif model_name == "SVM":
+                model = SVR() if problem_type == "Regression" else SVC()
+
+            # Cross-validation
+            cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5)  # example; adjust cv
+            st.write(f"Cross-validation scores: {cv_scores}")
+            st.write(f"Mean cross-validation score: {cv_scores.mean():.4f}")
+
+            model.fit(X_train_selected, y_train)
+
+            # Model Saving
+            model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
+            if st.button("Save Model"):  # note: a button nested inside another button's branch cannot fire before the rerun resets it (see the sketch below)
+                try:
+                    joblib.dump(st.session_state.model, f"{model_filename}.joblib")
+                    st.success(f"Model saved as {model_filename}.joblib")
+                except Exception as e:
+                    st.error(f"Error saving model: {e}")
+            # Model loading in a different section
+            model_file = st.file_uploader("Upload Trained Model", type=["joblib"])
+            if model_file is not None:
+                try:
+                    st.session_state.model = joblib.load(model_file)
+                    st.success("Model loaded successfully!")
+                except Exception as e:
+                    st.error(f"Error loading model: {e}")
+
+            # Model Evaluation Section
+            y_pred = model.predict(X_test_selected)
+
+            if problem_type == "Regression":
+                mse = mean_squared_error(y_test, y_pred)
+                r2 = r2_score(y_test, y_pred)
+                st.write(f"Mean Squared Error: {mse:.4f}")
+                st.write(f"R-squared: {r2:.4f}")
+            else:
+                accuracy = accuracy_score(y_test, y_pred)
+                st.write(f"Accuracy: {accuracy:.4f}")

 elif app_mode == "Predictions":
     st.title("🔮 Make Predictions")

     else:
         st.write("Please train a model first in the 'Model Training' section.")
+    # Batch prediction section in the Predictions tab
+    st.subheader("Batch Predictions")
+    batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
+    if batch_file is not None:
+        try:
+            batch_df = pd.read_csv(batch_file)
+            # Preprocess the batch data
+            batch_processed = st.session_state.preprocessor.transform(batch_df)
+            # Make predictions
+            batch_predictions = st.session_state.model.predict(batch_processed)
+            batch_df['Prediction'] = batch_predictions
+            st.dataframe(batch_df)
+
+            # Download predictions
+            csv = batch_df.to_csv(index=False)
+            b64 = base64.b64encode(csv.encode()).decode()  # base64-encode the CSV for a data-URI download link
+            href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
+            st.markdown(href, unsafe_allow_html=True)
+
+        except Exception as e:
+            st.error(f"Error processing batch file: {e}")

 elif app_mode == "Visualization Lab":
     st.title("🔬 Advanced Data Visualization and Clustering Lab")

             st.success("Clustering applied successfully!")
         except Exception as e:
             st.error(f"An error occurred during clustering: {e}")
+        # Clustering performance in the clustering analysis
+        if len(cluster_cols) >= 2:  # evaluate silhouette score
+            try:
+                silhouette_avg = silhouette_score(scaled_data, clusters)
+                st.write(f"Silhouette Score: {silhouette_avg:.4f}")
+            except Exception:
+                st.write("Could not compute silhouette score")
+
+        # Dimensionality reduction option and 2D/3D plots
+        dimension_reduction = st.selectbox("Dimensionality Reduction", ["None", "PCA"])
+        if dimension_reduction == "PCA":
+            n_components = st.slider("Number of Components", 2, min(3, len(cluster_cols)), 2)
+            pca = PCA(n_components=n_components)
+            principal_components = pca.fit_transform(scaled_data)
+            pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i + 1}' for i in range(n_components)])
+            pca_df['Cluster'] = clusters  # add cluster labels
+
+        if len(cluster_cols) >= 2:  # plotting section
+            fig = None  # initialize fig
+            if dimension_reduction == "None":
+                if len(cluster_cols) == 2:
+                    fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
+                    st.plotly_chart(fig, use_container_width=True)
+                elif len(cluster_cols) == 3:
+                    fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
+                    st.plotly_chart(fig, use_container_width=True)
+                else:
+                    st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
+
+            elif dimension_reduction == "PCA":
+                if n_components == 2:
+                    fig = px.scatter(pca_df, x='PC1', y='PC2', color='Cluster', title="K-Means Clustering (PCA - 2D)")
+                    st.plotly_chart(fig, use_container_width=True)
+                elif n_components == 3:
+                    fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color='Cluster', title="K-Means Clustering (PCA - 3D)")
+                    st.plotly_chart(fig, use_container_width=True)
+                else:
+                    st.write("PCA visualization is only supported for 2 or 3 components.")

 elif app_mode == "Neural Network Studio":
     st.title("🧠 Neural Network Studio")

         test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the data to use for testing.")

         # Model Training Button
+        if st.button("Train Neural Network Model"):
             with st.spinner("Training neural network model..."):
                 try:
                     # Split data

                     ('onehot', OneHotEncoder(handle_unknown='ignore'))
                 ])

+                numeric_features = X_train.select_dtypes(include=np.number).columns
+                categorical_features = X_train.select_dtypes(include='object').columns

                 preprocessor = ColumnTransformer(
                     transformers=[
                 X_test_processed = preprocessor.transform(X_test)

                 # Neural Network Model Selection and Training
+                tf.random.set_seed(42)  # for reproducibility (assumes tensorflow is imported as tf)
+
+                # Callbacks (Early Stopping)
+                early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
+
                 if model_type == "Simple Neural Network":
                     model = keras.Sequential()
                     model.add(layers.Input(shape=(X_train_processed.shape[1],)))
                     for _ in range(hidden_layers):
+                        model.add(layers.Dense(neurons_per_layer, activation=activation))  # use the selected activation
+                    model.add(
+                        layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+                                     activation='linear' if problem_type == "Regression" else 'softmax'))

+                    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)  # use the selected learning rate
+
+                    model.compile(optimizer=optimizer,
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

+                    history = model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size,
+                                        validation_split=0.2, verbose=0,
+                                        callbacks=[early_stopping])  # added early stopping

                     y_pred = model.predict(X_test_processed)
                     if problem_type == "Classification":

                     X_test_cnn = np.expand_dims(X_test_processed, axis=2)

                     model = keras.Sequential()
+                    model.add(layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu',
+                                            input_shape=(X_train_cnn.shape[1], 1)))
+                    model.add(layers.MaxPooling1D(pool_size=pooling_size))
                     model.add(layers.Flatten())
                     model.add(layers.Dense(50, activation='relu'))
+                    model.add(
+                        layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+                                     activation='linear' if problem_type == "Regression" else 'softmax'))

+                    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
+                    model.compile(optimizer=optimizer,
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

+                    history = model.fit(X_train_cnn, y_train, epochs=epochs_cnn, batch_size=batch_size_cnn,
+                                        validation_split=0.2, verbose=0,
+                                        callbacks=[early_stopping])

                     y_pred = model.predict(X_test_cnn)
                     if problem_type == "Classification":

                 elif model_type == "Recurrent Neural Network (RNN)":
                     try:
+                        X_train_rnn = np.reshape(X_train_processed, (
+                            X_train_processed.shape[0], sequence_length,
+                            X_train_processed.shape[1] // sequence_length))  # note: fails unless the feature count divides evenly (see the sketch below)
+                        X_test_rnn = np.reshape(X_test_processed, (
+                            X_test_processed.shape[0], sequence_length, X_test_processed.shape[1] // sequence_length))

                         model = keras.Sequential()
+                        model.add(layers.SimpleRNN(units, activation='relu',  # use the selected units
+                                                   dropout=dropout_rate,
+                                                   input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
+                        model.add(
+                            layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+                                         activation='linear' if problem_type == "Regression" else 'softmax'))
+
+                        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
+                        model.compile(optimizer=optimizer,
                                       loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                       metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

+                        history = model.fit(X_train_rnn, y_train, epochs=epochs_rnn, batch_size=batch_size_rnn,
+                                            validation_split=0.2, verbose=0,
+                                            callbacks=[early_stopping])

                         y_pred = model.predict(X_test_rnn)
                         if problem_type == "Classification":
1026
  st.write("Classification Report:")
1027
  st.text(classification_report(y_test, y_pred))
1028
 
1029
+ # Visualization
1030
+ st.subheader("Training History")
1031
+ fig, ax = plt.subplots() # Use matplotlib directly
1032
+
1033
+ ax.plot(history.history['loss'], label='loss')
1034
+ ax.plot(history.history['val_loss'], label='val_loss')
1035
+ ax.set_xlabel('Epoch')
1036
+ ax.set_ylabel('Loss')
1037
+ ax.legend()
1038
+ st.pyplot(fig) # Display with st.pyplot
1039
+
1040
  st.success("Model trained successfully!")
1041
 
1042
+ except Exception as e:
1043
+ st.error(f"An error occurred during training: {e}")
1044
+
1045
  except Exception as e:
1046
  st.error(f"An error occurred during training: {e}")