CosmickVisions committed on
Commit
b9d21cf
·
verified Β·
1 Parent(s): 5cb75ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -174
app.py CHANGED
@@ -153,20 +153,20 @@ app_mode = st.sidebar.selectbox(
153
  help="Choose the section to navigate to."
154
  )
155
 
 
 
 
 
156
  # --- Data Upload Page ---
 
157
  if app_mode == "Data Upload":
158
- st.title("📤 Smart Data Hub")
159
  st.markdown("""
160
- **Upload your dataset** (CSV, Excel, Parquet) for comprehensive analysis.
161
- Get instant data health insights and quality assessment.
162
  """)
163
 
164
- # File upload with enhanced UI
165
- uploaded_file = st.file_uploader(
166
- "Drag & drop or browse files",
167
- type=list(ALLOWED_EXTENSIONS),
168
- help=f"Max file size: {MAX_FILE_SIZE_MB}MB. Supported formats: {', '.join(ALLOWED_EXTENSIONS)}"
169
- )
170
 
171
  if uploaded_file:
172
  # Validate file
@@ -174,9 +174,9 @@ if app_mode == "Data Upload":
174
  if not is_valid:
175
  st.error(f"Upload error: {message}")
176
  st.stop()
177
-
178
  # Load data with progress
179
- with st.spinner(f"Loading {uploaded_file.name}..."):
180
  try:
181
  if uploaded_file.name.endswith('.csv'):
182
  df = pd.read_csv(uploaded_file, low_memory=False)
@@ -186,10 +186,8 @@ if app_mode == "Data Upload":
186
  df = pd.read_parquet(uploaded_file)
187
  elif uploaded_file.name.endswith('.feather'):
188
  df = pd.read_feather(uploaded_file)
189
-
190
  st.session_state.raw_data = df
191
  st.success("Dataset loaded successfully!")
192
-
193
  except Exception as e:
194
  st.error(f"Error loading file: {str(e)}")
195
  st.stop()
@@ -260,171 +258,124 @@ if app_mode == "Data Upload":
260
  st_profile_report(pr)
261
 
262
 
263
- tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
264
-
265
-
266
- # 1. Missing Value Handling
267
- with tab1:
268
- st.markdown("### 🕳️ Handle Missing Values")
269
- missing_cols = df.columns[df.isna().any()].tolist()
270
- if missing_cols:
271
- st.write("Columns with missing values:")
272
- cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
273
-
274
- method = st.radio("Imputation Method", [
275
- "Keep Missing",
276
- "Drop Missing",
277
- "Mean/Median/Mode",
278
- "KNN Imputation",
279
- "MICE Imputation",
280
- "Deep Learning Imputation"
281
- ], horizontal=True)
282
-
283
- if st.button(f"Apply {method}"):
284
- try:
285
- original_df = df.copy() # Store the original df before applying any change
286
- if missing_value_method == "Drop Missing":
287
- df = df.dropna(subset=cols) # Drop rows with missing values in selected columns
288
- cleaning_actions.append(f"Dropped missing values in selected columns")
289
- elif missing_value_method == "Mean/Median/Mode":
290
- # Allow the user to select the specific imputation method
291
- imputation_choice = st.radio("Select Imputation Method", ["Mean", "Median", "Mode"], horizontal=True)
292
-
293
- # Imputation logic here, added to perform the imputation in multiple columns
294
- for col in cols:
295
- if df[col].isnull().any(): # Check if missing values exist before imputing
296
- if pd.api.types.is_numeric_dtype(df[col]):
297
- if imputation_choice == "Mean":
298
- df[col] = df[col].fillna(df[col].mean())
299
- elif imputation_choice == "Median":
300
- df[col] = df[col].fillna(df[col].median())
301
- elif imputation_choice == "Mode":
302
- df[col] = df[col].fillna(df[col].mode()[0])
303
- else: # Impute strings with mode
304
  df[col] = df[col].fillna(df[col].mode()[0])
305
- cleaning_actions.append(f"Applied Mean/Median/Mode imputation on {cols}")
306
-
307
- elif missing_value_method == "KNN Imputation":
308
- from sklearn.impute import KNNImputer
309
- imputer = KNNImputer(n_neighbors=5)
310
- # Ensure numeric data for KNN, select only numeric columns to impute
311
- numeric_cols = df[cols].select_dtypes(include=np.number).columns
312
- if not numeric_cols.empty: # Check if there are numeric columns to impute
313
- df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
314
- cleaning_actions.append(f"Applied KNN Imputation on {cols}")
315
- else:
316
- st.warning("No numeric columns to apply KNN imputation")
317
- elif missing_value_method == "MICE Imputation":
318
- from sklearn.impute import IterativeImputer
319
- # Select numeric columns for MICE
320
- numeric_cols = df[cols].select_dtypes(include=np.number).columns
321
- if not numeric_cols.empty: # Check if there are numeric columns to impute
322
- imputer = IterativeImputer()
323
- df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
324
- cleaning_actions.append(f"Applied MICE Imputation on {cols}")
325
- else:
326
- st.warning("No numeric columns to apply MICE imputation")
327
-
328
- elif missing_value_method == "Deep Learning Imputation":
329
- st.warning("Deep Learning Imputation is not implemented in this example. Please use other methods.")
330
-
331
- update_version(df) # Update the version after cleaning
332
- st.success(f"{missing_value_method} applied successfully! ✅")
333
- except Exception as e:
334
- st.error(f"Error: {str(e)}")
335
- else:
336
- st.success("✨ No missing values found!")
337
-
338
- # 2. Duplicate Handling
339
- with tab2:
340
- st.markdown("### 🔄 Handle Duplicates")
341
- duplicates = df.duplicated().sum()
342
- if duplicates > 0:
343
- st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
344
- dup_strategy = st.radio("Duplicate Strategy", [
345
- "Remove All Duplicates",
346
- "Keep First Occurrence",
347
- "Keep Last Occurrence"
348
- ])
349
- if st.button("Handle Duplicates"):
350
- original_count = len(df)
351
- df = df.drop_duplicates(keep={
352
- "Remove All Duplicates": False,
353
- "Keep First Occurrence": 'first',
354
- "Keep Last Occurrence": 'last'
355
- }[dup_strategy])
356
- cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
357
  update_version(df)
358
- st.success(f"Removed {original_count - len(df)} duplicates! ✅")
359
- else:
360
- st.success("✨ No duplicates found!")
361
-
362
- # 3. Data Type Conversion
363
- with tab3:
364
- st.markdown("### 🔄 Convert Data Types")
365
- col1, col2 = st.columns(2)
366
- with col1:
367
- st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
368
- with col2:
369
- col_to_convert = st.selectbox("Select column to convert", df.columns)
370
- new_type = st.selectbox("New Data Type", [
371
- "String", "Integer", "Float",
372
- "Boolean", "Datetime", "Category"
373
- ])
374
- if st.button("Convert Data Type"):
375
- try:
376
- if new_type == "String":
377
- df[col_to_convert] = df[col_to_convert].astype(str)
378
- elif new_type == "Integer":
379
- df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
380
- elif new_type == "Float":
381
- df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
382
- elif new_type == "Boolean":
383
- df[col_to_convert] = df[col_to_convert].astype(bool)
384
- elif new_type == "Datetime":
385
- df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
386
- elif new_type == "Category":
387
- df[col_to_convert] = df[col_to_convert].astype('category')
388
 
389
- cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
390
- update_version(df)
391
- st.success("Data type converted successfully! ✅")
392
- except Exception as e:
393
- st.error(f"Conversion failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
- # 4. Outlier Handling
396
- with tab4:
397
- st.markdown("### 📈 Handle Outliers")
398
- numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
399
- if numeric_cols:
400
- outlier_col = st.selectbox("Select numeric column", numeric_cols)
401
- st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
402
- outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
403
- if st.button("Remove Outliers"):
404
- try:
405
- original_df = df.copy()
406
- if outlier_method == "Z-score":
407
- from scipy import stats
408
- z_scores = np.abs(stats.zscore(df[outlier_col]))
409
- df = df[(z_scores < 3)] # Keep only values with zscore less than 3
410
- cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
411
- elif outlier_method == "IQR":
412
- Q1 = df[outlier_col].quantile(0.25)
413
- Q3 = df[outlier_col].quantile(0.75)
414
- IQR = Q3 - Q1
415
- df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) |(df[outlier_col] > (Q3 + 1.5 * IQR)))]
416
- cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
417
- elif outlier_method == "Manual":
418
- lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
419
- upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
420
- df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
421
- cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
422
- update_version(df)
423
- st.success("Outliers removed successfully! ✅")
424
- except Exception as e:
425
- st.error(f"Outlier removal failed: {str(e)}")
426
- else:
427
- st.info("ℹ️ No numeric columns found for outlier detection")
428
 
429
  # Drop Column Functionality with Interface
430
  st.subheader("🗑️ Drop Specific Columns")
 
153
  help="Choose the section to navigate to."
154
  )
155
 
156
+
157
+ # Initialize df globally
158
+ df = pd.DataFrame()
159
+
160
  # --- Data Upload Page ---
161
+ # Data Upload Page
162
  if app_mode == "Data Upload":
163
+ st.title("📥 Smart Data Hub")
164
  st.markdown("""
165
+ **Upload your dataset** (CSV, Excel, Parquet) for comprehensive analysis. Get instant data health insights and quality assessment.
 
166
  """)
167
 
168
+ # File upload
169
+ uploaded_file = st.file_uploader("Drag & drop or browse files", type=list(ALLOWED_EXTENSIONS))
 
 
 
 
170
 
171
  if uploaded_file:
172
  # Validate file
 
174
  if not is_valid:
175
  st.error(f"Upload error: {message}")
176
  st.stop()
177
+
178
  # Load data with progress
179
+ with st.spinner(f"Loading {uploaded_file.name} ..."):
180
  try:
181
  if uploaded_file.name.endswith('.csv'):
182
  df = pd.read_csv(uploaded_file, low_memory=False)
 
186
  df = pd.read_parquet(uploaded_file)
187
  elif uploaded_file.name.endswith('.feather'):
188
  df = pd.read_feather(uploaded_file)
 
189
  st.session_state.raw_data = df
190
  st.success("Dataset loaded successfully!")
 
191
  except Exception as e:
192
  st.error(f"Error loading file: {str(e)}")
193
  st.stop()
 
258
  st_profile_report(pr)
259
 
260
 
261
+ # Cleaning Operations with Tabs
262
+ st.subheader("🔧 Cleaning Operations")
263
+ tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
264
+
265
+ # 1. Missing Value Handling
266
+ with tab1:
267
+ st.markdown("### 🕳️ Handle Missing Values")
268
+ missing_cols = df.columns[df.isna().any()].tolist()
269
+ if missing_cols:
270
+ st.write("Columns with missing values:")
271
+ cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
272
+
273
+ method = st.radio("Imputation Method", [
274
+ "Drop Missing",
275
+ "Mean/Median/Mode",
276
+ "KNN Imputation",
277
+ "MICE Imputation",
278
+ "Deep Learning Imputation"
279
+ ], horizontal=True)
280
+
281
+ if method == "Mean/Median/Mode":
282
+ imputation_choice = st.radio("Select Imputation Method", ["Mean", "Median", "Mode"], horizontal=True)
283
+
284
+ if st.button(f"Apply {method}"):
285
+ try:
286
+ original_df = df.copy()
287
+ if method == "Mean/Median/Mode":
288
+ for col in cols:
289
+ if df[col].isnull().any(): # Check if missing values exist before imputing
290
+ if pd.api.types.is_numeric_dtype(df[col]):
291
+ if imputation_choice == "Mean":
292
+ df[col] = df[col].fillna(df[col].mean())
293
+ elif imputation_choice == "Median":
294
+ df[col] = df[col].fillna(df[col].median())
295
+ elif imputation_choice == "Mode":
 
 
 
 
 
 
296
  df[col] = df[col].fillna(df[col].mode()[0])
297
+ else: # Impute strings with mode
298
+ df[col] = df[col].fillna(df[col].mode()[0])
299
+ # Add logic for other methods here...
300
+ cleaning_actions.append(f"Applied {method} on {cols}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  update_version(df)
302
+ st.success(f"{method} applied successfully! ✅")
303
+ except Exception as e:
304
+ st.error(f"Error: {str(e)}")
305
+ else:
306
+ st.success("✨ No missing values found!")
307
+
308
+ # 2. Duplicate Handling
309
+ with tab2:
310
+ st.markdown("### 🔄 Handle Duplicates")
311
+ duplicates = df.duplicated().sum()
312
+ if duplicates > 0:
313
+ st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
314
+ dup_strategy = st.radio("Duplicate Strategy", [
315
+ "Remove All Duplicates",
316
+ "Keep First Occurrence",
317
+ "Keep Last Occurrence"
318
+ ])
319
+ if st.button("Handle Duplicates"):
320
+ original_count = len(df)
321
+ df = df.drop_duplicates(keep={
322
+ "Remove All Duplicates": False,
323
+ "Keep First Occurrence": 'first',
324
+ "Keep Last Occurrence": 'last'
325
+ }[dup_strategy])
326
+ cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
327
+ update_version(df)
328
+ st.success(f"Removed {original_count - len(df)} duplicates! ✅")
329
+ else:
330
+ st.success("✨ No duplicates found!")
 
331
 
332
+ # 3. Data Type Conversion
333
+ with tab3:
334
+ st.markdown("### 🔄 Convert Data Types")
335
+ col1, col2 = st.columns(2)
336
+ with col1:
337
+ st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
338
+ with col2:
339
+ col_to_convert = st.selectbox("Select column to convert", df.columns)
340
+ new_type = st.selectbox("New Data Type", [
341
+ "String", "Integer", "Float",
342
+ "Boolean", "Datetime", "Category"
343
+ ])
344
+ if st.button("Convert Data Type"):
345
+ try:
346
+ if new_type == "String":
347
+ df[col_to_convert] = df[col_to_convert].astype(str)
348
+ elif new_type == "Integer":
349
+ df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
350
+ elif new_type == "Float":
351
+ df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
352
+ elif new_type == "Boolean":
353
+ df[col_to_convert] = df[col_to_convert].astype(bool)
354
+ elif new_type == "Datetime":
355
+ df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
356
+ elif new_type == "Category":
357
+ df[col_to_convert] = df[col_to_convert].astype('category')
358
+ cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
359
+ update_version(df)
360
+ st.success("Data type converted successfully! ✅")
361
+ except Exception as e:
362
+ st.error(f"Conversion failed: {str(e)}")
363
+
364
+ # 4. Outlier Handling
365
+ with tab4:
366
+ st.markdown("### 📈 Handle Outliers")
367
+ numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
368
+ if numeric_cols:
369
+ outlier_col = st.selectbox("Select numeric column", numeric_cols)
370
+ st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
371
+ if st.button("Remove Outliers"):
372
+ # Outlier removal logic here...
373
+ cleaning_actions.append(f"Removed outliers from {outlier_col}")
374
+ update_version(df)
375
+ st.success("Outliers removed successfully! ✅")
376
+ else:
377
+ st.info("ℹ️ No numeric columns found for outlier detection")
378
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
 
380
  # Drop Column Functionality with Interface
381
  st.subheader("🗑️ Drop Specific Columns")