CosmickVisions committed on
Commit
bc73880
·
verified ·
1 Parent(s): 35d44ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +222 -748
app.py CHANGED
@@ -2,787 +2,261 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import plotly.express as px
5
- import seaborn as sns
6
- import matplotlib.pyplot as plt
7
- from io import StringIO
8
- from sklearn.impute import KNNImputer, SimpleImputer
9
- from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
10
- from sklearn.decomposition import PCA
11
- from sklearn.cluster import KMeans
12
  from sklearn.model_selection import train_test_split
13
- from pycaret.classification import setup, compare_models, pull
14
- from scipy.stats import zscore
15
- import matplotlib
16
- from sklearn.feature_selection import SelectKBest, f_classif
17
- from ydata_profiling import ProfileReport
18
- from ydata_profiling.config import Settings
19
- from functools import lru_cache
20
- # ================== 🔹 ENHANCED STYLING ==================
21
- def load_custom_css():
22
- st.markdown("""
23
- <style>
24
- /* ๐ŸŒŒ Cosmic Nebula Background */
25
- body, .main {
26
- background: radial-gradient(circle at top, #10002b 0%, #240046 50%, #3c096c 100%);
27
- color: #ffffff;
28
- font-family: 'Poppins', sans-serif;
29
- }
30
- /* ๐ŸŒ  Animated Starfield Effect */
31
- body::before {
32
- content: "";
33
- position: fixed;
34
- top: 0;
35
- left: 0;
36
- width: 100%;
37
- height: 100%;
38
- background: url('https://source.unsplash.com/random/1600x900/?stars,galaxy,nebula') center/cover no-repeat;
39
- opacity: 0.1;
40
- z-index: -1;
41
- }
42
- /* ๐Ÿช Glassmorphism Containers */
43
- .stContainer, .stExpander, .stDataFrame {
44
- background: rgba(255, 255, 255, 0.08) !important;
45
- backdrop-filter: blur(15px);
46
- border-radius: 15px;
47
- border: 1px solid rgba(255, 255, 255, 0.12);
48
- padding: 1.5rem;
49
- box-shadow: 0 10px 30px rgba(255, 255, 255, 0.12);
50
- }
51
- /* ๐Ÿ”ฎ Cyberpunk Buttons */
52
- .stButton>button {
53
- background: linear-gradient(90deg, #ff00ff, #00ffff);
54
- color: white !important;
55
- border: none;
56
- border-radius: 12px;
57
- padding: 0.8rem 1.5rem;
58
- font-weight: bold;
59
- letter-spacing: 0.05rem;
60
- transition: all 0.4s ease;
61
- text-transform: uppercase;
62
- width: 100%;
63
- }
64
- .stButton>button:hover {
65
- transform: scale(1.05);
66
- box-shadow: 0 0 20px rgba(0, 255, 255, 0.8);
67
- }
68
- /* ๐ŸŽ† Neon Headers */
69
- h1, h2, h3, h4, h5, h6 {
70
- font-weight: bold;
71
- text-transform: uppercase;
72
- text-shadow: 0 0 10px rgba(0, 255, 255, 0.6);
73
- color: #00ffff;
74
- padding: 0.5rem 0;
75
- }
76
- /* ๐Ÿ” Interactive Inputs */
77
- .stTextInput>div>div>input,
78
- .stSelectbox>div>div>div,
79
- .stSlider>div>div>div {
80
- background: rgba(0, 0, 0, 0.5) !important;
81
- border-radius: 10px !important;
82
- padding: 0.75rem !important;
83
- color: white !important;
84
- border: 1px solid rgba(255, 255, 255, 0.3) !important;
85
- transition: all 0.3s ease;
86
- }
87
- .stTextInput>div>div>input:focus,
88
- .stSelectbox>div>div>div:hover {
89
- border-color: #ff00ff !important;
90
- box-shadow: 0 0 12px rgba(255, 0, 255, 0.6);
91
- }
92
- /* ๐ŸŽญ Data Grid Styling */
93
- [data-testid="stDataFrame"] {
94
- border: 1px solid rgba(255, 255, 255, 0.2);
95
- border-radius: 10px;
96
- background: rgba(255, 255, 255, 0.05);
97
- padding: 1rem;
98
- color: white !important;
99
- }
100
- /* ๐Ÿ“Š Graph Enhancements */
101
- .stPlotlyChart, .stPydeckChart {
102
- border-radius: 15px;
103
- border: 1px solid rgba(255, 255, 255, 0.1);
104
- padding: 1rem;
105
- box-shadow: 0 8px 20px rgba(255, 255, 255, 0.15);
106
- }
107
- /* ๐ŸŽ›๏ธ Consistent Spacing */
108
- .stContainer > *,
109
- .stExpander > * {
110
- margin: 1rem 0;
111
- }
112
- /* ๐Ÿš€ Futuristic Scrollbars */
113
- ::-webkit-scrollbar {
114
- width: 8px;
115
- height: 8px;
116
- }
117
- ::-webkit-scrollbar-track {
118
- background: rgba(25, 25, 45, 0.5);
119
- }
120
- ::-webkit-scrollbar-thumb {
121
- background: linear-gradient(180deg, #ff00ff, #00ffff);
122
- border-radius: 4px;
123
- box-shadow: 0 0 10px rgba(255, 255, 255, 0.3);
124
- }
125
- /* โœจ Smooth Animations */
126
- * {
127
- transition: all 0.25s ease-in-out;
128
- }
129
- </style>
130
- """, unsafe_allow_html=True)
131
-
132
- load_custom_css()
133
-
134
 
 
 
135
 
136
- # ================== 🔹 CACHED FUNCTIONS ==================
137
- # ================== 🔹 CACHED FUNCTIONS ==================
138
  @st.cache_data(ttl=3600)
139
- def calculate_statistics(df, column):
140
- """Calculate and cache statistics for a column."""
141
- if pd.api.types.is_numeric_dtype(df[column]):
142
- return {
143
- "mean": df[column].mean(),
144
- "median": df[column].median(),
145
- "std": df[column].std(),
146
- "min": df[column].min(),
147
- "max": df[column].max()
148
- }
149
- else:
150
- return {
151
- "unique_values": df[column].nunique(),
152
- "most_common": df[column].mode()[0]
153
- }
154
 
155
  @st.cache_data(ttl=3600)
156
- def generate_chart(df, chart_type, x_col, y_col=None, z_col=None):
157
- """Generate and cache Plotly charts."""
158
- if chart_type == "Histogram":
159
- return px.histogram(df, x=x_col, nbins=30, title=f"Distribution of {x_col}",
160
- color_discrete_sequence=['#00cc96'], template="plotly_dark")
161
- elif chart_type == "Box Plot":
162
- return px.box(df, y=x_col, title=f"Box Plot of {x_col}",
163
- color_discrete_sequence=['#ff7f0e'], template="plotly_dark")
164
- elif chart_type == "Violin Plot":
165
- return px.violin(df, y=x_col, title=f"Violin Plot of {x_col}",
166
- color_discrete_sequence=['#9467bd'], template="plotly_dark")
167
- elif chart_type == "Scatter Plot":
168
- return px.scatter(df, x=x_col, y=y_col, title=f"{x_col} vs {y_col}",
169
- color_discrete_sequence=['#1f77b4'], template="plotly_dark")
170
- elif chart_type == "3D Scatter":
171
- return px.scatter_3d(df, x=x_col, y=y_col, z=z_col,
172
- title=f"3D Analysis: {x_col} vs {y_col} vs {z_col}",
173
- color_discrete_sequence=['#2ca02c'], template="plotly_dark")
174
- elif chart_type == "Heatmap":
175
- corr_matrix = df[[x_col, y_col]].corr()
176
- return px.imshow(corr_matrix, text_auto=True, title="Correlation Heatmap",
177
- color_continuous_scale='Viridis', template="plotly_dark")
178
-
179
- # ================== 🔹 LAZY-LOADING COMPONENTS ==================
180
- def lazy_load_chart(df, chart_type, x_col, y_col=None):
181
- """Lazy-load a chart with a spinner."""
182
- with st.spinner(f"Generating {chart_type}..."):
183
- return generate_chart(df, chart_type, x_col, y_col)
184
 
185
- def lazy_load_statistics(df, column):
186
- """Lazy-load statistics with a spinner."""
187
- with st.spinner("Calculating statistics..."):
188
- return calculate_statistics(df, column)
189
-
190
-
191
- # ================== 🔹 SESSION STATE ==================
192
- if 'df' not in st.session_state:
193
- st.session_state.df = None
194
- if 'cleaned_df' not in st.session_state:
195
- st.session_state.cleaned_df = None
196
- if 'X_train' not in st.session_state:
197
- st.session_state.X_train = None
198
- if 'X_test' not in st.session_state:
199
- st.session_state.X_test = None
200
- if 'y_train' not in st.session_state:
201
- st.session_state.y_train = None
202
- if 'y_test' not in st.session_state:
203
- st.session_state.y_test = None
204
  if 'model' not in st.session_state:
205
  st.session_state.model = None
206
 
207
- # ================== 🔹 GLOBAL NAVIGATION ==================
208
- st.sidebar.title("๐Ÿš€ Nexus Analytics")
209
- choice = st.sidebar.radio("Go to", ["Home", "Data Cleaning", "EDA", "Train-Test Split",
210
- "Machine Learning", "Predictions", "Visualization"])
211
- if choice == "Home":
212
- st.title("๐Ÿ“‚ Upload Your Dataset")
213
-
214
- # Dataset Control Buttons
215
- control_col1, control_col2 = st.columns([1, 2])
216
- with control_col1:
217
- if st.session_state.df is not None:
218
- if st.button("๐Ÿงน Clear Dataset", help="Remove current dataset from memory"):
219
- st.session_state.df = None
220
- st.session_state.cleaned_df = None
221
- st.success("Dataset cleared from memory!")
222
 
223
- with control_col2:
224
- replace_file = st.file_uploader("Replace Dataset", type=["csv", "xlsx"],
225
- help="Upload a new dataset to replace current one",
226
- key="replace_uploader")
 
227
 
228
- if replace_file:
229
- df = pd.read_csv(replace_file) if replace_file.name.endswith('.csv') else pd.read_excel(replace_file)
230
- st.session_state.df = df
231
- st.session_state.cleaned_df = df.copy()
232
- st.success("โœ… Dataset replaced successfully!")
 
 
 
 
 
 
 
 
 
233
 
234
- # Main Dataset Upload
235
- if st.session_state.df is None:
236
- with st.container():
237
- uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx"],
238
- help="Drag and drop your dataset file here")
239
-
240
- if uploaded_file:
241
- df = pd.read_csv(uploaded_file) if uploaded_file.name.endswith('.csv') else pd.read_excel(uploaded_file)
242
- st.session_state.df = df
243
- st.session_state.cleaned_df = df.copy()
244
- st.success("โœ… Data uploaded successfully!")
245
 
246
- # Show dataset information if loaded
247
- if st.session_state.df is not None:
248
- df = st.session_state.df
249
 
250
- # Dataset Overview Cards
251
- with st.container():
252
- col1, col2, col3 = st.columns(3)
253
- with col1:
254
- with st.container():
255
- st.markdown("### ๐Ÿ“ Dataset Shape")
256
- st.markdown(f"**{df.shape[0]}** Rows | **{df.shape[1]}** Columns")
257
-
258
- with col2:
259
- with st.container():
260
- st.markdown("### โš ๏ธ Data Issues")
261
- st.markdown(f"**{df.isnull().sum().sum()}** Missing Values | **{df.duplicated().sum()}** Duplicates")
262
-
263
- with col3:
264
- with st.container():
265
- st.markdown("### ๐Ÿงฌ Data Types")
266
- num_cols = len(df.select_dtypes(include=np.number).columns)
267
- cat_cols = len(df.select_dtypes(include=['object']).columns)
268
- st.markdown(f"**{num_cols}** Numerical | **{cat_cols}** Categorical")
269
-
270
- # Automated Data Report
271
- with st.expander("๐Ÿ“Š Automated Data Report", expanded=True):
272
- if st.button("โœจ Generate Smart Report"):
273
- with st.spinner("๐Ÿ” Analyzing dataset..."):
274
- # Configure minimal report
275
- config = Settings()
276
- config.title = " "
277
- config.variables.descriptions = False
278
- config.show_variable_description = False
279
- config.samples.head = 0
280
- config.samples.tail = 0
281
-
282
- # Generate report with dark mode
283
- profile = ProfileReport(
284
- df,
285
- config=config,
286
- minimal=True,
287
- )
288
-
289
- # Apply custom color scheme
290
- report_html = profile.to_html()
291
- report_html = report_html.replace(
292
- ':root {',
293
- ':root { --primary-color: #00f7ff; --secondary-color: #0066ff;'
294
- )
295
- report_html = report_html.replace('<h1', '<h1 style="display:none"')
296
-
297
- st.components.v1.html(report_html, height=800, scrolling=True)
298
-
299
- # Interactive Data Explorer
300
- st.subheader("๐Ÿ” Data Explorer")
301
 
302
- # Data Samples Tabs
303
- with st.expander("๐Ÿ“‘ Data Samples", expanded=True):
304
- sample_type = st.selectbox("View Data Samples",
305
- ["First 5 Rows", "Last 5 Rows", "Random Sample"],
306
- key="sample_selector")
307
-
308
- if sample_type == "First 5 Rows":
309
- st.dataframe(df.head().style.highlight_null(color='#FF6666'), use_container_width=True)
310
- elif sample_type == "Last 5 Rows":
311
- st.dataframe(df.tail().style.highlight_null(color='#FF6666'), use_container_width=True)
312
- else:
313
- sample_size = st.slider("Sample Size", 5, min(100, len(df)), 10)
314
- st.dataframe(df.sample(sample_size).style.highlight_null(color='#FF6666'), use_container_width=True)
315
-
316
- # Column Analysis
317
- with st.expander("๐Ÿ“ˆ Column Insights", expanded=True):
318
- col1, col2 = st.columns(2)
319
- with col1:
320
- selected_col = st.selectbox("Select Column", df.columns)
321
-
322
- if pd.api.types.is_numeric_dtype(df[selected_col]):
323
- fig = px.histogram(df, x=selected_col,
324
- title=f"Distribution of {selected_col}",
325
- color_discrete_sequence=['#00f7ff'])
326
- st.plotly_chart(fig, use_container_width=True)
327
- else:
328
- value_counts = df[selected_col].value_counts().nlargest(10)
329
- fig = px.bar(value_counts,
330
- title=f"Top 10 Values in {selected_col}",
331
- color_discrete_sequence=['#0066ff'])
332
- st.plotly_chart(fig, use_container_width=True)
333
 
334
- with col2:
335
- st.markdown("#### Column Summary")
336
- st.write(f"**Data Type:** {df[selected_col].dtype}")
337
- st.write(f"**Unique Values:** {df[selected_col].nunique()}")
 
 
 
 
 
 
 
 
 
338
 
339
- if pd.api.types.is_numeric_dtype(df[selected_col]):
340
- st.write(f"**Min Value:** {df[selected_col].min():.2f}")
341
- st.write(f"**Max Value:** {df[selected_col].max():.2f}")
342
- st.write(f"**Mean Value:** {df[selected_col].mean():.2f}")
343
- else:
344
- st.write("**Most Common Value:**")
345
- st.write(df[selected_col].mode()[0])
346
-
347
- # Data Summary Tabs
348
- tab1, tab2, tab3 = st.tabs(["๐Ÿ“‹ Full Summary", "๐Ÿ“Š Statistics", "๐Ÿง  AI Insights"])
349
- with tab1:
350
- buffer = StringIO()
351
- df.info(buf=buffer)
352
- st.text(buffer.getvalue())
353
-
354
- with tab2:
355
- st.write(df.describe().style.background_gradient(cmap='Blues'))
356
-
357
- with tab3:
358
- st.markdown("### Automated Insights")
359
- if st.button("๐Ÿ”ฎ Generate AI-Powered Insights"):
360
- with st.spinner("๐Ÿค– Analyzing patterns..."):
361
- profile = ProfileReport(df, minimal=True)
362
- st.write(profile.to_html(), unsafe_allow_html=True)
363
 
364
- # ================== 🔹 ENHANCED DATA CLEANING SECTION ==================
365
- elif choice == "Data Cleaning":
366
- st.header("๐Ÿงผ Intelligent Data Wrangling")
367
 
368
- if st.session_state.df is not None:
369
- df = st.session_state.cleaned_df.copy()
370
 
371
- # AI-Powered Cleaning Assistant
372
- st.subheader("๐Ÿค– Smart Cleaning Advisor")
373
- if st.button("Run Full Data Diagnosis", type="primary"):
374
- with st.spinner("๐Ÿš€ Performing multidimensional analysis..."):
375
- try:
376
- # Advanced data quality assessment
377
- numeric_cols = df.select_dtypes(include=np.number).columns
378
- diagnosis = pd.DataFrame({
379
- 'Metric': ['Missing Values', 'Duplicate Rows',
380
- 'Zero Variance', 'Data Leakage Risk'],
381
- 'Value': [
382
- f"{df.isnull().sum().sum()} ({df.isnull().mean().mean():.1%})",
383
- df.duplicated().sum(),
384
- df[numeric_cols].std()[df[numeric_cols].std() == 0].count(),
385
- "High" if df.skew().abs().max() > 5 else "Low"
386
- ],
387
- 'Severity': ['Critical' if df.isnull().sum().sum() > 0 else 'OK',
388
- 'Warning' if df.duplicated().sum() > 0 else 'OK',
389
- 'Critical' if df[numeric_cols].std()[df[numeric_cols].std() == 0].count() > 0 else 'OK',
390
- 'Warning' if df.skew().abs().max() > 5 else 'OK']
391
- })
392
-
393
- # Visualize data health
394
- fig = px.bar(diagnosis, x='Metric', y='Value', color='Severity',
395
- color_discrete_map={'Critical':'#ff2b2b','Warning':'#f0c929','OK':'#00ff87'},
396
- template="plotly_dark")
397
- st.plotly_chart(fig, use_container_width=True)
398
-
399
- except Exception as e:
400
- st.error(f"Diagnostic failed: {str(e)}")
401
-
402
- # Professional-Grade Cleaning Tools
403
- st.subheader("๐Ÿ”ง Enterprise Cleaning Toolkit")
404
- tab1, tab2, tab3, tab4 = st.tabs(["๐Ÿงฉ Missing Data", "๐Ÿ“ Normalization", "๐Ÿ“Š Outliers", "๐Ÿ”€ Encoding"])
405
 
406
- with tab1:
407
- cols = st.columns([1,3])
408
- with cols[0]:
409
- imp_method = st.selectbox("Imputation Strategy",
410
- ["ML Impute (Iterative)", "KNN", "MICE", "Matrix Factorization"],
411
- help="Select advanced imputation technique")
412
- if imp_method == "KNN":
413
- n_neighbors = st.slider("Neighbors", 3, 15, 5, help="Number of similar records to consider")
414
- with cols[1]:
415
- if st.button("Execute Smart Imputation", type="primary"):
416
- with st.spinner(f"โš™๏ธ Running {imp_method}..."):
417
- # Advanced imputation logic
418
- numeric_cols = df.select_dtypes(include=np.number).columns
419
- if imp_method == "KNN":
420
- imputer = KNNImputer(n_neighbors=n_neighbors)
421
- df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
422
- else:
423
- df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
424
- st.session_state.cleaned_df = df
425
- st.toast("Imputation complete!", icon="โœ…")
426
-
427
- with tab2:
428
- cols = st.columns([1,3])
429
- with cols[0]:
430
- scale_method = st.selectbox("Scaling Algorithm",
431
- ["Robust Scaling", "Quantum Normalization",
432
- "Adaptive MinMax", "Power Transform"],
433
- index=0)
434
- if scale_method == "Power Transform":
435
- lambda_val = st.slider("Lambda Parameter", -3.0, 3.0, 0.0)
436
- with cols[1]:
437
- if st.button("Apply Feature Engineering", type="primary"):
438
- with st.spinner("Transforming features..."):
439
- # Advanced scaling logic
440
- numeric_cols = df.select_dtypes(include=np.number).columns
441
- if scale_method == "Robust Scaling":
442
- scaler = RobustScaler()
443
- df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
444
- st.session_state.cleaned_df = df
445
- st.toast("Features transformed!", icon="โœ…")
446
-
447
- # Real-time Data Diff Viewer
448
- st.subheader("๐Ÿ” Version Comparison")
449
- cols = st.columns(2)
450
  with cols[0]:
451
- st.write("Original Data Snapshot")
452
- st.dataframe(st.session_state.df.head(3).style.highlight_null(color='#ff2b2b'))
453
  with cols[1]:
454
- st.write("Processed Version")
455
- st.dataframe(df.head(3).style.highlight_null(color='#00ff87'))
456
-
457
- # ================== 🔹 EDA SECTION ==================
458
- elif choice == "EDA":
459
- st.header("๐Ÿ” Advanced Exploratory Data Analysis")
460
-
461
- if st.session_state.cleaned_df is not None:
462
- df = st.session_state.cleaned_df
463
-
464
- # ================== 🔹 USER INPUTS ==================
465
- st.subheader("๐Ÿ“Š Select Analysis Type")
466
- analysis_type = st.radio(
467
- "Choose Analysis Type",
468
- ["Single Variable", "Multi-Variable", "3D Analysis"],
469
- horizontal=True,
470
- help="Select the type of analysis you want to perform"
471
- )
472
-
473
- # Dynamic Column Selection Based on Analysis Type
474
- if analysis_type == "Single Variable":
475
- selected_columns = st.multiselect(
476
- "Select Columns for Analysis",
477
- df.columns,
478
- default=df.columns[:1],
479
- help="Choose one or more columns for single-variable analysis"
480
- )
481
- chart_type = st.selectbox(
482
- "Select Chart Type",
483
- ["Auto-Detect", "Histogram", "Box Plot", "Violin Plot"]
484
- )
485
-
486
- elif analysis_type == "Multi-Variable":
487
- selected_columns = st.multiselect(
488
- "Select Columns for Analysis",
489
- df.columns,
490
- default=df.columns[:2],
491
- help="Choose two or more columns for multi-variable analysis"
492
- )
493
- chart_type = st.selectbox(
494
- "Select Chart Type",
495
- ["Auto-Detect", "Scatter Plot", "Heatmap", "Box Plot", "Violin Plot"]
496
- )
497
 
498
- else: # 3D Analysis
499
- col1, col2, col3 = st.columns(3)
500
- with col1:
501
- x_col = st.selectbox("X Axis", df.columns)
502
- with col2:
503
- y_col = st.selectbox("Y Axis", df.columns)
504
- with col3:
505
- z_col = st.selectbox("Z Axis", df.columns)
506
- chart_type = "3D Scatter"
507
 
508
- # ================== 🔹 AUTO-PLOT BUTTON ==================
509
- if st.button("โœจ Generate Advanced Visualizations", type="primary"):
510
- with st.spinner("๐Ÿš€ Generating insights..."):
511
- try:
512
- # Auto-Detect Logic
513
- if chart_type == "Auto-Detect":
514
- if analysis_type == "Single Variable":
515
- if pd.api.types.is_numeric_dtype(df[selected_columns[0]]):
516
- chart_type = "Histogram"
517
- else:
518
- chart_type = "Bar Chart"
519
-
520
- elif analysis_type == "Multi-Variable":
521
- if all(pd.api.types.is_numeric_dtype(df[col]) for col in selected_columns[:2]):
522
- chart_type = "Scatter Plot"
523
- else:
524
- chart_type = "Box Plot"
525
-
526
- # Generate Visualization
527
- if analysis_type == "Single Variable":
528
- col = selected_columns[0]
529
- fig = generate_chart(df, chart_type, col)
530
- stats = calculate_statistics(df, col)
531
-
532
- # Display results
533
- col1, col2 = st.columns([2, 1])
534
- with col1:
535
- st.plotly_chart(fig, use_container_width=True)
536
- with col2:
537
- st.subheader("๐Ÿ“Œ Key Insights")
538
- if pd.api.types.is_numeric_dtype(df[col]):
539
- st.metric("Mean", f"{stats['mean']:.2f}")
540
- st.metric("Median", f"{stats['median']:.2f}")
541
- st.metric("Std Dev", f"{stats['std']:.2f}")
542
- else:
543
- st.metric("Unique Values", stats['unique_values'])
544
- st.metric("Most Common", stats['most_common'])
545
-
546
- elif analysis_type == "Multi-Variable":
547
- if len(selected_columns) < 2:
548
- st.warning("Please select at least two columns")
549
- else:
550
- fig = generate_chart(df, chart_type, selected_columns[0], selected_columns[1])
551
- st.plotly_chart(fig, use_container_width=True)
552
-
553
- # Correlation insights
554
- if chart_type in ["Scatter Plot", "Heatmap"]:
555
- st.subheader("๐Ÿ“Œ Correlation Insights")
556
- try:
557
- corr = df[selected_columns[0]].corr(df[selected_columns[1]])
558
- st.write(f"**Correlation Coefficient:** {corr:.2f}")
559
- st.progress(abs(corr))
560
- st.caption("Absolute correlation strength")
561
- except:
562
- st.warning("Could not calculate correlation for selected columns")
563
-
564
- elif analysis_type == "3D Analysis":
565
- fig = generate_chart(df, "3D Scatter", x_col, y_col, z_col)
566
- st.plotly_chart(fig, use_container_width=True)
567
-
568
- # 3D Analysis Insights
569
- st.subheader("๐Ÿ“Œ 3D Analysis Insights")
570
- col1, col2, col3 = st.columns(3)
571
- with col1:
572
- st.metric("X Range", f"{df[x_col].min():.2f} - {df[x_col].max():.2f}")
573
- with col2:
574
- st.metric("Y Range", f"{df[y_col].min():.2f} - {df[y_col].max():.2f}")
575
- with col3:
576
- st.metric("Z Range", f"{df[z_col].min():.2f} - {df[z_col].max():.2f}")
577
-
578
- except Exception as e:
579
- st.error(f"Visualization error: {str(e)}")
580
- # ================== 🔹 PRODUCTION-GRADE ML SECTION ==================
581
- elif choice == "Machine Learning":
582
- st.header("๐Ÿค– Enterprise ML Studio")
583
 
584
- if st.session_state.cleaned_df is not None:
585
- df = st.session_state.cleaned_df
586
 
587
- # Model Factory
588
- st.subheader("๐Ÿญ Model Orchestration")
589
- tabs = st.tabs(["AutoML", "Custom Training", "Model Registry"])
590
-
591
- with tabs[0]:
592
- if st.button("Launch Hyperparameter Optimization", type="primary"):
593
- with st.spinner("โšก Training 25 model variants..."):
594
- try:
595
- target = st.selectbox("Target Variable", df.columns)
596
- setup(df, target=target, session_id=42,
597
- feature_interaction=True,
598
- polynomial_features=True)
599
- best_model = compare_models(n_select=3)
600
-
601
- # Visual Leaderboard
602
- results = pull()
603
- fig = px.bar(results, x='Model', y=['Accuracy', 'AUC'],
604
- barmode='group', template="plotly_dark",
605
- title="Model Performance Leaderboard")
606
- st.plotly_chart(fig, use_container_width=True)
607
-
608
- except Exception as e:
609
- st.error(f"AutoML failed: {str(e)}")
610
- # ================== 🔹 PREDICTIONS PAGE COMPLETION ==================
611
- elif choice == "Predictions":
612
- st.title("๐Ÿ”ฎ Make Predictions on New Data")
613
-
614
- if st.session_state.get("model"):
615
- uploaded_file = st.file_uploader("Upload New Data for Prediction", type=["csv", "xlsx"])
616
 
617
- if uploaded_file:
618
- new_data = pd.read_csv(uploaded_file) if uploaded_file.name.endswith('.csv') else pd.read_excel(uploaded_file)
619
- st.write("๐Ÿ“Š Preview of New Data:")
620
- st.dataframe(new_data.head())
621
-
622
- try:
623
- predictions = st.session_state.model.predict(new_data)
624
- proba = st.session_state.model.predict_proba(new_data) if hasattr(st.session_state.model, 'predict_proba') else None
625
-
626
- st.subheader("๐Ÿ“ข Predictions:")
627
- result_df = pd.DataFrame({
628
- 'Prediction': predictions,
629
- 'Confidence': proba.max(axis=1) if proba is not None else [1.0]*len(predictions)
630
- })
631
- st.dataframe(result_df.style.background_gradient(cmap='Blues'))
632
 
633
- # Download predictions
634
- csv = result_df.to_csv(index=False).encode('utf-8')
635
- st.download_button(
636
- label="๐Ÿ“ฅ Download Predictions",
637
- data=csv,
638
- file_name='predictions.csv',
639
- mime='text/csv'
640
  )
641
 
642
- except Exception as e:
643
- st.error(f"Prediction error: {str(e)}")
644
- else:
645
- st.warning("โš ๏ธ No trained model found. Please train a model first.")
 
 
 
 
 
 
 
 
 
 
 
 
646
 
647
- # ================== 🔹 VISUALIZATION PAGE COMPLETION ==================
648
- # ================== 🔹 VISUALIZATION PAGE COMPLETION ==================
649
- elif choice == "Visualization":
650
- st.header("๐Ÿ“Š Advanced Visualization Lab")
651
 
652
- if st.session_state.cleaned_df is not None:
653
- df = st.session_state.cleaned_df
654
 
655
- # Smart Visualization Assistant
656
- col1, col2 = st.columns([1, 3])
657
- with col1:
658
- if st.button("โœจ Suggest Visualizations", help="Generate smart visualization recommendations"):
659
- with st.spinner("๐ŸŽจ Generating recommendations..."):
660
- try:
661
- numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
662
- cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
663
-
664
- # Auto-detect visualization types
665
- if len(numeric_cols) >= 3:
666
- st.session_state.viz_type = "3D Scatter"
667
- elif len(cat_cols) > 0:
668
- st.session_state.viz_type = "Pie"
669
- else:
670
- st.session_state.viz_type = "Histogram"
671
-
672
- st.success(f"Recommended visualization type: {st.session_state.viz_type}")
673
-
674
- except Exception as e:
675
- st.error(f"Recommendation failed: {str(e)}")
676
 
677
- # Manual Visualization Controls
678
- with st.expander("๐ŸŽจ Custom Visualization", expanded=True):
679
- plot_options = ["3D Scatter", "Line", "Bar", "Pie", "Histogram", "Box", "Violin", "Heatmap"]
680
- plot_type = st.selectbox("Select Plot Type", plot_options,
681
- index=plot_options.index(st.session_state.viz_type) if 'viz_type' in st.session_state else 0)
682
-
683
- # Dynamic Axis Selection
684
- col1, col2, col3 = st.columns(3)
685
- fig = None
686
-
687
- # 3D Scatter Plot
688
- if plot_type == "3D Scatter":
689
- with col1:
690
- x_axis = st.selectbox("X Axis", df.columns, index=0)
691
- with col2:
692
- y_axis = st.selectbox("Y Axis", df.columns, index=min(1, len(df.columns)-1))
693
- with col3:
694
- z_axis = st.selectbox("Z Axis", df.columns, index=min(2, len(df.columns)-1))
695
- color_by = st.selectbox("Color By", [None] + df.columns.tolist())
696
- fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, color=color_by,
697
- color_continuous_scale=px.colors.cyclical.IceFire)
698
-
699
- # Line Chart
700
- elif plot_type == "Line":
701
- with col1:
702
- x_axis = st.selectbox("X Axis", df.columns, index=0)
703
- with col2:
704
- y_axis = st.selectbox("Y Axis", df.select_dtypes(include=np.number).columns.tolist())
705
- with col3:
706
- color_by = st.selectbox("Group By", [None] + df.columns.tolist())
707
- fig = px.line(df, x=x_axis, y=y_axis, color=color_by,
708
- line_group=color_by if color_by else None)
709
-
710
- # Bar Chart
711
- elif plot_type == "Bar":
712
- with col1:
713
- x_axis = st.selectbox("X Axis", df.columns, index=0)
714
- with col2:
715
- y_axis = st.selectbox("Y Axis", df.select_dtypes(include=np.number).columns.tolist())
716
- with col3:
717
- color_by = st.selectbox("Color By", [None] + df.columns.tolist())
718
- fig = px.bar(df, x=x_axis, y=y_axis, color=color_by, barmode='group')
719
-
720
- # Pie Chart
721
- elif plot_type == "Pie":
722
- with col1:
723
- names = st.selectbox("Categories", df.select_dtypes(include=['object', 'category']).columns.tolist())
724
- with col2:
725
- values = st.selectbox("Values", df.select_dtypes(include=np.number).columns.tolist())
726
- fig = px.pie(df, names=names, values=values, hole=0.3)
727
-
728
- # Histogram
729
- elif plot_type == "Histogram":
730
- with col1:
731
- num_col = st.selectbox("Numerical Column", df.select_dtypes(include=np.number).columns.tolist())
732
- with col2:
733
- color_by = st.selectbox("Split By", [None] + df.columns.tolist())
734
- fig = px.histogram(df, x=num_col, color=color_by, marginal="rug",
735
- nbins=st.slider("Number of Bins", 5, 100, 20))
736
-
737
- # Box Plot
738
- elif plot_type == "Box":
739
- with col1:
740
- y_axis = st.selectbox("Y Axis", df.select_dtypes(include=np.number).columns.tolist())
741
- with col2:
742
- x_axis = st.selectbox("X Axis (Optional)", [None] + df.columns.tolist())
743
- fig = px.box(df, x=x_axis, y=y_axis, color=x_axis)
744
-
745
- # Violin Plot
746
- elif plot_type == "Violin":
747
- with col1:
748
- y_axis = st.selectbox("Y Axis", df.select_dtypes(include=np.number).columns.tolist())
749
- with col2:
750
- x_axis = st.selectbox("X Axis (Optional)", [None] + df.columns.tolist())
751
- fig = px.violin(df, x=x_axis, y=y_axis, color=x_axis, box=True)
752
-
753
- # Heatmap
754
- elif plot_type == "Heatmap":
755
- numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
756
- selected_cols = st.multiselect("Select Numerical Columns", numeric_cols, default=numeric_cols[:5])
757
- if len(selected_cols) >= 2:
758
- corr_matrix = df[selected_cols].corr()
759
- fig = px.imshow(corr_matrix, text_auto=True,
760
- color_continuous_scale=px.colors.diverging.RdBu_r)
761
- else:
762
- st.warning("Select at least 2 numerical columns for heatmap")
763
-
764
- # Plot Customization
765
- if fig:
766
- with st.expander("๐ŸŽญ Style Customization"):
767
- col1, col2 = st.columns(2)
768
- with col1:
769
- color_theme = st.selectbox("Color Theme", px.colors.named_colorscales(),
770
- index=px.colors.named_colorscales().index('Viridis'))
771
- fig.update_layout(colorway=px.colors.sequential[color_theme])
772
- with col2:
773
- fig.update_layout(
774
- template=st.selectbox("Theme Style", ["plotly", "plotly_dark", "ggplot2", "seaborn"]),
775
- font_size=st.slider("Font Size", 10, 24, 14)
776
- )
777
-
778
- # Display Plot
779
- st.plotly_chart(fig, use_container_width=True)
780
-
781
- # Download Button
782
- plot_html = fig.to_html()
783
- st.download_button(
784
- label="๐Ÿ“ฅ Download Plot",
785
- data=plot_html,
786
- file_name=f"{plot_type.replace(' ', '_')}_plot.html",
787
- mime="text/html"
788
- )
 
2
  import pandas as pd
3
  import numpy as np
4
  import plotly.express as px
 
 
 
 
 
 
 
5
  from sklearn.model_selection import train_test_split
6
+ from sklearn.linear_model import LinearRegression
7
+ from sklearn.tree import DecisionTreeRegressor
8
+ from sklearn.metrics import mean_squared_error, r2_score
9
+ from sklearn.impute import KNNImputer
10
+ from sklearn.preprocessing import RobustScaler
11
+ from pandas_profiling import ProfileReport
12
+ from streamlit_pandas_profiling import st_profile_report
13
+ from io import StringIO
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ # Configuration
16
+ st.set_page_config(page_title="Data Wizard Pro", layout="wide", page_icon="๐Ÿง™")
17
 
18
+ # Cache decorators
 
19
  @st.cache_data(ttl=3600)
20
+ def load_data(uploaded_file):
21
+ """Load and cache dataset"""
22
+ if uploaded_file.name.endswith('.csv'):
23
+ return pd.read_csv(uploaded_file)
24
+ return pd.read_excel(uploaded_file)
 
 
 
 
 
 
 
 
 
 
25
 
26
  @st.cache_data(ttl=3600)
27
+ def generate_profile(df):
28
+ """Generate automated EDA report"""
29
+ return ProfileReport(df, minimal=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # Session State Management
32
# Seed every session key the pages rely on, without overwriting values that
# already exist from a previous rerun.
for _state_key, _initial in (
    ('raw_data', None),
    ('cleaned_data', None),
    ('train_test', {}),
    ('model', None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial
40
 
41
+ # Sidebar Navigation
42
# Sidebar: app title plus the page selector that drives the if/elif chain
# below. The title glyph was mis-encoded in this copy and is restored here.
st.sidebar.title("🔮 Data Wizard Pro")
app_mode = st.sidebar.radio("Navigate", [
    "Data Upload",
    "Smart Cleaning",
    "Advanced EDA",
    "Model Training",
    "Predictions",
    "Visualization Lab"
])
51
+
52
+ # Data Upload Section
53
if app_mode == "Data Upload":
    st.title("📤 Data Upload & Analysis")

    uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx"])
    if uploaded_file:
        try:
            df = load_data(uploaded_file)
        except (ValueError, ImportError) as exc:
            # pandas parser/decoding errors derive from ValueError; a missing
            # Excel engine surfaces as ImportError. Show a readable message
            # instead of a raw traceback and end this script run.
            st.error(f"Could not read '{uploaded_file.name}': {exc}")
            st.stop()

        # Keep the untouched upload and a separate working copy that the
        # cleaning page is allowed to modify.
        st.session_state.raw_data = df
        st.session_state.cleaned_data = df.copy()

        # Data Overview Cards
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Rows", df.shape[0])
        with col2:
            st.metric("Columns", df.shape[1])
        with col3:
            # Cast the numpy scalar to a plain int before handing it to the widget.
            st.metric("Missing Values", int(df.isna().sum().sum()))

        # Automated EDA Report (built only on demand because it is slow)
        with st.expander("🚀 Automated Data Report"):
            if st.button("Generate Smart Report"):
                pr = generate_profile(df)
                st_profile_report(pr)
76
 
77
+ # Smart Cleaning Section
78
+ elif app_mode == "Smart Cleaning":
79
+ st.title("๐Ÿงผ Intelligent Data Cleaning")
 
 
 
 
 
 
 
 
80
 
81
+ if st.session_state.raw_data is not None:
82
+ df = st.session_state.cleaned_data
 
83
 
84
+ # Cleaning Toolkit
85
+ col1, col2 = st.columns([1, 3])
86
+ with col1:
87
+ st.subheader("Cleaning Actions")
88
+ clean_action = st.selectbox("Choose Operation", [
89
+ "Handle Missing Values",
90
+ "Remove Duplicates",
91
+ "Normalize Data",
92
+ "Encode Categories"
93
+ ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ if clean_action == "Handle Missing Values":
96
+ method = st.selectbox("Imputation Method", [
97
+ "KNN Imputation",
98
+ "Median Fill",
99
+ "Mean Fill",
100
+ "Drop Missing"
101
+ ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
+ with col2:
104
+ if st.button("Apply Transformation"):
105
+ with st.spinner("Applying changes..."):
106
+ if clean_action == "Handle Missing Values":
107
+ if method == "KNN Imputation":
108
+ imputer = KNNImputer()
109
+ df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
110
+ elif method == "Median Fill":
111
+ df = df.fillna(df.median())
112
+ elif method == "Mean Fill":
113
+ df = df.fillna(df.mean())
114
+ else:
115
+ df = df.dropna()
116
 
117
+ st.session_state.cleaned_data = df
118
+ st.success("Transformation applied!")
119
+
120
+ # Data Comparison
121
+ st.subheader("Data Version Comparison")
122
+ col1, col2 = st.columns(2)
123
+ with col1:
124
+ st.write("Original Data", st.session_state.raw_data.head(3))
125
+ with col2:
126
+ st.write("Cleaned Data", df.head(3))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
+ # Advanced EDA Section
129
+ elif app_mode == "Advanced EDA":
130
+ st.title("๐Ÿ” Advanced Exploratory Analysis")
131
 
132
+ if st.session_state.cleaned_data is not None:
133
+ df = st.session_state.cleaned_data
134
 
135
+ # Visualization Selector
136
+ plot_type = st.selectbox("Choose Visualization", [
137
+ "Histogram",
138
+ "Scatter Plot",
139
+ "Box Plot",
140
+ "Correlation Heatmap",
141
+ "3D Scatter"
142
+ ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ # Dynamic Axis Selection
145
+ cols = st.columns(3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  with cols[0]:
147
+ x_col = st.selectbox("X Axis", df.columns)
 
148
  with cols[1]:
149
+ y_col = st.selectbox("Y Axis", df.columns) if plot_type in ["Scatter", "Box"] else None
150
+ with cols[2]:
151
+ z_col = st.selectbox("Z Axis", df.columns) if plot_type == "3D Scatter" else None
152
+
153
+ # Generate Plot
154
+ if st.button("Generate Visualization"):
155
+ if plot_type == "Histogram":
156
+ fig = px.histogram(df, x=x_col, nbins=30, template="plotly_dark")
157
+ elif plot_type == "Scatter Plot":
158
+ fig = px.scatter(df, x=x_col, y=y_col, color_discrete_sequence=['#00f7ff'])
159
+ elif plot_type == "3D Scatter":
160
+ fig = px.scatter_3d(df, x=x_col, y=y_col, z=z_col)
161
+ elif plot_type == "Correlation Heatmap":
162
+ corr = df.corr()
163
+ fig = px.imshow(corr, text_auto=True, color_continuous_scale='Viridis')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
+ st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
166
 
167
+ # Model Training Section
168
+ elif app_mode == "Model Training":
169
+ st.title("๐Ÿค– Model Training Studio")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
+ if st.session_state.cleaned_data is not None:
172
+ df = st.session_state.cleaned_data
173
 
174
+ # Model Setup
175
+ col1, col2 = st.columns([1, 3])
176
+ with col1:
177
+ model_type = st.selectbox("Choose Model", [
178
+ "Linear Regression",
179
+ "Decision Tree",
180
+ "Random Forest",
181
+ "XGBoost"
182
+ ])
183
+
184
+ test_size = st.slider("Test Size", 0.1, 0.5, 0.2)
185
+ target = st.selectbox("Target Variable", df.columns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
+ with col2:
188
+ if st.button("Train Model"):
189
+ X = df.drop(columns=[target])
190
+ y = df[target]
 
 
 
 
 
 
 
 
 
 
 
191
 
192
+ X_train, X_test, y_train, y_test = train_test_split(
193
+ X, y, test_size=test_size, random_state=42
 
 
 
 
 
194
  )
195
 
196
+ if model_type == "Linear Regression":
197
+ model = LinearRegression()
198
+ elif model_type == "Decision Tree":
199
+ model = DecisionTreeRegressor()
200
+
201
+ model.fit(X_train, y_train)
202
+ st.session_state.model = model
203
+ st.session_state.train_test = {
204
+ 'X_test': X_test,
205
+ 'y_test': y_test
206
+ }
207
+
208
+ # Evaluation Metrics
209
+ y_pred = model.predict(X_test)
210
+ st.metric("Rยฒ Score", round(r2_score(y_test, y_pred), 2))
211
+ st.metric("MSE", round(mean_squared_error(y_test, y_pred), 2))
212
 
213
+ # Predictions Section
214
+ elif app_mode == "Predictions":
215
+ st.title("๐Ÿ”ฎ Make Predictions")
 
216
 
217
+ if st.session_state.model is not None:
218
+ model = st.session_state.model
219
 
220
+ # Prediction Interface
221
+ input_data = {}
222
+ for col in st.session_state.train_test['X_test'].columns:
223
+ input_data[col] = st.number_input(col, value=0.0)
224
+
225
+ if st.button("Predict"):
226
+ input_df = pd.DataFrame([input_data])
227
+ prediction = model.predict(input_df)
228
+ st.success(f"Predicted Value: {prediction[0]:.2f}")
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
+ # Visualization Lab
231
+ elif app_mode == "Visualization Lab":
232
+ st.title("๐Ÿ“Š Advanced Visualization Lab")
233
+
234
+ if st.session_state.cleaned_data is not None:
235
+ df = st.session_state.cleaned_data
236
+
237
+ # Visualization Gallery
238
+ viz_type = st.selectbox("Choose Visualization Type", [
239
+ "3D Scatter Plot",
240
+ "Interactive Heatmap",
241
+ "Time Series Analysis",
242
+ "Cluster Analysis"
243
+ ])
244
+
245
+ # Dynamic Controls
246
+ cols = st.columns(3)
247
+ with cols[0]:
248
+ x_axis = st.selectbox("X Axis", df.columns)
249
+ with cols[1]:
250
+ y_axis = st.selectbox("Y Axis", df.columns)
251
+ with cols[2]:
252
+ z_axis = st.selectbox("Z Axis", df.columns) if viz_type == "3D Scatter Plot" else None
253
+
254
+ # Generate Visualization
255
+ if viz_type == "3D Scatter Plot":
256
+ fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, color=x_axis)
257
+ st.plotly_chart(fig, use_container_width=True)
258
+
259
+ elif viz_type == "Interactive Heatmap":
260
+ corr = df.corr()
261
+ fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu')
262
+ st.plotly_chart(fig, use_container_width=True)