Update app.py

app.py CHANGED
@@ -7,6 +7,11 @@ from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
 from sklearn.svm import SVR, SVC
+from sklearn.decomposition import PCA  # Import at top
+from sklearn.metrics import silhouette_score  # Import at top
+from sklearn.cluster import DBSCAN  # Import at top
+from sklearn.feature_selection import SelectKBest  # Import at top
+import joblib  # Import at top
+import mimetypes  # used by load_data below; missing from the committed import list
+import re  # used by clean_text below (the commit itself notes it must be imported at top)
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
 from sklearn.impute import KNNImputer, SimpleImputer
 from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
@@ -22,6 +27,7 @@ from io import BytesIO
 import base64
 import time
 from sklearn.cluster import KMeans
+import scipy.stats as stats
 
 # Configurations
 st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
@@ -100,20 +106,38 @@ def show_loader(message="Loading..."):
         unsafe_allow_html=True
     )
 
-#
-@st.cache_data(ttl=3600)
+@st.cache_data(ttl=3600)  # note: st.cache_data does not accept allow_output_mutation; that kwarg belonged to the deprecated st.cache, so the committed argument is dropped
 def load_data(uploaded_file):
     """Load and cache dataset, with file type validation."""
     if uploaded_file is not None:
         file_extension = uploaded_file.name.split(".")[-1].lower()
+        mime_type = mimetypes.guess_type(uploaded_file.name)[0]
 
-        … (four deleted lines not rendered in the diff view)
-        else:
-            st.error("Unsupported file type. Please upload a CSV or Excel file.")
+        max_file_size_mb = 50  # Set a maximum file size (adjust as needed)
+        file_size_mb = uploaded_file.size / (1024 * 1024)
+        if file_size_mb > max_file_size_mb:
+            st.error(f"File size exceeds the limit of {max_file_size_mb} MB.")
             return None
+
+        try:  # Wrap file reading in a try...except
+            if file_extension == "csv" or mime_type == 'text/csv':
+                df = pd.read_csv(uploaded_file)
+                return df
+            elif file_extension in ["xlsx", "xls"] or mime_type in ['application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']:
+                df = pd.read_excel(uploaded_file)
+                return df
+            else:
+                st.error("Unsupported file type. Please upload a CSV or Excel file.")
+                return None
+        except FileNotFoundError:
+            st.error("File not found. Please check the file path.")
+        except pd.errors.ParserError:  # Catch pandas-specific parsing errors
+            st.error("Error parsing the file. Make sure it's a valid CSV or Excel file.")
+        except Exception as e:
+            st.error(f"An unexpected error occurred: {type(e).__name__} - {str(e)}")
+        return None  # Handle other potential exceptions
+
     else:
         return None
 
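Editor's aside, not part of the commit: the validation flow above, condensed into a standalone sketch. The name `read_tabular` and the bare binary file object `f` are illustrative assumptions, not code from the app.

import mimetypes
import pandas as pd

def read_tabular(f, max_mb=50):
    """Same checks as load_data: size cap, then extension/MIME dispatch."""
    size_mb = f.seek(0, 2) / (1024 * 1024)  # seek to end to measure size in bytes
    f.seek(0)
    if size_mb > max_mb:
        raise ValueError(f"File exceeds the {max_mb} MB limit")
    ext = f.name.rsplit(".", 1)[-1].lower()
    mime = mimetypes.guess_type(f.name)[0]
    if ext == "csv" or mime == "text/csv":
        return pd.read_csv(f)
    if ext in ("xlsx", "xls"):
        return pd.read_excel(f)
    raise ValueError("Unsupported file type. Please upload a CSV or Excel file.")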
@@ -160,17 +184,6 @@ app_mode = st.sidebar.radio("Navigate", [
     "Neural Network Studio"  # New option
 ])
 
-# --- Progress Bar ----
-def animated_progress_bar(progress_var, message="Processing..."):
-    """Displays an animated progress bar with a message."""
-    progress_bar = st.progress(0)
-    status_text = st.empty()  # Empty element to update the status message
-
-    for i in range(progress_var):  # progress will increment
-        status_text.text(f"{message} ({i+1}/{progress_var})")
-        progress_bar.progress((i+1)/progress_var)  # progress incrementally
-        time.sleep(0.01)
-
 # --- Main App Logic ---
 if app_mode == "Data Upload":
     st.title("📤 Data Upload & Initial Analysis")
@@ -193,16 +206,19 @@ if app_mode == "Data Upload":
         unsafe_allow_html=True,
     )
 
-    uploaded_file = st.file_uploader(
-        … (arguments truncated in the diff view; replaced by the call below)
+    uploaded_file = st.file_uploader(
+        "Choose a CSV or Excel file", type=["csv", "xlsx"],
+        help="Upload your dataset here. Supported formats: CSV, XLSX"
+    )
+
     if uploaded_file:
         df = load_data(uploaded_file)
-        if df is not None:
+        if df is not None:
+            # only proceed if load_data returned a valid dataframe
             st.session_state.raw_data = df
             st.session_state.cleaned_data = df.copy()
-
+
             st.subheader("Data Overview")
-
             # Data Overview Cards with more context
             col1, col2, col3 = st.columns(3)
             with col1:
@@ -212,180 +228,100 @@ if app_mode == "Data Upload":
             with col3:
                 num_missing = df.isna().sum().sum()
                 st.metric("Total Missing Values", num_missing, help="Total number of missing entries across the entire dataset.")
-
+
             # Display Data Types
             st.write("Column Data Types:")
             dtype_counts = df.dtypes.value_counts().to_dict()
             for dtype, count in dtype_counts.items():
                 st.write(f"- {dtype}: {count} column(s)")
-
+
             # Sample Data Table with improved display
             st.subheader("Sample Data")
             num_rows_preview = st.slider("Number of Rows to Preview", 5, 20, 10, help="Adjust the number of rows displayed in the sample data.")
-            st.dataframe(df.head(num_rows_preview), use_container_width=True)
-
-            # Column Statistics
+            st.dataframe(df.head(num_rows_preview), use_container_width=True)
+
+            # Column Statistics
             with st.expander("📊 Column Statistics"):
                 for col in df.columns:
                     st.subheader(f"Column: {col}")
                     st.write(f"Data type: {df[col].dtype}")
-
                     if pd.api.types.is_numeric_dtype(df[col]):
                         st.write("Summary Statistics:")
                         st.write(df[col].describe())
                     else:
                         st.write("Value Counts:")
                         st.write(df[col].value_counts())
-
+
             # Automated EDA Report
             with st.expander("🚀 Automated Data Report"):
                 if st.button("Generate Smart Report"):
                     show_loader("Generating EDA Report")
                     pr = generate_profile(df)
                     st_profile_report(pr)
-
+
 elif app_mode == "Smart Cleaning":
     st.title("🧼 Intelligent Data Cleaning")
-    … (61 deleted lines not rendered in the diff view: the previous cleaning UI and the start of its apply logic)
+    elif clean_action == "Handle Missing Values":
+        columns_with_missing = df.columns[df.isnull().any()].tolist()
+        column_to_impute = st.selectbox("Column to Impute", ["All Columns"] + columns_with_missing)  # Choose column
+
+        method = st.selectbox("Imputation Method", [
+            "KNN Imputation",
+            "Median Fill",
+            "Mean Fill",
+            "Drop Missing",
+            "Constant Value Fill"  # new
+        ])
+        if method == "KNN Imputation":
+            knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5)
+        elif method == "Constant Value Fill":
+            constant_value = st.text_input("Constant Value")
+
+    elif clean_action == "Clean Text":
+        text_column = st.selectbox("Text Column", df.select_dtypes(include='object').columns)
+        cleaning_operation = st.selectbox("Cleaning Operation", ["Remove Special Characters", "Lowercase", "Uppercase", "Remove Extra Spaces"])
+        if cleaning_operation == "Remove Special Characters":
+            chars_to_remove = st.text_input("Characters to Remove", r'[^a-zA-Z0-9\s]')
+
+    # Inside the Apply Transformations button section
+    elif clean_action == "Handle Missing Values":
+        if method == "KNN Imputation":
+            imputer = KNNImputer(n_neighbors=knn_neighbors)
+            if column_to_impute == "All Columns":
+                current_df = pd.DataFrame(imputer.fit_transform(current_df), columns=current_df.columns)
+            else:
+                current_df[[column_to_impute]] = imputer.fit_transform(current_df[[column_to_impute]])
+        elif method == "Median Fill":
+            if column_to_impute == "All Columns":
+                current_df = current_df.fillna(current_df.median())
+            else:
+                current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].median())
+        elif method == "Mean Fill":
+            if column_to_impute == "All Columns":
+                current_df = current_df.fillna(current_df.mean())
+            else:
+                current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].mean())
+        elif method == "Constant Value Fill":
+            if column_to_impute == "All Columns":
+                current_df = current_df.fillna(constant_value)
+            else:
+                current_df[column_to_impute] = current_df[column_to_impute].fillna(constant_value)
+        else:
+            current_df = current_df.dropna()
+
+    elif clean_action == "Clean Text":
+        def clean_text(text, operation, chars_to_remove=r'[^a-zA-Z0-9\s]'):
+            if operation == "Remove Special Characters":
+                text = re.sub(chars_to_remove, '', str(text))  # re is imported at the top
+            elif operation == "Lowercase":
+                text = str(text).lower()
+            elif operation == "Uppercase":
+                text = str(text).upper()
+            elif operation == "Remove Extra Spaces":
+                text = " ".join(str(text).split())
+            return text
+
+        current_df[text_column] = current_df[text_column].apply(lambda x: clean_text(x, cleaning_operation, chars_to_remove))
-        elif clean_action == "Outlier Removal":
-            outlier_method = st.selectbox("Outlier Removal Method", ["IQR", "Z-score"])
-            if outlier_method == "IQR":
-                iqr_threshold = st.slider("IQR Threshold", 1.0, 3.0, 1.5)
-            else:
-                zscore_threshold = st.slider("Z-score Threshold", 2.0, 4.0, 3.0)
-
-        elif clean_action == "Remove Columns":
-            remove_cols = st.multiselect("Columns to Remove", df.columns)
-
-    with col2:
-        if st.button("Apply Transformation"):
-            with st.spinner("Applying changes..."):
-                current_df = df.copy()
-                st.session_state.data_history.append(current_df)
-
-                # Handle Missing Values
-                if clean_action == "Handle Missing Values":
-                    if method == "KNN Imputation":
-                        imputer = KNNImputer(n_neighbors=knn_neighbors)
-                        current_df = pd.DataFrame(imputer.fit_transform(current_df), columns=current_df.columns)
-                    elif method == "Median Fill":
-                        current_df = current_df.fillna(current_df.median())
-                    elif method == "Mean Fill":
-                        current_df = current_df.fillna(current_df.mean())
-                    else:
-                        current_df = current_df.dropna()
-
-                # Remove Columns
-                elif clean_action == "Remove Columns":
-                    if remove_cols:
-                        current_df = current_df.drop(columns=remove_cols)
-
-                # Normalize Data
-                elif clean_action == "Normalize Data":
-                    scaler = RobustScaler() if scaler_type == "RobustScaler" else StandardScaler()
-                    num_cols = current_df.select_dtypes(include=np.number).columns
-                    current_df[num_cols] = scaler.fit_transform(current_df[num_cols])
-
-                # Encode Categories
-                elif clean_action == "Encode Categories":
-                    cat_cols = current_df.select_dtypes(include='object').columns
-                    if len(cat_cols) > 0:
-                        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
-                        encoded_data = encoder.fit_transform(current_df[cat_cols])
-                        encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(cat_cols))
-                        current_df = pd.concat([current_df.drop(columns=cat_cols), encoded_df], axis=1)
-
-                # Outlier Removal
-                elif clean_action == "Outlier Removal":
-                    num_cols = current_df.select_dtypes(include=np.number).columns
-                    for col in num_cols:
-                        if outlier_method == "IQR":
-                            Q1 = current_df[col].quantile(0.25)
-                            Q3 = current_df[col].quantile(0.75)
-                            IQR = Q3 - Q1
-                            lower_bound = Q1 - iqr_threshold * IQR
-                            upper_bound = Q3 + iqr_threshold * IQR
-                            current_df = current_df[(current_df[col] >= lower_bound) & (current_df[col] <= upper_bound)]
-                        else:
-                            z_scores = np.abs((current_df[col] - current_df[col].mean()) / current_df[col].std())
-                            current_df = current_df[z_scores <= zscore_threshold]
-
-                # Neural Network Prep
-                elif clean_action == "Neural Network Prep":
-                    st.info("Data prepared for neural network training.")
-
-                st.session_state.cleaned_data = current_df
-                st.success("Transformation applied!")
-
-        # Data Comparison
-        st.subheader("Data Version Comparison")
-        col_orig, col_clean = st.columns(2)
-        with col_orig:
-            st.markdown("**Original Data**")
-            st.dataframe(st.session_state.raw_data.head(5), use_container_width=True)
-        with col_clean:
-            st.markdown("**Cleaned Data**")
-            st.dataframe(df.head(5), use_container_width=True)
 
 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Analysis")
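Editor's aside, not part of the commit: the IQR and Z-score filters deleted above, as a standalone sketch on a toy frame. Thresholds mirror the old sliders' defaults.

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 2.5, 3.0, 100.0]})

# IQR rule: keep rows within [Q1 - k*IQR, Q3 + k*IQR], k = 1.5 by default
q1, q3 = df["x"].quantile(0.25), df["x"].quantile(0.75)
iqr = q3 - q1
kept_iqr = df[(df["x"] >= q1 - 1.5 * iqr) & (df["x"] <= q3 + 1.5 * iqr)]

# Z-score rule: keep rows with |z| <= 3
z = np.abs((df["x"] - df["x"].mean()) / df["x"].std())
kept_z = df[z <= 3.0]

print(kept_iqr, kept_z, sep="\n")  # the 100.0 outlier is dropped by both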
@@ -586,109 +522,114 @@ elif app_mode == "Advanced EDA":
             st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
             st.error(f"An error occurred while generating the plot: {e}")
+    with st.expander("🧪 Hypothesis Testing"):
+        test_type = st.selectbox("Select Test Type", ["T-test", "Chi-Squared Test"])
+
+        if test_type == "T-test":
+            col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
+            col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
+            if st.button("Run T-test"):
+                # Example: Split data by category and perform t-test
+                groups = df.groupby(col2)[col1].apply(list)
+                if len(groups) == 2:
+                    t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
+                    st.write(f"T-statistic: {t_stat:.4f}")
+                    st.write(f"P-value: {p_value:.4f}")
+                    if p_value < 0.05:
+                        st.write("Reject the null hypothesis.")
+                    else:
+                        st.write("Fail to reject the null hypothesis.")
+                else:
+                    st.write("Select a categorical column with exactly two categories.")
 
 elif app_mode == "Model Training":
     st.title("🚂 Model Training")
 
-
-    df = st.session_state.cleaned_data.copy()
-
-    # Target Variable Selection
-    target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
-
-    # Problem Type Selection
-    problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of problem.")
-
-    # Feature Selection
-    feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose features for training.")
-
-    # Model Selection
-    model_name = st.selectbox("Select Model", [
-        "Linear Regression", "Logistic Regression", "Decision Tree",
-        "Random Forest", "Gradient Boosting", "SVM"
-    ], help="Choose a model.")
-
-    # Hyperparameter Tuning (Example - Add more as needed)
-    if model_name == "Random Forest":
-        n_estimators = st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest.")
-        max_depth = st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.")
-
-    # Train-Test Split
-    test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
-
-    if st.button("Train Model"):
-        with st.spinner("Training model..."):
-            try:
-                X = df[feature_columns]
-                y = df[target_column]
-                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-
-                # Preprocessing Pipeline
-                numeric_features = X.select_dtypes(include=np.number).columns
-                categorical_features = X.select_dtypes(exclude=np.number).columns
-
-                numeric_transformer = Pipeline(steps=[
-                    ('imputer', SimpleImputer(strategy='median')),
-                    ('scaler', StandardScaler())
-                ])
+    feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
 
-                … (seven deleted lines not rendered in the diff view)
-                    ('num', numeric_transformer, numeric_features),
-                    ('cat', categorical_transformer, categorical_features)
-                ])
+    if model_name == "Random Forest":
+        param_grid = {
+            'n_estimators': st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest."),
+            'max_depth': st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree."),
+            'min_samples_split': st.slider("Minimum Samples Split", 2, 10, 2, help="Minimum samples required to split an internal node"),  # New hyperparameter
+            'min_samples_leaf': st.slider("Minimum Samples Leaf", 1, 10, 1, help="Minimum samples required to be at a leaf node"),  # New hyperparameter
+        }
 
-                X_train_processed = preprocessor.fit_transform(X_train)
-                X_test_processed = preprocessor.transform(X_test)
+    # Inside the train model button
+    if st.button("Train Model"):
+        # Feature Selection
+        if feature_selection_method == "SelectKBest":
+            k = st.slider("Number of Features to Select", 1, len(feature_columns), len(feature_columns))
+            selector = SelectKBest(k=k)
+            X_train_selected = selector.fit_transform(X_train_processed, y_train)
+            X_test_selected = selector.transform(X_test_processed)
+        else:
+            X_train_selected = X_train_processed
+            X_test_selected = X_test_processed
+        # Model Training and Hyperparameter Tuning
+        if model_name == "Linear Regression":
+            model = LinearRegression()
+        elif model_name == "Logistic Regression":
+            model = LogisticRegression(max_iter=1000)
+        elif model_name == "Decision Tree":
+            if problem_type == "Regression":
+                model = DecisionTreeRegressor()
+            else:
+                model = DecisionTreeClassifier()
+        elif model_name == "Random Forest":
+            if problem_type == "Regression":
+                model = RandomForestRegressor(random_state=42)
+                grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')  # Example scoring; assumes GridSearchCV is imported from sklearn.model_selection
+                grid_search.fit(X_train_selected, y_train)
+                model = grid_search.best_estimator_
+                st.write("Best Parameters:", grid_search.best_params_)
+            else:
+                model = RandomForestClassifier(random_state=42)
+                grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
+                grid_search.fit(X_train_selected, y_train)
+                model = grid_search.best_estimator_
+                st.write("Best Parameters:", grid_search.best_params_)
+
+        elif model_name == "Gradient Boosting":
+            model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()  # note: GradientBoostingClassifier is not among the imports shown above
+        elif model_name == "SVM":
+            model = SVR() if problem_type == "Regression" else SVC()
+
+        # Cross-validation
+        cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5)  # example, adjust cv
+        st.write(f"Cross-validation scores: {cv_scores}")
+        st.write(f"Mean cross-validation score: {cv_scores.mean():.4f}")
+
+        model.fit(X_train_selected, y_train)
+
+        # Model Saving
+        model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
+        if st.button("Save Model"):  # note: in Streamlit, a button nested inside another button's branch resets on rerun, so this save path is hard to reach
+            try:
+                joblib.dump(st.session_state.model, f"{model_filename}.joblib")
+                st.success(f"Model saved as {model_filename}.joblib")
+            except Exception as e:
+                st.error(f"Error saving model: {e}")
+        # Model loading in a different section
+        model_file = st.file_uploader("Upload Trained Model", type=["joblib"])
+        if model_file is not None:
+            try:
+                st.session_state.model = joblib.load(model_file)
+                st.success("Model loaded successfully!")
+            except Exception as e:
+                st.error(f"Error loading model: {e}")
 
-                … (36 deleted lines not rendered in the diff view)
+        # Model Evaluation Section
+        y_pred = model.predict(X_test_selected)
 
-                … (one deleted line not rendered in the diff view)
+        if problem_type == "Regression":
+            mse = mean_squared_error(y_test, y_pred)
+            r2 = r2_score(y_test, y_pred)
+            st.write(f"Mean Squared Error: {mse:.4f}")
+            st.write(f"R-squared: {r2:.4f}")
+        else:
+            accuracy = accuracy_score(y_test, y_pred)
+            st.write(f"Accuracy: {accuracy:.4f}")
 
-                … (four deleted lines not rendered in the diff view)
 
 elif app_mode == "Predictions":
     st.title("🔮 Make Predictions")
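Editor's aside, not part of the commit: the commit fits SelectKBest and GridSearchCV as separate steps on an already-split matrix. A hedged alternative sketch (synthetic data, numeric features only) chains them in a Pipeline so selection is refit inside every CV fold and cannot leak:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, n_features=10, random_state=42)
pipe = Pipeline([
    ("select", SelectKBest(f_classif)),
    ("rf", RandomForestClassifier(random_state=42)),
])
param_grid = {
    "select__k": [3, 5, 10],          # feature count is tuned like any other hyperparameter
    "rf__n_estimators": [50, 100],
    "rf__max_depth": [5, 10],
}
search = GridSearchCV(pipe, param_grid, cv=3, scoring="accuracy")
search.fit(X, y)
print(search.best_params_, round(search.best_score_, 3))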
@@ -729,6 +670,29 @@ elif app_mode == "Predictions":
     else:
         st.write("Please train a model first in the 'Model Training' section.")
 
+    # Add batch prediction section in prediction tab
+    st.subheader("Batch Predictions")
+    batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
+    if batch_file is not None:
+        try:
+            batch_df = pd.read_csv(batch_file)
+            # Preprocess the batch data
+            batch_processed = st.session_state.preprocessor.transform(batch_df)
+            # Make predictions
+            batch_predictions = st.session_state.model.predict(batch_processed)
+            batch_df['Prediction'] = batch_predictions
+            st.dataframe(batch_df)
+
+            # Download predictions
+            csv = batch_df.to_csv(index=False)
+            b64 = base64.b64encode(csv.encode()).decode()
+            href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
+            st.markdown(href, unsafe_allow_html=True)
+
+        except Exception as e:
+            st.error(f"Error processing batch file: {e}")
+
+
 elif app_mode == "Visualization Lab":
     st.title("🔬 Advanced Data Visualization and Clustering Lab")
 
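Editor's aside, not part of the commit: the base64 anchor tag above predates Streamlit's built-in helper; the same download via st.download_button, with a stand-in frame:

import pandas as pd
import streamlit as st

batch_df = pd.DataFrame({"Prediction": [0, 1, 1]})  # stand-in result
st.download_button(
    label="Download Predictions CSV",
    data=batch_df.to_csv(index=False),
    file_name="predictions.csv",
    mime="text/csv",
)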
@@ -839,6 +803,46 @@ if st.session_state.cleaned_data is not None:
                 st.success("Clustering applied successfully!")
             except Exception as e:
                 st.error(f"An error occurred during clustering: {e}")
+            # Add clustering performance in clustering analysis
+            if len(cluster_cols) >= 2:  # Evaluate Silhouette Score
+                try:
+                    silhouette_avg = silhouette_score(scaled_data, clusters)
+                    st.write(f"Silhouette Score: {silhouette_avg:.4f}")
+                except:
+                    st.write("Could not compute silhouette score")
+
+            # Add dimensionality reduction option and 2d/3d plots
+
+            dimension_reduction = st.selectbox("Dimensionality Reduction", ["None", "PCA"])
+            if dimension_reduction == "PCA":
+                n_components = st.slider("Number of Components", 2, min(3, len(cluster_cols)), 2)
+                pca = PCA(n_components=n_components)
+                principal_components = pca.fit_transform(scaled_data)
+                pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i + 1}' for i in range(n_components)])
+                pca_df['Cluster'] = clusters  # Add Cluster
+
+            if len(cluster_cols) >= 2:  # plotting section
+                fig = None  # Initialize fig
+                if dimension_reduction == "None":
+                    if len(cluster_cols) == 2:
+                        fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
+                        st.plotly_chart(fig, use_container_width=True)
+                    elif len(cluster_cols) == 3:
+                        fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
+                        st.plotly_chart(fig, use_container_width=True)
+                    else:
+                        st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
+
+                elif dimension_reduction == "PCA":
+                    if n_components == 2:
+                        fig = px.scatter(pca_df, x='PC1', y='PC2', color='Cluster', title="K-Means Clustering (PCA - 2D)")
+                        st.plotly_chart(fig, use_container_width=True)
+                    elif n_components == 3:
+                        fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color='Cluster', title="K-Means Clustering (PCA - 3D)")
+                        st.plotly_chart(fig, use_container_width=True)
+                    else:
+                        st.write("PCA visualization is only supported for 2 or 3 components.")
 
 elif app_mode == "Neural Network Studio":
     st.title("🧠 Neural Network Studio")
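Editor's aside, not part of the commit: the KMeans → silhouette → PCA sequence wired in above, as a standalone sketch on synthetic blobs:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

X, _ = make_blobs(n_samples=300, centers=3, n_features=5, random_state=42)
scaled = StandardScaler().fit_transform(X)
clusters = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(scaled)
print("silhouette:", round(silhouette_score(scaled, clusters), 3))  # close to 1 = well separated
coords = PCA(n_components=2).fit_transform(scaled)  # 2-D coordinates for the scatter plot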
@@ -882,7 +886,7 @@ elif app_mode == "Neural Network Studio":
     test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the data to use for testing.")
 
     # Model Training Button
-    … (one deleted line not rendered in the diff view)
+    if st.button("Train Neural Network Model"):
         with st.spinner("Training neural network model..."):
             try:
                 # Split data
@@ -900,8 +904,8 @@ elif app_mode == "Neural Network Studio":
                     ('onehot', OneHotEncoder(handle_unknown='ignore'))
                 ])
 
-                numeric_features = X_train.select_dtypes(include=…   (tail truncated in the diff view)
-                categorical_features = X_train.select_dtypes(include=…   (tail truncated in the diff view)
+                numeric_features = X_train.select_dtypes(include=np.number).columns
+                categorical_features = X_train.select_dtypes(include='object').columns
 
                 preprocessor = ColumnTransformer(
                     transformers=[
@@ -913,18 +917,29 @@ elif app_mode == "Neural Network Studio":
                 X_test_processed = preprocessor.transform(X_test)
 
                 # Neural Network Model Selection and Training
+                tf.random.set_seed(42)  # for reproducibility (assumes `import tensorflow as tf` at the top)
+
+                # Callbacks (Early Stopping)
+                early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)  # assumes `from tensorflow.keras.callbacks import EarlyStopping` at the top
+
                 if model_type == "Simple Neural Network":
                     model = keras.Sequential()
                     model.add(layers.Input(shape=(X_train_processed.shape[1],)))
                     for _ in range(hidden_layers):
-                        model.add(layers.Dense(neurons_per_layer, activation=…   (tail truncated in the diff view)
-                    model.add(…   (tail truncated in the diff view)
+                        model.add(layers.Dense(neurons_per_layer, activation=activation))  # Use the selected activation
+                    model.add(
+                        layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+                                     activation='linear' if problem_type == "Regression" else 'softmax'))
 
-                    … (one deleted line not rendered in the diff view)
+                    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)  # Use the learning rate
+
+                    model.compile(optimizer=optimizer,
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
 
-                    model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size,…   (tail truncated in the diff view)
+                    history = model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size,
+                                        validation_split=0.2, verbose=0,
+                                        callbacks=[early_stopping])  # Added early stopping
 
                     y_pred = model.predict(X_test_processed)
                     if problem_type == "Classification":
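Editor's aside, not part of the commit: the dense-network-plus-EarlyStopping pattern above in a minimal runnable form (synthetic data; layer sizes are arbitrary):

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

X = np.random.rand(200, 8).astype("float32")
y = np.random.randint(0, 3, size=(200,))  # three-class toy labels

model = keras.Sequential([
    layers.Input(shape=(8,)),
    layers.Dense(16, activation="relu"),
    layers.Dense(3, activation="softmax"),
])
model.compile(optimizer=keras.optimizers.Adam(1e-3),
              loss="sparse_categorical_crossentropy", metrics=["accuracy"])
early = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
history = model.fit(X, y, epochs=50, batch_size=32,
                    validation_split=0.2, verbose=0, callbacks=[early])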
@@ -935,17 +950,23 @@ elif app_mode == "Neural Network Studio":
                     X_test_cnn = np.expand_dims(X_test_processed, axis=2)
 
                     model = keras.Sequential()
-                    model.add(layers.Conv1D(filters=…   (tail truncated in the diff view)
-                    … (one deleted line not rendered in the diff view)
+                    model.add(layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu',
+                                            input_shape=(X_train_cnn.shape[1], 1)))
+                    model.add(layers.MaxPooling1D(pool_size=pooling_size))
                     model.add(layers.Flatten())
                     model.add(layers.Dense(50, activation='relu'))
-                    model.add(…   (tail truncated in the diff view)
+                    model.add(
+                        layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+                                     activation='linear' if problem_type == "Regression" else 'softmax'))
 
-                    … (one deleted line not rendered in the diff view)
+                    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
+                    model.compile(optimizer=optimizer,
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
 
-                    model.fit(X_train_cnn, y_train, epochs=epochs_cnn, batch_size=batch_size_cnn,…   (tail truncated in the diff view)
+                    history = model.fit(X_train_cnn, y_train, epochs=epochs_cnn, batch_size=batch_size_cnn,
+                                        validation_split=0.2, verbose=0,
+                                        callbacks=[early_stopping])
 
                     y_pred = model.predict(X_test_cnn)
                     if problem_type == "Classification":
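Editor's aside, not part of the commit: what the np.expand_dims(..., axis=2) step above does to tabular input for Conv1D:

import numpy as np

X = np.random.rand(32, 20)            # 32 samples, 20 features
X_cnn = np.expand_dims(X, axis=2)     # -> (32, 20, 1): 20 "timesteps", 1 channel
assert X_cnn.shape == (32, 20, 1)     # matches Conv1D's expected (steps, channels) input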
@@ -953,18 +974,28 @@ elif app_mode == "Neural Network Studio":
 
                 elif model_type == "Recurrent Neural Network (RNN)":
                     try:
-                        X_train_rnn = np.reshape(X_train_processed, (…   (tail truncated in the diff view)
-                        … (one deleted line not rendered in the diff view)
+                        X_train_rnn = np.reshape(X_train_processed, (
+                            X_train_processed.shape[0], sequence_length,
+                            X_train_processed.shape[1] // sequence_length))
+                        X_test_rnn = np.reshape(X_test_processed, (
+                            X_test_processed.shape[0], sequence_length, X_test_processed.shape[1] // sequence_length))
 
                         model = keras.Sequential()
-                        model.add(layers.SimpleRNN(…   (tail truncated in the diff view)
-                        … (two deleted lines not rendered in the diff view)
-                        model.…   (tail truncated in the diff view)
+                        model.add(layers.SimpleRNN(units, activation='relu',  # Use the selected units
+                                                   dropout=dropout_rate,
+                                                   input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
+                        model.add(
+                            layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+                                         activation='linear' if problem_type == "Regression" else 'softmax'))
+
+                        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
+                        model.compile(optimizer=optimizer,
                                       loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                       metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
 
-                        model.fit(X_train_rnn, y_train, epochs=epochs_rnn, batch_size=batch_size_rnn,…   (tail truncated in the diff view)
+                        history = model.fit(X_train_rnn, y_train, epochs=epochs_rnn, batch_size=batch_size_rnn,
+                                            validation_split=0.2, verbose=0,
+                                            callbacks=[early_stopping])
 
                         y_pred = model.predict(X_test_rnn)
                         if problem_type == "Classification":
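Editor's aside, not part of the commit: the RNN reshape above silently assumes the processed feature count is an exact multiple of sequence_length; otherwise np.reshape raises because the element counts no longer match. A tiny check:

import numpy as np

X = np.random.rand(100, 12)
seq_len = 4                            # 12 % 4 == 0, so this is fine
X_rnn = X.reshape(X.shape[0], seq_len, X.shape[1] // seq_len)
assert X_rnn.shape == (100, 4, 3)
# seq_len = 5 would fail: 100*12 elements cannot fill a (100, 5, 2) array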
@@ -995,7 +1026,21 @@ elif app_mode == "Neural Network Studio":
                     st.write("Classification Report:")
                     st.text(classification_report(y_test, y_pred))
 
+                # Visualization (assumes `import matplotlib.pyplot as plt` at the top)
+                st.subheader("Training History")
+                fig, ax = plt.subplots()  # Use matplotlib directly
+
+                ax.plot(history.history['loss'], label='loss')
+                ax.plot(history.history['val_loss'], label='val_loss')
+                ax.set_xlabel('Epoch')
+                ax.set_ylabel('Loss')
+                ax.legend()
+                st.pyplot(fig)  # Display with st.pyplot
+
                 st.success("Model trained successfully!")
 
+                    except Exception as e:
+                        st.error(f"An error occurred during training: {e}")
+
             except Exception as e:
                 st.error(f"An error occurred during training: {e}")
|