CosmickVisions committed
Commit 68a3b7e · verified · Parent: d139bf8

Update app.py

Files changed (1): app.py (+331 −286)

app.py CHANGED
@@ -7,6 +7,11 @@ from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
 from sklearn.svm import SVR, SVC
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
 from sklearn.impute import KNNImputer, SimpleImputer
 from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
@@ -22,6 +27,7 @@ from io import BytesIO
 import base64
 import time
 from sklearn.cluster import KMeans

 # Configurations
 st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
@@ -100,20 +106,38 @@ def show_loader(message="Loading..."):
         unsafe_allow_html=True
     )

-# Cache decorators
-@st.cache_data(ttl=3600)
 def load_data(uploaded_file):
     """Load and cache dataset, with file type validation."""
     if uploaded_file is not None:
         file_extension = uploaded_file.name.split(".")[-1].lower()

-        if file_extension == "csv":
-            return pd.read_csv(uploaded_file)
-        elif file_extension in ["xlsx", "xls"]:
-            return pd.read_excel(uploaded_file)
-        else:
-            st.error("Unsupported file type. Please upload a CSV or Excel file.")
             return None
     else:
         return None
@@ -160,17 +184,6 @@ app_mode = st.sidebar.radio("Navigate", [
     "Neural Network Studio"  # New option
 ])

-# --- Progress Bar ----
-def animated_progress_bar(progress_var, message="Processing..."):
-    """Displays an animated progress bar with a message."""
-    progress_bar = st.progress(0)
-    status_text = st.empty()  # Empty element to update the status message
-
-    for i in range(progress_var):
-        status_text.text(f"{message} ({i+1}/{progress_var})")
-        progress_bar.progress((i+1)/progress_var)  # increment the bar
-        time.sleep(0.01)
-
 # --- Main App Logic ---
 if app_mode == "Data Upload":
     st.title("📤 Data Upload & Initial Analysis")
@@ -193,16 +206,19 @@ if app_mode == "Data Upload":
         unsafe_allow_html=True,
     )

-    uploaded_file = st.file_uploader("Choose a CSV or Excel file", type=["csv", "xlsx"], help="Upload your dataset here. Supported formats: CSV, XLSX")
-
     if uploaded_file:
         df = load_data(uploaded_file)
-        if df is not None:  # only proceed if load_data returned a valid dataframe
             st.session_state.raw_data = df
             st.session_state.cleaned_data = df.copy()
-
             st.subheader("Data Overview")
-
             # Data Overview Cards with more context
             col1, col2, col3 = st.columns(3)
             with col1:
@@ -212,180 +228,100 @@ if app_mode == "Data Upload":
             with col3:
                 num_missing = df.isna().sum().sum()
                 st.metric("Total Missing Values", num_missing, help="Total number of missing entries across the entire dataset.")
-
             # Display Data Types
             st.write("Column Data Types:")
             dtype_counts = df.dtypes.value_counts().to_dict()
             for dtype, count in dtype_counts.items():
                 st.write(f"- {dtype}: {count} column(s)")
-
             # Sample Data Table with improved display
             st.subheader("Sample Data")
             num_rows_preview = st.slider("Number of Rows to Preview", 5, 20, 10, help="Adjust the number of rows displayed in the sample data.")
-            st.dataframe(df.head(num_rows_preview), use_container_width=True)  # full container usage
-
-            # Column Statistics Expander
             with st.expander("📊 Column Statistics"):
                 for col in df.columns:
                     st.subheader(f"Column: {col}")
                     st.write(f"Data type: {df[col].dtype}")
-
                     if pd.api.types.is_numeric_dtype(df[col]):
                         st.write("Summary Statistics:")
                         st.write(df[col].describe())
                     else:
                         st.write("Value Counts:")
                         st.write(df[col].value_counts())
-
             # Automated EDA Report
             with st.expander("🚀 Automated Data Report"):
                 if st.button("Generate Smart Report"):
                     show_loader("Generating EDA Report")
                     pr = generate_profile(df)
                     st_profile_report(pr)
-
 elif app_mode == "Smart Cleaning":
     st.title("🧼 Intelligent Data Cleaning")
-
-    if st.session_state.raw_data is not None:
-        df = st.session_state.cleaned_data
-
-        # Initialize history if not exists
-        if 'data_history' not in st.session_state:
-            st.session_state.data_history = [df.copy()]
-
-        # Cleaning Toolkit
-        col1, col2 = st.columns([1, 3])
-        with col1:
-            st.subheader("Cleaning Actions")
-
-            # Reset and Undo buttons
-            col1a, col1b = st.columns(2)
-            with col1a:
-                if st.button("Reset to Original"):
-                    st.session_state.cleaned_data = st.session_state.raw_data.copy()
-                    st.session_state.data_history = [st.session_state.raw_data.copy()]
-                    st.experimental_rerun()
-            with col1b:
-                if len(st.session_state.data_history) > 1:
-                    if st.button("Undo Last Action"):
-                        st.session_state.data_history.pop()
-                        st.session_state.cleaned_data = st.session_state.data_history[-1].copy()
-                        st.experimental_rerun()
-
-            # Cleaning Operations
-            clean_action = st.selectbox("Choose Operation", [
-                "Handle Missing Values",
-                "Remove Duplicates",
-                "Remove Columns",
-                "Normalize Data",
-                "Encode Categories",
-                "Outlier Removal",
-                "Neural Network Prep"
-            ])
-
-            # Dynamic Configuration
-            if clean_action == "Handle Missing Values":
-                method = st.selectbox("Imputation Method", [
-                    "KNN Imputation",
-                    "Median Fill",
-                    "Mean Fill",
-                    "Drop Missing"
-                ])
-                if method == "KNN Imputation":
-                    knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5)
-
-            elif clean_action == "Neural Network Prep":
-                st.markdown("**Neural Network Specific Preparation**")
-                model_choice = st.radio("Model Type", ["RNN", "CNN"])
-                seq_length = st.number_input("Sequence Length (for RNN)", 10, 100, 30)
-                st.info("Prepares data for neural network training.")
-
-            elif clean_action == "Normalize Data":
-                scaler_type = st.selectbox("Scaler Type", ["RobustScaler", "StandardScaler"])
-
-            elif clean_action == "Encode Categories":
-                encoding_method = st.selectbox("Encoding Method", ["OneHotEncoder"])
-
-            elif clean_action == "Outlier Removal":
-                outlier_method = st.selectbox("Outlier Removal Method", ["IQR", "Z-score"])
-                if outlier_method == "IQR":
-                    iqr_threshold = st.slider("IQR Threshold", 1.0, 3.0, 1.5)
-                else:
-                    zscore_threshold = st.slider("Z-score Threshold", 2.0, 4.0, 3.0)
-
-            elif clean_action == "Remove Columns":
-                remove_cols = st.multiselect("Columns to Remove", df.columns)
-
-        with col2:
-            if st.button("Apply Transformation"):
-                with st.spinner("Applying changes..."):
-                    current_df = df.copy()
-                    st.session_state.data_history.append(current_df)
-
-                    # Handle Missing Values
-                    if clean_action == "Handle Missing Values":
-                        if method == "KNN Imputation":
-                            imputer = KNNImputer(n_neighbors=knn_neighbors)
-                            current_df = pd.DataFrame(imputer.fit_transform(current_df), columns=current_df.columns)
-                        elif method == "Median Fill":
-                            current_df = current_df.fillna(current_df.median())
-                        elif method == "Mean Fill":
-                            current_df = current_df.fillna(current_df.mean())
-                        else:
-                            current_df = current_df.dropna()
-
-                    # Remove Columns
-                    elif clean_action == "Remove Columns":
-                        if remove_cols:
-                            current_df = current_df.drop(columns=remove_cols)
-
-                    # Normalize Data
-                    elif clean_action == "Normalize Data":
-                        scaler = RobustScaler() if scaler_type == "RobustScaler" else StandardScaler()
-                        num_cols = current_df.select_dtypes(include=np.number).columns
-                        current_df[num_cols] = scaler.fit_transform(current_df[num_cols])
-
-                    # Encode Categories
-                    elif clean_action == "Encode Categories":
-                        cat_cols = current_df.select_dtypes(include='object').columns
-                        if len(cat_cols) > 0:
-                            encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
-                            encoded_data = encoder.fit_transform(current_df[cat_cols])
-                            encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(cat_cols))
-                            current_df = pd.concat([current_df.drop(columns=cat_cols), encoded_df], axis=1)
-
-                    # Outlier Removal
-                    elif clean_action == "Outlier Removal":
-                        num_cols = current_df.select_dtypes(include=np.number).columns
-                        for col in num_cols:
-                            if outlier_method == "IQR":
-                                Q1 = current_df[col].quantile(0.25)
-                                Q3 = current_df[col].quantile(0.75)
-                                IQR = Q3 - Q1
-                                lower_bound = Q1 - iqr_threshold * IQR
-                                upper_bound = Q3 + iqr_threshold * IQR
-                                current_df = current_df[(current_df[col] >= lower_bound) & (current_df[col] <= upper_bound)]
-                            else:
-                                z_scores = np.abs((current_df[col] - current_df[col].mean()) / current_df[col].std())
-                                current_df = current_df[z_scores <= zscore_threshold]
-
-                    # Neural Network Prep
-                    elif clean_action == "Neural Network Prep":
-                        st.info("Data prepared for neural network training.")
-
-                    st.session_state.cleaned_data = current_df
-                    st.success("Transformation applied!")
-
-        # Data Comparison
-        st.subheader("Data Version Comparison")
-        col_orig, col_clean = st.columns(2)
-        with col_orig:
-            st.markdown("**Original Data**")
-            st.dataframe(st.session_state.raw_data.head(5), use_container_width=True)
-        with col_clean:
-            st.markdown("**Cleaned Data**")
-            st.dataframe(df.head(5), use_container_width=True)

 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Analysis")
@@ -586,109 +522,114 @@ elif app_mode == "Advanced EDA":
             st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
             st.error(f"An error occurred while generating the plot: {e}")

 elif app_mode == "Model Training":
     st.title("🚂 Model Training")

-    if st.session_state.cleaned_data is not None:
-        df = st.session_state.cleaned_data.copy()
-
-        # Target Variable Selection
-        target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
-
-        # Problem Type Selection
-        problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of problem.")
-
-        # Feature Selection
-        feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose features for training.")
-
-        # Model Selection
-        model_name = st.selectbox("Select Model", [
-            "Linear Regression", "Logistic Regression", "Decision Tree",
-            "Random Forest", "Gradient Boosting", "SVM"
-        ], help="Choose a model.")
-
-        # Hyperparameter Tuning (Example - Add more as needed)
-        if model_name == "Random Forest":
-            n_estimators = st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest.")
-            max_depth = st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.")
-
-        # Train-Test Split
-        test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
-
-        if st.button("Train Model"):
-            with st.spinner("Training model..."):
-                try:
-                    X = df[feature_columns]
-                    y = df[target_column]
-                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-
-                    # Preprocessing Pipeline
-                    numeric_features = X.select_dtypes(include=np.number).columns
-                    categorical_features = X.select_dtypes(exclude=np.number).columns
-
-                    numeric_transformer = Pipeline(steps=[
-                        ('imputer', SimpleImputer(strategy='median')),
-                        ('scaler', StandardScaler())
-                    ])
-                    categorical_transformer = Pipeline(steps=[
-                        ('imputer', SimpleImputer(strategy='most_frequent')),
-                        ('onehot', OneHotEncoder(handle_unknown='ignore'))
-                    ])
-
-                    preprocessor = ColumnTransformer(
-                        transformers=[
-                            ('num', numeric_transformer, numeric_features),
-                            ('cat', categorical_transformer, categorical_features)
-                        ])
-
-                    X_train_processed = preprocessor.fit_transform(X_train)
-                    X_test_processed = preprocessor.transform(X_test)

-                    # Model Training
-                    if model_name == "Linear Regression":
-                        model = LinearRegression()
-                    elif model_name == "Logistic Regression":
-                        model = LogisticRegression(max_iter=1000)
-                    elif model_name == "Decision Tree":
-                        if problem_type == "Regression":
-                            model = DecisionTreeRegressor()
-                        else:
-                            model = DecisionTreeClassifier()
-                    elif model_name == "Random Forest":
-                        if problem_type == "Regression":
-                            model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
-                        else:
-                            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
-                    elif model_name == "Gradient Boosting":
-                        model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
-                    elif model_name == "SVM":
-                        model = SVR() if problem_type == "Regression" else SVC()
-
-                    model.fit(X_train_processed, y_train)
-
-                    # Store model and preprocessor
-                    st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
-                    st.session_state.preprocessor = preprocessor
-
-                    # Model Evaluation
-                    y_pred = model.predict(X_test_processed)
-                    if problem_type == "Regression":
-                        mse = mean_squared_error(y_test, y_pred)
-                        r2 = r2_score(y_test, y_pred)
-                        st.write(f"Mean Squared Error: {mse:.4f}")
-                        st.write(f"R-squared: {r2:.4f}")
-                    else:
-                        accuracy = accuracy_score(y_test, y_pred)
-                        st.write(f"Accuracy: {accuracy:.4f}")
-
-                    st.success("Model trained successfully!")

-                except Exception as e:
-                    st.error(f"An error occurred: {e}")
-    else:
-        st.write("Please upload and clean data first.")

 elif app_mode == "Predictions":
     st.title("🔮 Make Predictions")
@@ -729,6 +670,29 @@ elif app_mode == "Predictions":
     else:
         st.write("Please train a model first in the 'Model Training' section.")

 elif app_mode == "Visualization Lab":
     st.title("🔬 Advanced Data Visualization and Clustering Lab")
@@ -839,6 +803,46 @@ if st.session_state.cleaned_data is not None:
             st.success("Clustering applied successfully!")
         except Exception as e:
             st.error(f"An error occurred during clustering: {e}")

 elif app_mode == "Neural Network Studio":
     st.title("🧠 Neural Network Studio")
@@ -882,7 +886,7 @@ elif app_mode == "Neural Network Studio":
         test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the data to use for testing.")

         # Model Training Button
-        if st.button("Train Neural Network Model"):
             with st.spinner("Training neural network model..."):
                 try:
                     # Split data
@@ -900,8 +904,8 @@ elif app_mode == "Neural Network Studio":
                     ('onehot', OneHotEncoder(handle_unknown='ignore'))
                 ])

-                numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
-                categorical_features = X_train.select_dtypes(include=['object']).columns

                 preprocessor = ColumnTransformer(
                     transformers=[
@@ -913,18 +917,29 @@ elif app_mode == "Neural Network Studio":
                 X_test_processed = preprocessor.transform(X_test)

                 # Neural Network Model Selection and Training
                 if model_type == "Simple Neural Network":
                     model = keras.Sequential()
                     model.add(layers.Input(shape=(X_train_processed.shape[1],)))
                     for _ in range(hidden_layers):
-                        model.add(layers.Dense(neurons_per_layer, activation='relu'))
-                    model.add(layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)), activation='linear' if problem_type == "Regression" else 'softmax'))

-                    model.compile(optimizer='adam',
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

-                    model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)

                     y_pred = model.predict(X_test_processed)
                     if problem_type == "Classification":
@@ -935,17 +950,23 @@ elif app_mode == "Neural Network Studio":
                     X_test_cnn = np.expand_dims(X_test_processed, axis=2)

                     model = keras.Sequential()
-                    model.add(layers.Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)))
-                    model.add(layers.MaxPooling1D(pool_size=2))
                     model.add(layers.Flatten())
                     model.add(layers.Dense(50, activation='relu'))
-                    model.add(layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)), activation='linear' if problem_type == "Regression" else 'softmax'))

-                    model.compile(optimizer='adam',
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

-                    model.fit(X_train_cnn, y_train, epochs=epochs_cnn, batch_size=batch_size_cnn, validation_split=0.2, verbose=0)

                     y_pred = model.predict(X_test_cnn)
                     if problem_type == "Classification":
@@ -953,18 +974,28 @@ elif app_mode == "Neural Network Studio":

                 elif model_type == "Recurrent Neural Network (RNN)":
                     try:
-                        X_train_rnn = np.reshape(X_train_processed, (X_train_processed.shape[0], sequence_length, X_train_processed.shape[1] // sequence_length))
-                        X_test_rnn = np.reshape(X_test_processed, (X_test_processed.shape[0], sequence_length, X_test_processed.shape[1] // sequence_length))

                         model = keras.Sequential()
-                        model.add(layers.SimpleRNN(50, activation='relu', input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
-                        model.add(layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)), activation='linear' if problem_type == "Regression" else 'softmax'))
-
-                        model.compile(optimizer='adam',
                                       loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                       metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

-                        model.fit(X_train_rnn, y_train, epochs=epochs_rnn, batch_size=batch_size_rnn, validation_split=0.2, verbose=0)

                         y_pred = model.predict(X_test_rnn)
                         if problem_type == "Classification":
@@ -995,7 +1026,21 @@ elif app_mode == "Neural Network Studio":
                     st.write("Classification Report:")
                     st.text(classification_report(y_test, y_pred))

                 st.success("Model trained successfully!")

             except Exception as e:
                 st.error(f"An error occurred during training: {e}")
app.py AFTER CHANGES

 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
 from sklearn.svm import SVR, SVC
+from sklearn.decomposition import PCA  # Import at top
+from sklearn.metrics import silhouette_score  # Import at top
+from sklearn.cluster import DBSCAN  # Import at top
+from sklearn.feature_selection import SelectKBest  # Import at top
+import joblib  # Import at top
+import mimetypes  # required by load_data's MIME check below
+import re  # required by the Clean Text helper below
+from sklearn.model_selection import GridSearchCV, cross_val_score  # required by the tuning code below
+from tensorflow.keras.callbacks import EarlyStopping  # required by the Neural Network Studio code below
+import matplotlib.pyplot as plt  # required by the training-history plot below
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
 from sklearn.impute import KNNImputer, SimpleImputer
 from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
 import base64
 import time
 from sklearn.cluster import KMeans
+import scipy.stats as stats

 # Configurations
 st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
         unsafe_allow_html=True
     )

+@st.cache_data(ttl=3600)  # note: allow_output_mutation belonged to the legacy st.cache and is not a st.cache_data kwarg
 def load_data(uploaded_file):
     """Load and cache dataset, with file type validation."""
     if uploaded_file is not None:
         file_extension = uploaded_file.name.split(".")[-1].lower()
+        mime_type = mimetypes.guess_type(uploaded_file.name)[0]
+
+        max_file_size_mb = 50  # Set a maximum file size (adjust as needed)
+        file_size_mb = uploaded_file.size / (1024 * 1024)
+        if file_size_mb > max_file_size_mb:
+            st.error(f"File size exceeds the limit of {max_file_size_mb} MB.")
             return None
+
+        try:  # Wrap file reading in a try...except
+            if file_extension == "csv" or mime_type == 'text/csv':
+                df = pd.read_csv(uploaded_file)
+                return df
+            elif file_extension in ["xlsx", "xls"] or mime_type in ['application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']:
+                df = pd.read_excel(uploaded_file)
+                return df
+            else:
+                st.error("Unsupported file type. Please upload a CSV or Excel file.")
+                return None
+        except FileNotFoundError:
+            st.error("File not found. Please check the file path.")
+        except pd.errors.ParserError:  # catch pandas-specific parsing errors
+            st.error("Error parsing the file. Make sure it's a valid CSV or Excel file.")
+        except Exception as e:
+            st.error(f"An unexpected error occurred: {type(e).__name__} - {str(e)}")
+        return None  # handle other potential exceptions
+
     else:
         return None
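Review note: the guard logic above can be smoke-tested outside Streamlit with a stand-in object that mimics the only attributes load_data touches (name, size, and file-like reads). This is a sketch under that assumption; FakeUpload is a hypothetical test helper, not part of this commit:

    import io

    class FakeUpload(io.BytesIO):
        # Mimics Streamlit's UploadedFile closely enough for load_data:
        # it is file-like and exposes .name and .size.
        def __init__(self, name, payload: bytes):
            super().__init__(payload)
            self.name = name
            self.size = len(payload)

    df = load_data(FakeUpload("tiny.csv", b"a,b\n1,2\n3,4\n"))   # parsed via pd.read_csv
    assert load_data(FakeUpload("notes.txt", b"hello")) is None  # unsupported type -> None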

     "Neural Network Studio"  # New option
 ])

 # --- Main App Logic ---
 if app_mode == "Data Upload":
     st.title("📤 Data Upload & Initial Analysis")
         unsafe_allow_html=True,
     )

+    uploaded_file = st.file_uploader(
+        "Choose a CSV or Excel file", type=["csv", "xlsx"],
+        help="Upload your dataset here. Supported formats: CSV, XLSX"
+    )
+
     if uploaded_file:
         df = load_data(uploaded_file)
+        if df is not None:
+            # only proceed if load_data returned a valid dataframe
             st.session_state.raw_data = df
             st.session_state.cleaned_data = df.copy()
+
             st.subheader("Data Overview")
             # Data Overview Cards with more context
             col1, col2, col3 = st.columns(3)
             with col1:
             with col3:
                 num_missing = df.isna().sum().sum()
                 st.metric("Total Missing Values", num_missing, help="Total number of missing entries across the entire dataset.")
+
             # Display Data Types
             st.write("Column Data Types:")
             dtype_counts = df.dtypes.value_counts().to_dict()
             for dtype, count in dtype_counts.items():
                 st.write(f"- {dtype}: {count} column(s)")
+
             # Sample Data Table with improved display
             st.subheader("Sample Data")
             num_rows_preview = st.slider("Number of Rows to Preview", 5, 20, 10, help="Adjust the number of rows displayed in the sample data.")
+            st.dataframe(df.head(num_rows_preview), use_container_width=True)
+
+            # Column Statistics
             with st.expander("📊 Column Statistics"):
                 for col in df.columns:
                     st.subheader(f"Column: {col}")
                     st.write(f"Data type: {df[col].dtype}")
                     if pd.api.types.is_numeric_dtype(df[col]):
                         st.write("Summary Statistics:")
                         st.write(df[col].describe())
                     else:
                         st.write("Value Counts:")
                         st.write(df[col].value_counts())
+
             # Automated EDA Report
             with st.expander("🚀 Automated Data Report"):
                 if st.button("Generate Smart Report"):
                     show_loader("Generating EDA Report")
                     pr = generate_profile(df)
                     st_profile_report(pr)
+
 elif app_mode == "Smart Cleaning":
     st.title("🧼 Intelligent Data Cleaning")
+            elif clean_action == "Handle Missing Values":
+                columns_with_missing = df.columns[df.isnull().any()].tolist()
+                column_to_impute = st.selectbox("Column to Impute", ["All Columns"] + columns_with_missing)  # choose column
+
+                method = st.selectbox("Imputation Method", [
+                    "KNN Imputation",
+                    "Median Fill",
+                    "Mean Fill",
+                    "Drop Missing",
+                    "Constant Value Fill"  # new
+                ])
+                if method == "KNN Imputation":
+                    knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5)
+                elif method == "Constant Value Fill":
+                    constant_value = st.text_input("Constant Value")
+
+            elif clean_action == "Clean Text":
+                text_column = st.selectbox("Text Column", df.select_dtypes(include='object').columns)
+                cleaning_operation = st.selectbox("Cleaning Operation", ["Remove Special Characters", "Lowercase", "Uppercase", "Remove Extra Spaces"])
+                chars_to_remove = r'[^a-zA-Z0-9\s]'  # default pattern so the apply() below always has a value
+                if cleaning_operation == "Remove Special Characters":
+                    chars_to_remove = st.text_input("Characters to Remove", chars_to_remove)
+
+            # Inside the Apply Transformation button section
+                    elif clean_action == "Handle Missing Values":
+                        if method == "KNN Imputation":
+                            imputer = KNNImputer(n_neighbors=knn_neighbors)
+                            if column_to_impute == "All Columns":
+                                current_df = pd.DataFrame(imputer.fit_transform(current_df), columns=current_df.columns)
+                            else:
+                                current_df[[column_to_impute]] = imputer.fit_transform(current_df[[column_to_impute]])
+                        elif method == "Median Fill":
+                            if column_to_impute == "All Columns":
+                                current_df = current_df.fillna(current_df.median())
+                            else:
+                                current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].median())
+                        elif method == "Mean Fill":
+                            if column_to_impute == "All Columns":
+                                current_df = current_df.fillna(current_df.mean())
+                            else:
+                                current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].mean())
+                        elif method == "Constant Value Fill":
+                            if column_to_impute == "All Columns":
+                                current_df = current_df.fillna(constant_value)
+                            else:
+                                current_df[column_to_impute] = current_df[column_to_impute].fillna(constant_value)
+                        else:
+                            current_df = current_df.dropna()
+
+                    elif clean_action == "Clean Text":
+                        def clean_text(text, operation, chars_to_remove=r'[^a-zA-Z0-9\s]'):
+                            if operation == "Remove Special Characters":
+                                text = re.sub(chars_to_remove, '', str(text))  # re is imported at top
+                            elif operation == "Lowercase":
+                                text = str(text).lower()
+                            elif operation == "Uppercase":
+                                text = str(text).upper()
+                            elif operation == "Remove Extra Spaces":
+                                text = " ".join(str(text).split())
+                            return text
+
+                        current_df[text_column] = current_df[text_column].apply(lambda x: clean_text(x, cleaning_operation, chars_to_remove))
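Review note: st.text_input always returns a string, so Constant Value Fill as written will silently upcast numeric columns to object dtype. A hedged sketch of a coercion guard (coerce_fill_value is a hypothetical helper, not in this commit):

    import pandas as pd

    def coerce_fill_value(series: pd.Series, raw: str):
        # Cast the text-input constant to the column's dtype where possible.
        if pd.api.types.is_numeric_dtype(series):
            try:
                return float(raw)
            except ValueError:
                pass  # fall through and fill with the raw string
        return raw

    s = pd.Series([1.0, None, 3.0])
    filled = s.fillna(coerce_fill_value(s, "2"))  # stays float64 instead of object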
 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Analysis")

             st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
             st.error(f"An error occurred while generating the plot: {e}")
+        with st.expander("🧪 Hypothesis Testing"):
+            test_type = st.selectbox("Select Test Type", ["T-test", "Chi-Squared Test"])
+
+            if test_type == "T-test":
+                col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
+                col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
+                if st.button("Run T-test"):
+                    # Example: split data by category and perform a two-sample t-test
+                    groups = df.groupby(col2)[col1].apply(list)
+                    if len(groups) == 2:
+                        t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
+                        st.write(f"T-statistic: {t_stat:.4f}")
+                        st.write(f"P-value: {p_value:.4f}")
+                        if p_value < 0.05:
+                            st.write("Reject the null hypothesis.")
+                        else:
+                            st.write("Fail to reject the null hypothesis.")
+                    else:
+                        st.write("Select a categorical column with exactly two categories.")

 elif app_mode == "Model Training":
     st.title("🚂 Model Training")

+        feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])

+        if model_name == "Random Forest":
+            param_grid = {
+                # GridSearchCV expects lists of candidate values, so each slider choice is wrapped in a list
+                'n_estimators': [st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest.")],
+                'max_depth': [st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree.")],
+                'min_samples_split': [st.slider("Minimum Samples Split", 2, 10, 2, help="Minimum samples required to split an internal node")],  # new hyperparameter
+                'min_samples_leaf': [st.slider("Minimum Samples Leaf", 1, 10, 1, help="Minimum samples required to be at a leaf node")],  # new hyperparameter
+            }

+        # Inside the Train Model button
+        if st.button("Train Model"):
+            # Feature Selection
+            if feature_selection_method == "SelectKBest":
+                k = st.slider("Number of Features to Select", 1, len(feature_columns), len(feature_columns))  # note: widgets created inside a button branch reset on the next rerun (see the sketch below)
+                selector = SelectKBest(k=k)
+                X_train_selected = selector.fit_transform(X_train_processed, y_train)
+                X_test_selected = selector.transform(X_test_processed)
+            else:
+                X_train_selected = X_train_processed
+                X_test_selected = X_test_processed
+            # Model Training and Hyperparameter Tuning
+            if model_name == "Linear Regression":
+                model = LinearRegression()
+            elif model_name == "Logistic Regression":
+                model = LogisticRegression(max_iter=1000)
+            elif model_name == "Decision Tree":
+                if problem_type == "Regression":
+                    model = DecisionTreeRegressor()
+                else:
+                    model = DecisionTreeClassifier()
+            elif model_name == "Random Forest":
+                if problem_type == "Regression":
+                    model = RandomForestRegressor(random_state=42)
+                    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')  # example scoring
+                    grid_search.fit(X_train_selected, y_train)
+                    model = grid_search.best_estimator_
+                    st.write("Best Parameters:", grid_search.best_params_)
+                else:
+                    model = RandomForestClassifier(random_state=42)
+                    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
+                    grid_search.fit(X_train_selected, y_train)
+                    model = grid_search.best_estimator_
+                    st.write("Best Parameters:", grid_search.best_params_)
+
+            elif model_name == "Gradient Boosting":
+                model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
+            elif model_name == "SVM":
+                model = SVR() if problem_type == "Regression" else SVC()
+
+            # Cross-validation
+            cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5)  # example; adjust cv
+            st.write(f"Cross-validation scores: {cv_scores}")
+            st.write(f"Mean cross-validation score: {cv_scores.mean():.4f}")
+
+            model.fit(X_train_selected, y_train)
+
+            # Model Saving
+            model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
+            if st.button("Save Model"):  # note: a button nested inside another button's branch cannot fire before the rerun resets it (see the sketch below)
+                try:
+                    joblib.dump(st.session_state.model, f"{model_filename}.joblib")
+                    st.success(f"Model saved as {model_filename}.joblib")
+                except Exception as e:
+                    st.error(f"Error saving model: {e}")
+            # Model loading in a different section
+            model_file = st.file_uploader("Upload Trained Model", type=["joblib"])
+            if model_file is not None:
+                try:
+                    st.session_state.model = joblib.load(model_file)
+                    st.success("Model loaded successfully!")
+                except Exception as e:
+                    st.error(f"Error loading model: {e}")
+
+            # Model Evaluation Section
+            y_pred = model.predict(X_test_selected)
+
+            if problem_type == "Regression":
+                mse = mean_squared_error(y_test, y_pred)
+                r2 = r2_score(y_test, y_pred)
+                st.write(f"Mean Squared Error: {mse:.4f}")
+                st.write(f"R-squared: {r2:.4f}")
+            else:
+                accuracy = accuracy_score(y_test, y_pred)
+                st.write(f"Accuracy: {accuracy:.4f}")

 elif app_mode == "Predictions":
     st.title("🔮 Make Predictions")

     else:
         st.write("Please train a model first in the 'Model Training' section.")
+    # Batch prediction section in the Predictions tab
+    st.subheader("Batch Predictions")
+    batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
+    if batch_file is not None:
+        try:
+            batch_df = pd.read_csv(batch_file)
+            # Preprocess the batch data
+            batch_processed = st.session_state.preprocessor.transform(batch_df)
+            # Make predictions
+            batch_predictions = st.session_state.model.predict(batch_processed)
+            batch_df['Prediction'] = batch_predictions
+            st.dataframe(batch_df)
+
+            # Download predictions
+            csv = batch_df.to_csv(index=False)
+            b64 = base64.b64encode(csv.encode()).decode()  # base64-encode the CSV for a data-URI download link
+            href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
+            st.markdown(href, unsafe_allow_html=True)
+
+        except Exception as e:
+            st.error(f"Error processing batch file: {e}")

 elif app_mode == "Visualization Lab":
     st.title("🔬 Advanced Data Visualization and Clustering Lab")

             st.success("Clustering applied successfully!")
         except Exception as e:
             st.error(f"An error occurred during clustering: {e}")
+        # Clustering performance in the clustering analysis
+        if len(cluster_cols) >= 2:  # evaluate silhouette score
+            try:
+                silhouette_avg = silhouette_score(scaled_data, clusters)
+                st.write(f"Silhouette Score: {silhouette_avg:.4f}")
+            except Exception:
+                st.write("Could not compute silhouette score")
+
+        # Dimensionality reduction option and 2D/3D plots
+        dimension_reduction = st.selectbox("Dimensionality Reduction", ["None", "PCA"])
+        if dimension_reduction == "PCA":
+            n_components = st.slider("Number of Components", 2, min(3, len(cluster_cols)), 2)
+            pca = PCA(n_components=n_components)
+            principal_components = pca.fit_transform(scaled_data)
+            pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i + 1}' for i in range(n_components)])
+            pca_df['Cluster'] = clusters  # add cluster labels
+
+        if len(cluster_cols) >= 2:  # plotting section
+            fig = None  # initialize fig
+            if dimension_reduction == "None":
+                if len(cluster_cols) == 2:
+                    fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
+                    st.plotly_chart(fig, use_container_width=True)
+                elif len(cluster_cols) == 3:
+                    fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
+                    st.plotly_chart(fig, use_container_width=True)
+                else:
+                    st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
+
+            elif dimension_reduction == "PCA":
+                if n_components == 2:
+                    fig = px.scatter(pca_df, x='PC1', y='PC2', color='Cluster', title="K-Means Clustering (PCA - 2D)")
+                    st.plotly_chart(fig, use_container_width=True)
+                elif n_components == 3:
+                    fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color='Cluster', title="K-Means Clustering (PCA - 3D)")
+                    st.plotly_chart(fig, use_container_width=True)
+                else:
+                    st.write("PCA visualization is only supported for 2 or 3 components.")

 elif app_mode == "Neural Network Studio":
     st.title("🧠 Neural Network Studio")

         test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the data to use for testing.")

         # Model Training Button
+        if st.button("Train Neural Network Model"):
             with st.spinner("Training neural network model..."):
                 try:
                     # Split data

                     ('onehot', OneHotEncoder(handle_unknown='ignore'))
                 ])

+                numeric_features = X_train.select_dtypes(include=np.number).columns
+                categorical_features = X_train.select_dtypes(include='object').columns

                 preprocessor = ColumnTransformer(
                     transformers=[
                 X_test_processed = preprocessor.transform(X_test)

                 # Neural Network Model Selection and Training
+                tf.random.set_seed(42)  # for reproducibility (assumes tensorflow is imported as tf)
+
+                # Callbacks (Early Stopping)
+                early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
+
                 if model_type == "Simple Neural Network":
                     model = keras.Sequential()
                     model.add(layers.Input(shape=(X_train_processed.shape[1],)))
                     for _ in range(hidden_layers):
+                        model.add(layers.Dense(neurons_per_layer, activation=activation))  # use the selected activation
+                    model.add(
+                        layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+                                     activation='linear' if problem_type == "Regression" else 'softmax'))

+                    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)  # use the selected learning rate
+
+                    model.compile(optimizer=optimizer,
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

+                    history = model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size,
+                                        validation_split=0.2, verbose=0,
+                                        callbacks=[early_stopping])  # added early stopping

                     y_pred = model.predict(X_test_processed)
                     if problem_type == "Classification":

                     X_test_cnn = np.expand_dims(X_test_processed, axis=2)

                     model = keras.Sequential()
+                    model.add(layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu',
+                                            input_shape=(X_train_cnn.shape[1], 1)))
+                    model.add(layers.MaxPooling1D(pool_size=pooling_size))
                     model.add(layers.Flatten())
                     model.add(layers.Dense(50, activation='relu'))
+                    model.add(
+                        layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+                                     activation='linear' if problem_type == "Regression" else 'softmax'))

+                    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
+                    model.compile(optimizer=optimizer,
                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

+                    history = model.fit(X_train_cnn, y_train, epochs=epochs_cnn, batch_size=batch_size_cnn,
+                                        validation_split=0.2, verbose=0,
+                                        callbacks=[early_stopping])

                     y_pred = model.predict(X_test_cnn)
                     if problem_type == "Classification":

                 elif model_type == "Recurrent Neural Network (RNN)":
                     try:
+                        X_train_rnn = np.reshape(X_train_processed, (
+                            X_train_processed.shape[0], sequence_length,
+                            X_train_processed.shape[1] // sequence_length))  # note: fails unless the feature count divides evenly (see the sketch below)
+                        X_test_rnn = np.reshape(X_test_processed, (
+                            X_test_processed.shape[0], sequence_length, X_test_processed.shape[1] // sequence_length))

                         model = keras.Sequential()
+                        model.add(layers.SimpleRNN(units, activation='relu',  # use the selected units
+                                                   dropout=dropout_rate,
+                                                   input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
+                        model.add(
+                            layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+                                         activation='linear' if problem_type == "Regression" else 'softmax'))
+
+                        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
+                        model.compile(optimizer=optimizer,
                                       loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                                       metrics=['mae'] if problem_type == "Regression" else ['accuracy'])

+                        history = model.fit(X_train_rnn, y_train, epochs=epochs_rnn, batch_size=batch_size_rnn,
+                                            validation_split=0.2, verbose=0,
+                                            callbacks=[early_stopping])

                         y_pred = model.predict(X_test_rnn)
                         if problem_type == "Classification":
1026
  st.write("Classification Report:")
1027
  st.text(classification_report(y_test, y_pred))
1028
 
1029
+ # Visualization
1030
+ st.subheader("Training History")
1031
+ fig, ax = plt.subplots() # Use matplotlib directly
1032
+
1033
+ ax.plot(history.history['loss'], label='loss')
1034
+ ax.plot(history.history['val_loss'], label='val_loss')
1035
+ ax.set_xlabel('Epoch')
1036
+ ax.set_ylabel('Loss')
1037
+ ax.legend()
1038
+ st.pyplot(fig) # Display with st.pyplot
1039
+
1040
  st.success("Model trained successfully!")
1041
 
1042
+ except Exception as e:
1043
+ st.error(f"An error occurred during training: {e}")
1044
+
1045
  except Exception as e:
1046
  st.error(f"An error occurred during training: {e}")