Spaces:

CosmickVisions
/

Data-Vision

Sleeping

App Files Files Community

CosmickVisions committed on Feb 27

Commit

7ec0dc1

verified ·

1 Parent(s): 41390aa

Update app.py

Browse files

Files changed (1) hide show

app.py +394 -152

app.py CHANGED Viewed

@@ -2,15 +2,20 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import plotly.express as px
-from sklearn.model_selection import train_test_split
-from sklearn.linear_model import LinearRegression
-from sklearn.tree import DecisionTreeRegressor
-from sklearn.metrics import mean_squared_error, r2_score
 from sklearn.impute import KNNImputer
-from sklearn.preprocessing import RobustScaler
 from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
 from io import StringIO
 # Configuration
 st.set_page_config(page_title="Data Wizard Pro", layout="wide", page_icon="🧙")
@@ -25,12 +30,12 @@ st.markdown(
             color: #e0e0ff; /* Light text */
             font-family: 'Courier New', monospace; /* Monospace font */
         }
         /* Main content area */
         .stApp {
             background-color: #0a0a1a; /* Match body background */
         }
         /* Containers and blocks */
         .st-emotion-cache-16idsys,
         .st-emotion-cache-1v0mbdj,
@@ -46,44 +51,44 @@ st.markdown(
             box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5); /* Enhanced shadow */
             color: #e0e0ff; /* Light text color */
         }
         /* Sidebar */
         .st-bb {
             background-color: #141422; /* Dark sidebar background */
             padding: 20px;
             border-radius: 10px;
         }
         /* Headers */
         h1, h2, h3, h4, h5, h6, .st-bb {
             color: #00f7ff; /* Cyan color for headers */
         }
         /* Selectboxes and Buttons */
         .st-cb, .st-ci, .st-cj, .st-ch {
             background-color: #141422; /* Dark selectbox background */
             color: #00f7ff !important; /* Cyan text color */
             border: 1px solid #00f7ff; /* Cyan border */
         }
         /* Selectbox text */
         .st-cv {
             color: #00f7ff !important; /* Cyan color for selectbox text */
         }
         /* Number input and text input */
         .st-cr {
             background-color: #141422 !important; /* Dark input background */
             color: #00f7ff !important; /* Cyan text color */
             border: 1px solid #00f7ff !important; /* Cyan border */
         }
         /* Slider */
         .st-cw {
             background-color: #141422 !important; /* Dark slider background */
             border: 1px solid #00f7ff !important; /* Cyan border */
         }
         /* Buttons */
         .st-bz, .st-b0 {
             background-color: #141422; /* Darker Button background */
@@ -95,7 +100,7 @@ st.markdown(
             background-color: #00f7ff; /* Hover color */
             color: #0a0a1a; /* Hover text color */
         }
         /* File uploader */
         .st-ae {
             background-color: #141422 !important; /* Dark file uploader background */
@@ -103,24 +108,21 @@ st.markdown(
             border: 1px solid #00f7ff !important; /* Cyan border */
             border-radius: 10px; /* Rounded corners */
         }
         /* Metric */
         .st-emotion-cache-10trblm {
             border-radius: 10px !important; /* Rounded corners */
             box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5) !important; /* Enhanced shadow */
         }
         /* Dataframes and tables */
         .dataframe {
             background-color: #1e1e30 !important; /* Dark table background */
             color: #e0e0ff !important; /* Light text in tables */
             border: 1px solid #00f7ff !important; /* Cyan border for tables */
         }
         .dataframe tr:nth-child(odd) {
             background-color: #141422 !important; /* Alternating row color */
         }
         /* Expanders*/
         .st-emotion-cache-10oheav {
             color: #00f7ff !important; /* Cyan text color */
@@ -142,10 +144,20 @@ st.markdown(
 # Cache decorators
 @st.cache_data(ttl=3600)
 def load_data(uploaded_file):
-    """Load and cache dataset"""
-    if uploaded_file.name.endswith('.csv'):
-        return pd.read_csv(uploaded_file)
-    return pd.read_excel(uploaded_file)
 @st.cache_data(ttl=3600)
 def generate_profile(df):
@@ -161,12 +173,14 @@ if 'train_test' not in st.session_state:
     st.session_state.train_test = {}
 if 'model' not in st.session_state:
     st.session_state.model = None
 # Sidebar Navigation
 st.sidebar.title("🔮 Data Wizard Pro")
 app_mode = st.sidebar.radio("Navigate", [
-    "Data Upload",
-    "Smart Cleaning",
     "Advanced EDA",
     "Model Training",
     "Predictions",
@@ -176,35 +190,36 @@ app_mode = st.sidebar.radio("Navigate", [
 # Data Upload Section
 if app_mode == "Data Upload":
     st.title("📤 Data Upload & Analysis")
     uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx"])
     if uploaded_file:
         df = load_data(uploaded_file)
-        st.session_state.raw_data = df
-        st.session_state.cleaned_data = df.copy()
-        # Data Overview Cards
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.metric("Rows", df.shape[0])
-        with col2:
-            st.metric("Columns", df.shape[1])
-        with col3:
-            st.metric("Missing Values", df.isna().sum().sum())
-        # Automated EDA Report
-        with st.expander("🚀 Automated Data Report"):
-            if st.button("Generate Smart Report"):
-                pr = generate_profile(df)
-                st_profile_report(pr)
 # Smart Cleaning Section
 elif app_mode == "Smart Cleaning":
     st.title("🧼 Intelligent Data Cleaning")
     if st.session_state.raw_data is not None:
         df = st.session_state.cleaned_data
         # Cleaning Toolkit
         col1, col2 = st.columns([1, 3])
         with col1:
@@ -213,9 +228,10 @@ elif app_mode == "Smart Cleaning":
                 "Handle Missing Values",
                 "Remove Duplicates",
                 "Normalize Data",
-                "Encode Categories"
             ])
             if clean_action == "Handle Missing Values":
                 method = st.selectbox("Imputation Method", [
                     "KNN Imputation",
@@ -223,160 +239,383 @@ elif app_mode == "Smart Cleaning":
                     "Mean Fill",
                     "Drop Missing"
                 ])
         with col2:
             if st.button("Apply Transformation"):
                 with st.spinner("Applying changes..."):
                     if clean_action == "Handle Missing Values":
-                        if method == "KNN Imputation":
-                            imputer = KNNImputer()
-                            df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
-                        elif method == "Median Fill":
-                            df = df.fillna(df.median())
-                        elif method == "Mean Fill":
-                            df = df.fillna(df.mean())
                         else:
-                            df = df.dropna()
                     elif clean_action == "Remove Duplicates":
                         df = df.drop_duplicates()
                     elif clean_action == "Normalize Data":
-                        scaler = RobustScaler()
-                        numerical_cols = df.select_dtypes(include=np.number).columns
-                        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
                     st.session_state.cleaned_data = df
                     st.success("Transformation applied!")
         # Data Comparison
         st.subheader("Data Version Comparison")
         col1, col2 = st.columns(2)
         with col1:
-            st.write("Original Data", st.session_state.raw_data.head(3))
         with col2:
             st.write("Cleaned Data", df.head(3))
 # Advanced EDA Section
 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Analysis")
     if st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data
         # Visualization Selector
         plot_type = st.selectbox("Choose Visualization", [
-            "Histogram",
             "Scatter Plot",
             "Box Plot",
             "Correlation Heatmap",
-            "3D Scatter"
         ])
         # Dynamic Axis Selection
         cols = st.columns(3)
         with cols[0]:
             x_col = st.selectbox("X Axis", df.columns)
         with cols[1]:
-            y_col = st.selectbox("Y Axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot"] else None
         with cols[2]:
             z_col = st.selectbox("Z Axis", df.columns) if plot_type == "3D Scatter" else None
         # Generate Plot
         if st.button("Generate Visualization"):
-            if plot_type == "Histogram":
-                fig = px.histogram(df, x=x_col, nbins=30, template="plotly_dark")
-            elif plot_type == "Scatter Plot":
-                fig = px.scatter(df, x=x_col, y=y_col, color_discrete_sequence=['#00f7ff'])
-            elif plot_type == "3D Scatter":
-                fig = px.scatter_3d(df, x=x_col, y=y_col, z=z_col, color=x_col)
-            elif plot_type == "Correlation Heatmap":
-                corr = df.corr()
-                fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu')
-            elif plot_type == "Box Plot":
-                fig = px.box(df,x=x_col, y=y_col, color_discrete_sequence=['#00f7ff'])
-            fig.update_layout(
-                plot_bgcolor="#1e1e30",
-                paper_bgcolor="#1e1e30",
-                font_color="#e0e0ff"
-            )
-            st.plotly_chart(fig, use_container_width=True)
 # Model Training Section
 elif app_mode == "Model Training":
     st.title("🤖 Model Training Studio")
     if st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data
         # Model Setup
         col1, col2 = st.columns([1, 3])
         with col1:
-            model_type = st.selectbox("Choose Model", [
-                "Linear Regression",
-                "Decision Tree"
-            ])
             test_size = st.slider("Test Size", 0.1, 0.5, 0.2)
             target = st.selectbox("Target Variable", df.columns)
         with col2:
             if st.button("Train Model"):
-                X = df.drop(columns=[target])
-                y = df[target]
-                X_train, X_test, y_train, y_test = train_test_split(
-                    X, y, test_size=test_size, random_state=42
-                )
-                if model_type == "Linear Regression":
-                    model = LinearRegression()
-                elif model_type == "Decision Tree":
-                    model = DecisionTreeRegressor()
-                model.fit(X_train, y_train)
-                st.session_state.model = model
-                st.session_state.train_test = {
-                    'X_test': X_test,
-                    'y_test': y_test
-                }
-                # Evaluation Metrics
-                y_pred = model.predict(X_test)
-                st.metric("R² Score", round(r2_score(y_test, y_pred), 2))
-                st.metric("MSE", round(mean_squared_error(y_test, y_pred), 2))
 # Predictions Section
 elif app_mode == "Predictions":
     st.title("🔮 Make Predictions")
-    if st.session_state.model is not None:
-        model = st.session_state.model
         # Prediction Interface
         input_data = {}
-        for col in st.session_state.train_test['X_test'].columns:
-            input_data[col] = st.number_input(col, value=0.0)
         if st.button("Predict"):
-            input_df = pd.DataFrame([input_data])
-            prediction = model.predict(input_df)
-            st.success(f"Predicted Value: {prediction[0]:.2f}")
 elif app_mode == "Visualization Lab":
     st.title("📊 Advanced Visualization Lab")
     if st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data
         # Visualization Gallery
         viz_type = st.selectbox("Choose Visualization Type", [
             "3D Scatter Plot",
             "Interactive Heatmap",
             "Time Series Analysis",
-            "Cluster Analysis"
         ])
         # Dynamic Controls
         cols = st.columns(3)
         with cols[0]:
@@ -385,23 +624,26 @@ elif app_mode == "Visualization Lab":
             y_axis = st.selectbox("Y Axis", df.columns)
         with cols[2]:
             z_axis = st.selectbox("Z Axis", df.columns) if viz_type == "3D Scatter Plot" else None
         # Generate Visualization
-        if viz_type == "3D Scatter Plot":
-            fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, color=x_axis)
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Interactive Heatmap":
-            corr = df.corr()
-            fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu')
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Time Series Analysis":
-            # Basic time series plot
-            time_col = st.selectbox("Time Column", df.columns)
-            value_col = st.selectbox("Value Column", df.columns)
-            fig = px.line(df, x=time_col, y=value_col)
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Cluster Analysis":
-            st.write("Cluster Analysis Feature Coming Soon!")  # placeholder for future development

 import pandas as pd
 import numpy as np
 import plotly.express as px
+from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
+from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
+from sklearn.svm import SVR, SVC
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
 from sklearn.impute import KNNImputer
+from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
 from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
 from io import StringIO
+import joblib  # For saving and loading models
 # Configuration
 st.set_page_config(page_title="Data Wizard Pro", layout="wide", page_icon="🧙")
             color: #e0e0ff; /* Light text */
             font-family: 'Courier New', monospace; /* Monospace font */
         }
         /* Main content area */
         .stApp {
             background-color: #0a0a1a; /* Match body background */
         }
         /* Containers and blocks */
         .st-emotion-cache-16idsys,
         .st-emotion-cache-1v0mbdj,
             box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5); /* Enhanced shadow */
             color: #e0e0ff; /* Light text color */
         }
         /* Sidebar */
         .st-bb {
             background-color: #141422; /* Dark sidebar background */
             padding: 20px;
             border-radius: 10px;
         }
         /* Headers */
         h1, h2, h3, h4, h5, h6, .st-bb {
             color: #00f7ff; /* Cyan color for headers */
         }
         /* Selectboxes and Buttons */
         .st-cb, .st-ci, .st-cj, .st-ch {
             background-color: #141422; /* Dark selectbox background */
             color: #00f7ff !important; /* Cyan text color */
             border: 1px solid #00f7ff; /* Cyan border */
         }
         /* Selectbox text */
         .st-cv {
             color: #00f7ff !important; /* Cyan color for selectbox text */
         }
         /* Number input and text input */
         .st-cr {
             background-color: #141422 !important; /* Dark input background */
             color: #00f7ff !important; /* Cyan text color */
             border: 1px solid #00f7ff !important; /* Cyan border */
         }
         /* Slider */
         .st-cw {
             background-color: #141422 !important; /* Dark slider background */
             border: 1px solid #00f7ff !important; /* Cyan border */
         }
         /* Buttons */
         .st-bz, .st-b0 {
             background-color: #141422; /* Darker Button background */
             background-color: #00f7ff; /* Hover color */
             color: #0a0a1a; /* Hover text color */
         }
         /* File uploader */
         .st-ae {
             background-color: #141422 !important; /* Dark file uploader background */
             border: 1px solid #00f7ff !important; /* Cyan border */
             border-radius: 10px; /* Rounded corners */
         }
         /* Metric */
         .st-emotion-cache-10trblm {
             border-radius: 10px !important; /* Rounded corners */
             box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5) !important; /* Enhanced shadow */
         }
         /* Dataframes and tables */
         .dataframe {
             background-color: #1e1e30 !important; /* Dark table background */
             color: #e0e0ff !important; /* Light text in tables */
             border: 1px solid #00f7ff !important; /* Cyan border for tables */
         }
         .dataframe tr:nth-child(odd) {
             background-color: #141422 !important; /* Alternating row color */
         }
         /* Expanders*/
         .st-emotion-cache-10oheav {
             color: #00f7ff !important; /* Cyan text color */
 # Cache decorators
 @st.cache_data(ttl=3600)
 def load_data(uploaded_file):
+    """Load and cache dataset, with file type validation."""
+    if uploaded_file is not None:
+        file_extension = uploaded_file.name.split(".")[-1].lower()
+        if file_extension == "csv":
+            return pd.read_csv(uploaded_file)
+        elif file_extension in ["xlsx", "xls"]:
+            return pd.read_excel(uploaded_file)
+        else:
+            st.error("Unsupported file type. Please upload a CSV or Excel file.")
+            return None
+    else:
+        return None
 @st.cache_data(ttl=3600)
 def generate_profile(df):
     st.session_state.train_test = {}
 if 'model' not in st.session_state:
     st.session_state.model = None
+if 'preprocessor' not in st.session_state:
+    st.session_state.preprocessor = None # to store the column transformer
 # Sidebar Navigation
 st.sidebar.title("🔮 Data Wizard Pro")
 app_mode = st.sidebar.radio("Navigate", [
+    "Data Upload",
+    "Smart Cleaning",
     "Advanced EDA",
     "Model Training",
     "Predictions",
 # Data Upload Section
 if app_mode == "Data Upload":
     st.title("📤 Data Upload & Analysis")
     uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx"])
     if uploaded_file:
         df = load_data(uploaded_file)
+        if df is not None: # only proceed if load_data returned a valid dataframe
+            st.session_state.raw_data = df
+            st.session_state.cleaned_data = df.copy()
+            # Data Overview Cards
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.metric("Rows", df.shape[0])
+            with col2:
+                st.metric("Columns", df.shape[1])
+            with col3:
+                st.metric("Missing Values", df.isna().sum().sum())
+            # Automated EDA Report
+            with st.expander("🚀 Automated Data Report"):
+                if st.button("Generate Smart Report"):
+                    pr = generate_profile(df)
+                    st_profile_report(pr)
 # Smart Cleaning Section
 elif app_mode == "Smart Cleaning":
     st.title("🧼 Intelligent Data Cleaning")
     if st.session_state.raw_data is not None:
         df = st.session_state.cleaned_data
         # Cleaning Toolkit
         col1, col2 = st.columns([1, 3])
         with col1:
                 "Handle Missing Values",
                 "Remove Duplicates",
                 "Normalize Data",
+                "Encode Categories",
+                "Outlier Removal"
             ])
             if clean_action == "Handle Missing Values":
                 method = st.selectbox("Imputation Method", [
                     "KNN Imputation",
                     "Mean Fill",
                     "Drop Missing"
                 ])
+                impute_cols = st.multiselect("Columns to Impute", df.columns)
+            elif clean_action == "Normalize Data":
+                scaler_type = st.selectbox("Scaler Type", ["RobustScaler", "StandardScaler"])
+                normalize_cols = st.multiselect("Columns to Normalize", df.select_dtypes(include=np.number).columns.tolist())
+            elif clean_action == "Encode Categories":
+                encode_cols = st.multiselect("Columns to Encode", df.select_dtypes(include='object').columns.tolist())
+                encoding_method = st.selectbox("Encoding Method", ["OneHotEncoder"])  # Add more if needed
+            elif clean_action == "Outlier Removal":
+                outlier_cols = st.multiselect("Columns to Remove Outliers From", df.select_dtypes(include=np.number).columns.tolist())
+                outlier_method = st.selectbox("Outlier Removal Method", ["IQR", "Z-score"])
+                if outlier_method == "IQR":
+                    iqr_threshold = st.slider("IQR Threshold", 1.0, 3.0, 1.5)
+                else:
+                    zscore_threshold = st.slider("Z-score Threshold", 2.0, 4.0, 3.0)
         with col2:
             if st.button("Apply Transformation"):
                 with st.spinner("Applying changes..."):
                     if clean_action == "Handle Missing Values":
+                        if not impute_cols:
+                            st.warning("Please select columns to impute.")
                         else:
+                            if method == "KNN Imputation":
+                                imputer = KNNImputer()
+                                df[impute_cols] = imputer.fit_transform(df[impute_cols])
+                            elif method == "Median Fill":
+                                df[impute_cols] = df[impute_cols].fillna(df[impute_cols].median())
+                            elif method == "Mean Fill":
+                                df[impute_cols] = df[impute_cols].fillna(df[impute_cols].mean())
+                            else:
+                                df = df.dropna(subset=impute_cols)
                     elif clean_action == "Remove Duplicates":
                         df = df.drop_duplicates()
                     elif clean_action == "Normalize Data":
+                        if not normalize_cols:
+                            st.warning("Please select columns to normalize.")
+                        else:
+                            if scaler_type == "RobustScaler":
+                                scaler = RobustScaler()
+                            else:
+                                scaler = StandardScaler()
+                            df[normalize_cols] = scaler.fit_transform(df[normalize_cols])
+                    elif clean_action == "Encode Categories":
+                        if not encode_cols:
+                            st.warning("Please select columns to encode.")
+                        else:
+                            if encoding_method == "OneHotEncoder":
+                                encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
+                                encoded_data = encoder.fit_transform(df[encode_cols])
+                                encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(encode_cols))
+                                df = pd.concat([df.drop(columns=encode_cols), encoded_df], axis=1)
+                    elif clean_action == "Outlier Removal":
+                        if not outlier_cols:
+                            st.warning("Please select columns to remove outliers from.")
+                        else:
+                            for col in outlier_cols:
+                                if outlier_method == "IQR":
+                                    Q1 = df[col].quantile(0.25)
+                                    Q3 = df[col].quantile(0.75)
+                                    IQR = Q3 - Q1
+                                    lower_bound = Q1 - iqr_threshold * IQR
+                                    upper_bound = Q3 + iqr_threshold * IQR
+                                    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
+                                else:  # Z-score
+                                    z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
+                                    df = df[z_scores <= zscore_threshold]
                     st.session_state.cleaned_data = df
                     st.success("Transformation applied!")
         # Data Comparison
         st.subheader("Data Version Comparison")
         col1, col2 = st.columns(2)
         with col1:
+            st.write("Original Data", st.session_state.raw_data.head(3) if st.session_state.raw_data is not None else "No data uploaded")
         with col2:
             st.write("Cleaned Data", df.head(3))
 # Advanced EDA Section
 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Analysis")
     if st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data
         # Visualization Selector
         plot_type = st.selectbox("Choose Visualization", [
+            "Histogram",
             "Scatter Plot",
             "Box Plot",
             "Correlation Heatmap",
+            "3D Scatter",
+            "Violin Plot",
+            "Time Series"
         ])
         # Dynamic Axis Selection
         cols = st.columns(3)
         with cols[0]:
             x_col = st.selectbox("X Axis", df.columns)
         with cols[1]:
+            y_col = st.selectbox("Y Axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series"] else None
         with cols[2]:
             z_col = st.selectbox("Z Axis", df.columns) if plot_type == "3D Scatter" else None
+        if plot_type == "Time Series":
+            time_col = x_col # rename for clarity
+            value_col = y_col
+        #Interactive filtering
+        filter_col = st.selectbox("Filter Column", [None] + list(df.columns))
+        if filter_col:
+            unique_values = df[filter_col].unique()
+            filter_options = st.multiselect("Filter Values", unique_values, default=unique_values)
+            df = df[df[filter_col].isin(filter_options)]
         # Generate Plot
         if st.button("Generate Visualization"):
+            try:  # add try-except block for potential errors
+                if plot_type == "Histogram":
+                    fig = px.histogram(df, x=x_col, nbins=30, template="plotly_dark")
+                elif plot_type == "Scatter Plot":
+                    fig = px.scatter(df, x=x_col, y=y_col, color_discrete_sequence=['#00f7ff'])
+                elif plot_type == "3D Scatter":
+                    fig = px.scatter_3d(df, x=x_col, y=y_col, z=z_col, color=x_col)
+                elif plot_type == "Correlation Heatmap":
+                    corr = df.corr(numeric_only=True) #handle non-numeric cols
+                    fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu')
+                elif plot_type == "Box Plot":
+                    fig = px.box(df,x=x_col, y=y_col, color_discrete_sequence=['#00f7ff'])
+                elif plot_type == "Violin Plot":
+                    fig = px.violin(df, x=x_col, y=y_col, color_discrete_sequence=['#00f7ff'])
+                elif plot_type == "Time Series":
+                     fig = px.line(df, x=time_col, y=value_col)
+                fig.update_layout(
+                    plot_bgcolor="#1e1e30",
+                    paper_bgcolor="#1e1e30",
+                    font_color="#e0e0ff"
+                )
+                st.plotly_chart(fig, use_container_width=True)
+            except Exception as e:
+                st.error(f"Error generating plot: {e}")
 # Model Training Section
 elif app_mode == "Model Training":
     st.title("🤖 Model Training Studio")
     if st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data
+        # Check for missing values before proceeding
+        if df.isnull().sum().sum() > 0:
+            st.error("Data contains missing values. Please handle them in the 'Smart Cleaning' section before training.")
+            st.stop()
         # Model Setup
         col1, col2 = st.columns([1, 3])
         with col1:
+            task_type = st.selectbox("Choose Task", ["Regression", "Classification"])
+            if task_type == "Regression":
+                model_type = st.selectbox("Choose Model", [
+                    "Linear Regression",
+                    "Decision Tree",
+                    "Random Forest",
+                    "Gradient Boosting"
+                ])
+            else:  # Classification
+                model_type = st.selectbox("Choose Model", [
+                    "Logistic Regression",
+                    "Decision Tree",
+                    "Random Forest",
+                    "Support Vector Machine" #SVC
+                ])
             test_size = st.slider("Test Size", 0.1, 0.5, 0.2)
             target = st.selectbox("Target Variable", df.columns)
+            features = [col for col in df.columns if col != target] #Exclude target
+            numeric_features = df[features].select_dtypes(include=np.number).columns.tolist()
+            categorical_features = [col for col in features if col not in numeric_features]
+            # Hyperparameter tuning options (one set of controls per supported model type)
+            enable_hyperparameter_tuning = st.checkbox("Enable Hyperparameter Tuning")
+            if enable_hyperparameter_tuning and model_type in ["Random Forest", "Gradient Boosting", "Support Vector Machine", "Logistic Regression", "Decision Tree"]: # Add more models later
+                st.write("Hyperparameter Tuning Options:")
+                if model_type == "Random Forest":
+                    n_estimators = st.slider("Number of Estimators", 50, 200, 100)
+                    max_depth = st.slider("Max Depth", 5, 20, None) # NOTE: value=None makes the slider start at min_value (5); it does NOT mean unlimited depth
+                    param_grid = {'n_estimators': [n_estimators], 'max_depth': [max_depth]}
+                elif model_type == "Gradient Boosting":
+                    n_estimators = st.slider("Number of Estimators", 50, 200, 100, key = "gb_n_estimators")
+                    learning_rate = st.slider("Learning Rate", 0.01, 0.1, 0.05, key = "gb_learning_rate")
+                    max_depth = st.slider("Max Depth", 3, 10, 5, key = "gb_max_depth")
+                    param_grid = {'n_estimators': [n_estimators], 'learning_rate': [learning_rate], 'max_depth': [max_depth]}
+                elif model_type == "Support Vector Machine": #SVC/SVR
+                    kernel = st.selectbox("Kernel", ['linear', 'rbf', 'poly'])
+                    C = st.slider("C (Regularization)", 0.1, 1.0, 0.5)
+                    param_grid = {'kernel': [kernel], 'C': [C]}
+                elif model_type == "Logistic Regression":
+                  C = st.slider("C (Regularization)", 0.1, 1.0, 0.5)
+                  param_grid = {'C': [C]} # add more as needed
+                elif model_type == "Decision Tree":
+                    max_depth = st.slider("Max Depth", 5, 20, None)  # NOTE: value=None makes the slider start at min_value (5); it does NOT mean unlimited depth
+                    param_grid = {'max_depth': [max_depth]}
         with col2:
             if st.button("Train Model"):
+                try:
+                    X = df.drop(columns=[target])
+                    y = df[target]
+                    X_train, X_test, y_train, y_test = train_test_split(
+                        X, y, test_size=test_size, random_state=42
+                    )
+                    # Preprocessing
+                    numeric_transformer = StandardScaler() #StandardScaler or other scalers
+                    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False) #sparse=False for array output
+                    preprocessor = ColumnTransformer(
+                        transformers=[
+                            ('num', numeric_transformer, numeric_features),
+                            ('cat', categorical_transformer, categorical_features)
+                        ],
+                        remainder='passthrough'  # or 'drop' if you want to drop untransformed cols
+                    )
+                    X_train = preprocessor.fit_transform(X_train)
+                    X_test = preprocessor.transform(X_test)
+                    st.session_state.preprocessor = preprocessor #store for prediction later
+                    # Model Training
+                    if task_type == "Regression":
+                        if model_type == "Linear Regression":
+                            model = LinearRegression()
+                        elif model_type == "Decision Tree":
+                            model = DecisionTreeRegressor()
+                        elif model_type == "Random Forest":
+                            model = RandomForestRegressor()
+                        elif model_type == "Gradient Boosting":
+                            model = GradientBoostingRegressor()
+                        elif model_type == "Support Vector Machine":
+                            model = SVR()
+                    else: #Classification
+                        if model_type == "Logistic Regression":
+                            model = LogisticRegression(max_iter=1000) #increase max_iter if needed
+                        elif model_type == "Decision Tree":
+                            model = DecisionTreeClassifier()
+                        elif model_type == "Random Forest":
+                            model = RandomForestClassifier()
+                        elif model_type == "Support Vector Machine":
+                            model = SVC(probability=True) #probability=True needed for ROC AUC
+                    #Hyperparameter tuning
+                    if enable_hyperparameter_tuning and model_type in ["Random Forest", "Gradient Boosting", "Support Vector Machine", "Logistic Regression", "Decision Tree"]:
+                        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error' if task_type == "Regression" else 'accuracy')
+                        grid_search.fit(X_train, y_train)
+                        model = grid_search.best_estimator_ #use best model
+                        st.write("Best Parameters:", grid_search.best_params_)
+                    else:
+                        model.fit(X_train, y_train)
+                    st.session_state.model = model
+                    st.session_state.train_test = {
+                        'X_test': X_test,
+                        'y_test': y_test,
+                        'task': task_type #Store task for eval
+                    }
+                    # Evaluation Metrics
+                    y_pred = model.predict(X_test)
+                    if task_type == "Regression":
+                        r2 = r2_score(y_test, y_pred)
+                        mse = mean_squared_error(y_test, y_pred)
+                        mae = mean_absolute_error(y_test, y_pred) #ADDED
+                        st.metric("R² Score", round(r2, 2))
+                        st.metric("MSE", round(mse, 2))
+                        st.metric("MAE", round(mae, 2)) #ADDED
+                    else: #Classification
+                        accuracy = accuracy_score(y_test, y_pred)
+                        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
+                        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
+                        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
+                        try:
+                            roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) #requires probabilities
+                            st.metric("ROC AUC", round(roc_auc, 2))
+                        except:
+                            st.warning("ROC AUC score not available for this classifier.")
+                        st.metric("Accuracy", round(accuracy, 2))
+                        st.metric("Precision", round(precision, 2))
+                        st.metric("Recall", round(recall, 2))
+                        st.metric("F1 Score", round(f1, 2))
+                    #Cross Validation
+                    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error' if task_type == "Regression" else 'accuracy') # use appropriate scoring
+                    st.write("Cross-Validation Scores:", scores)
+                    st.write("Mean Cross-Validation Score:", scores.mean())
+                    #Model persistence
+                    if st.checkbox("Save Model"):
+                        model_filename = st.text_input("Model Filename", "trained_model.joblib")
+                        joblib.dump((model, preprocessor), model_filename) # save both model AND preprocessor
+                        st.success(f"Model saved as {model_filename}")
+                except Exception as e:
+                    st.error(f"Error during training: {e}")
 # Predictions Section
 elif app_mode == "Predictions":
     st.title("🔮 Make Predictions")
+    if st.session_state.model is not None and st.session_state.preprocessor is not None:
+        model, preprocessor = st.session_state.model, st.session_state.preprocessor
+        X_test_cols = st.session_state.train_test['X_test'].shape[1] #get the number of input cols
         # Prediction Interface
         input_data = {}
+        X_test_columns = [f"feature_{i}" for i in range(X_test_cols)]  # Generate placeholder column names
+        input_data = {}
+        for i in range(X_test_cols):
+            input_data[f"feature_{i}"] = st.number_input(f"Feature {i+1}", value=0.0)
+        #for col in st.session_state.train_test['X_test'].columns: # causes error since its preprocessed
         if st.button("Predict"):
+            try:
+                input_df = pd.DataFrame([input_data])
+                # Preprocess input
+                input_processed = preprocessor.transform(input_df)
+                prediction = model.predict(input_processed)
+                if st.session_state.train_test['task'] == "Regression":
+                    st.success(f"Predicted Value: {prediction[0]:.2f}")
+                else:
+                    st.success(f"Predicted Class: {prediction[0]}")
+                    # Show probabilities if it's a classifier
+                    if hasattr(model, "predict_proba"):
+                        proba = model.predict_proba(input_processed)[0]
+                        for i, p in enumerate(proba):
+                            st.write(f"Probability of class {i}: {p:.2f}")
+            except Exception as e:
+                st.error(f"Error during prediction: {e}")
+    else:
+        st.warning("Please train a model first.")
 # Visualization Lab: interactive Plotly charts over the cleaned dataset.
 elif app_mode == "Visualization Lab":
     st.title("📊 Advanced Visualization Lab")
     # Only render controls once cleaned data is present in session state.
     if st.session_state.cleaned_data is not None:
         df = st.session_state.cleaned_data
         # Visualization Gallery
         viz_type = st.selectbox("Choose Visualization Type", [
             "3D Scatter Plot",
             "Interactive Heatmap",
             "Time Series Analysis",
+            "Cluster Analysis (Coming Soon)"  # not implemented yet; the matching branch below only shows a notice
         ])
         # Dynamic Controls
         # NOTE(review): `x_axis` is read by the 3D scatter branch but is not assigned
         # in the lines visible here, and cols[1] is unused — confirm it is defined.
         cols = st.columns(3)
         with cols[0]:
             y_axis = st.selectbox("Y Axis", df.columns)
         with cols[2]:
             # The Z selector is only created for the 3D scatter plot; otherwise z_axis is None.
             z_axis = st.selectbox("Z Axis", df.columns) if viz_type == "3D Scatter Plot" else None
         # Generate Visualization
+        try:  # any failure while building a chart is reported in the UI below
+            if viz_type == "3D Scatter Plot":
+                # Points are colored by the same column used for the X axis.
+                fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, color=x_axis)
+                st.plotly_chart(fig, use_container_width=True)
+            elif viz_type == "Interactive Heatmap":
+                corr = df.corr(numeric_only=True)  # correlation over numeric columns only
+                fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu')
+                st.plotly_chart(fig, use_container_width=True)
+            elif viz_type == "Time Series Analysis":
+                # Basic time series plot; this mode uses its own column selectors
+                # rather than the axis controls above.
+                time_col = st.selectbox("Time Column", df.columns)
+                value_col = st.selectbox("Value Column", df.columns)
+                fig = px.line(df, x=time_col, y=value_col)
+                st.plotly_chart(fig, use_container_width=True)
+            elif viz_type == "Cluster Analysis (Coming Soon)":  # placeholder option from the gallery above
+               st.write("Cluster Analysis Feature Coming Soon!")  # placeholder for future development
+        except Exception as e:
+            st.error(f"Error generating visualization: {e}")