Update app.py
app.py
CHANGED
@@ -524,30 +524,30 @@ elif app_mode == "Data Cleaning":
     # --------------------------
     # Label Encoding
     # --------------------------
-    # --------------------------
-    # Label/One-Hot Encoding
-    # --------------------------
-    enhance_section_title("Encoding Options", "🔢")
-    with st.expander("🔢 Encoding Options"):
[old lines 532-550: not legible in this view]
+    # --------------------------
+    # Label/One-Hot Encoding
+    # --------------------------
+    enhance_section_title("Encoding Options", "🔢")
+    with st.expander("🔢 Encoding Options"):
+        encoding_method = st.radio("Select Encoding Method", ("Label Encoding", "One-Hot Encoding"))
+
+        data_to_encode = st.multiselect("Select categorical columns to encode", df.select_dtypes(include='object').columns)
+        if data_to_encode:
+            if st.button("Apply Encoding"):
+                new_df = df.copy()
+                if encoding_method == "Label Encoding":
+                    label_encoders = {}
+                    for col in data_to_encode:
+                        le = LabelEncoder()
+                        new_df[col] = le.fit_transform(new_df[col].astype(str))
+                        label_encoders[col] = le
+                elif encoding_method == "One-Hot Encoding":
+                    new_df = pd.get_dummies(new_df, columns=data_to_encode, drop_first=True)
+
+                update_cleaned_data(new_df)
+                st.rerun()  # Force re-run after apply
+        except Exception as e:
+            st.error(f"Error: {str(e)}")


     # --------------------------
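For reference, a minimal standalone sketch (not part of app.py; the toy DataFrame and "color" column are invented for illustration) of what the two encoding paths added in this hunk produce:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

toy = pd.DataFrame({"color": ["red", "green", "red", "blue"]})

# Label Encoding: one integer code per category, kept in a single column
le = LabelEncoder()
labeled = toy.copy()
labeled["color"] = le.fit_transform(labeled["color"].astype(str))
# classes_ are sorted alphabetically, so blue=0, green=1, red=2

# One-Hot Encoding: one indicator column per remaining category
one_hot = pd.get_dummies(toy, columns=["color"], drop_first=True)
# columns become color_green and color_red; blue is the dropped baseline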
@@ -585,141 +585,137 @@ with st.expander("🔢 Encoding Options"):
         except Exception as e:
             st.error(f"Error: {str(e)}")

-    # --------------------------
     # Bulk Operations
     # --------------------------
[old lines 591-609: not legible in this view]
-        if bulk_action == "Auto-Clean Common Issues":
-            if st.button("Apply Auto-Clean"):
-                new_df = df.copy()
-                new_df = new_df.dropna(axis=1, how='all')  # Remove empty cols
-                new_df = new_df.convert_dtypes()  # Better type inference
-                text_cols = new_df.select_dtypes(include='object').columns
-                new_df[text_cols] = new_df[text_cols].apply(lambda x: x.str.strip())
-                update_cleaned_data(new_df)
-                st.rerun()  # Force re-run after apply
-
-        if bulk_action == "Drop All Missing Values":
-            if st.button("Apply Drop All Missing"):
-                new_df = df.copy()
-                new_df = new_df.dropna()  # Drop rows with any missing values
-                update_cleaned_data(new_df)
-                st.rerun()  # Force re-run after apply
-
-        if bulk_action == "Fill Missing Values":
-            fill_value = st.text_input("Fill Value (e.g., 0, mean, median)")
-            if st.button("Apply Fill Missing"):
-                new_df = df.copy()
-                if fill_value.lower() == "mean":
-                    new_df = new_df.fillna(new_df.mean())
-                elif fill_value.lower() == "median":
-                    new_df = new_df.fillna(new_df.median())
-                else:
-                    new_df = new_df.fillna(fill_value)
-                update_cleaned_data(new_df)
-                st.rerun()  # Force re-run after apply
-
-        if bulk_action == "One-Hot Encode All Categorical Columns":
-            if st.button("Apply One-Hot Encoding"):
-                new_df = df.copy()
-                categorical_cols = new_df.select_dtypes(include='object').columns
-                new_df = pd.get_dummies(new_df, columns=categorical_cols, drop_first=True)
-                update_cleaned_data(new_df)
-                st.rerun()  # Force re-run after apply
-
-        if bulk_action == "Min-Max Scaling":
-            if st.button("Apply Min-Max Scaling"):
-                new_df = df.copy()
-                scaler = MinMaxScaler()
-                numerical_cols = new_df.select_dtypes(include=np.number).columns
-                new_df[numerical_cols] = scaler.fit_transform(new_df[numerical_cols])
-                update_cleaned_data(new_df)
-                st.rerun()  # Force re-run after apply
-
-        if bulk_action == "Remove Outliers":
-            if st.button("Apply Remove Outliers"):
-                new_df = df.copy()
-                z_scores = np.abs(stats.zscore(new_df.select_dtypes(include=np.number)))
-                new_df = new_df[(z_scores < 3).all(axis=1)]  # Remove rows with z-score > 3
-                update_cleaned_data(new_df)
-                st.rerun()  # Force re-run after apply
-
-        if bulk_action == "Tokenize Text Columns":
-            text_cols = st.multiselect("Select text columns to tokenize", df.select_dtypes(include='object').columns)
-            if text_cols:
-                if st.button("Apply Tokenization"):
-                    tokenizer = Tokenizer()
-                    new_df = df.copy()
[old lines 671-673: not legible in this view]
-                    update_cleaned_data(new_df)
-                    st.rerun()  # Force re-run after apply
-
[old lines 677-678: not legible in this view]
-            if text_cols:
-                if st.button("Apply TF-IDF Vectorization"):
-                    tfidf = TfidfVectorizer()
-                    new_df = df.copy()
-                    new_col = tfidf.fit_transform(new_df[col]).toarray()
-                    new_df = new_df.drop(columns=[col])
-                    new_df = new_df.join(pd.DataFrame(new_col, columns=[f'{col}_{i}' for i in range(new_col.shape[1])]))
-                    update_cleaned_data(new_df)
-                    st.rerun()  # Force re-run after apply
[old lines 689-692: not legible in this view]
-            if st.button("Apply Date Feature Extraction"):
-                new_df = df.copy()
[old lines 695-700: only truncated fragments are legible in this view]
-                update_cleaned_data(new_df)
-                st.rerun()  # Force re-run after apply
[old lines 703-705: not legible in this view]
-            cat_cols = st.multiselect("Select categorical columns to encode", df.select_dtypes(include='object').columns)
-            if cat_cols:
-                if st.button("Apply Target Encoding"):
-                    new_df = df.copy()
[old lines 710-711: not legible in this view]
-                    new_df[col] = new_df[col].map(target_mean)
-                    update_cleaned_data(new_df)
-                    st.rerun()  # Force re-run after apply
[old lines 715-722: not legible in this view]
+    enhance_section_title("Bulk Actions", "๐")
+    with st.expander("๐ Bulk Actions"):
+        bulk_action = st.selectbox("Select Bulk Action", [
+            "Auto-Clean Common Issues",
+            "Drop All Missing Values",
+            "Fill Missing Values",
+            "One-Hot Encode All Categorical Columns",
+            "Min-Max Scaling",
+            "Remove Outliers",
+            "Tokenize Text Columns",
+            "Vectorize Text Columns (TF-IDF)",
+            "Extract Date Features",
+            "Target Encoding",
+            "Principal Component Analysis (PCA)"
+        ])
+
+        if bulk_action == "Auto-Clean Common Issues":
+            if st.button("Apply Auto-Clean"):
+                new_df = df.copy()
+                new_df = new_df.dropna(axis=1, how='all')  # Remove empty cols
+                new_df = new_df.convert_dtypes()  # Better type inference
+                text_cols = new_df.select_dtypes(include='object').columns
+                new_df[text_cols] = new_df[text_cols].apply(lambda x: x.str.strip())
+                update_cleaned_data(new_df)
+                st.rerun()  # Force re-run after apply
+
+        if bulk_action == "Drop All Missing Values":
+            if st.button("Apply Drop All Missing"):
+                new_df = df.copy()
+                new_df = new_df.dropna()  # Drop rows with any missing values
+                update_cleaned_data(new_df)
+                st.rerun()  # Force re-run after apply
+
+        if bulk_action == "Fill Missing Values":
+            fill_value = st.text_input("Fill Value (e.g., 0, mean, median)")
+            if st.button("Apply Fill Missing"):
+                new_df = df.copy()
+                if fill_value.lower() == "mean":
+                    new_df = new_df.fillna(new_df.mean())
+                elif fill_value.lower() == "median":
+                    new_df = new_df.fillna(new_df.median())
+                else:
+                    new_df = new_df.fillna(fill_value)
+                update_cleaned_data(new_df)
+                st.rerun()  # Force re-run after apply
+
+        if bulk_action == "One-Hot Encode All Categorical Columns":
+            if st.button("Apply One-Hot Encoding"):
+                new_df = df.copy()
+                categorical_cols = new_df.select_dtypes(include='object').columns
+                new_df = pd.get_dummies(new_df, columns=categorical_cols, drop_first=True)
+                update_cleaned_data(new_df)
+                st.rerun()  # Force re-run after apply
+
+        if bulk_action == "Min-Max Scaling":
+            if st.button("Apply Min-Max Scaling"):
+                new_df = df.copy()
+                scaler = MinMaxScaler()
+                numerical_cols = new_df.select_dtypes(include=np.number).columns
+                new_df[numerical_cols] = scaler.fit_transform(new_df[numerical_cols])
+                update_cleaned_data(new_df)
+                st.rerun()  # Force re-run after apply
+
+        if bulk_action == "Remove Outliers":
+            if st.button("Apply Remove Outliers"):
+                new_df = df.copy()
+                z_scores = np.abs(stats.zscore(new_df.select_dtypes(include=np.number)))
+                new_df = new_df[(z_scores < 3).all(axis=1)]  # Remove rows with z-score > 3
+                update_cleaned_data(new_df)
+                st.rerun()  # Force re-run after apply
+
+        if bulk_action == "Tokenize Text Columns":
+            text_cols = st.multiselect("Select text columns to tokenize", df.select_dtypes(include='object').columns)
+            if text_cols:
+                if st.button("Apply Tokenization"):
+                    tokenizer = Tokenizer()
+                    new_df = df.copy()
+                    for col in text_cols:
+                        tokenizer.fit_on_texts(new_df[col])
+                        new_df[col] = tokenizer.texts_to_sequences(new_df[col])
+                    update_cleaned_data(new_df)
+                    st.rerun()  # Force re-run after apply
+
+        if bulk_action == "Vectorize Text Columns (TF-IDF)":
+            text_cols = st.multiselect("Select text columns to vectorize", df.select_dtypes(include='object').columns)
+            if text_cols:
+                if st.button("Apply TF-IDF Vectorization"):
+                    tfidf = TfidfVectorizer()
+                    new_df = df.copy()
+                    for col in text_cols:
+                        new_col = tfidf.fit_transform(new_df[col]).toarray()
+                        new_df = new_df.drop(columns=[col])
+                        new_df = new_df.join(pd.DataFrame(new_col, columns=[f'{col}_{i}' for i in range(new_col.shape[1])]))
+                    update_cleaned_data(new_df)
+                    st.rerun()  # Force re-run after apply
+
+        if bulk_action == "Extract Date Features":
+            date_cols = st.multiselect("Select date columns to extract features from", df.select_dtypes(include='datetime').columns)
+            if date_cols:
+                if st.button("Apply Date Feature Extraction"):
+                    new_df = df.copy()
+                    for col in date_cols:
+                        new_df[f'{col}_year'] = new_df[col].dt.year
+                        new_df[f'{col}_month'] = new_df[col].dt.month
+                        new_df[f'{col}_day'] = new_df[col].dt.day
+                        new_df[f'{col}_weekday'] = new_df[col].dt.weekday
+                        new_df[f'{col}_hour'] = new_df[col].dt.hour
+                    update_cleaned_data(new_df)
+                    st.rerun()  # Force re-run after apply
+
+        if bulk_action == "Target Encoding":
+            target_col = st.selectbox("Select target column", df.columns)
+            cat_cols = st.multiselect("Select categorical columns to encode", df.select_dtypes(include='object').columns)
+            if cat_cols:
+                if st.button("Apply Target Encoding"):
+                    new_df = df.copy()
+                    for col in cat_cols:
+                        target_mean = new_df.groupby(col)[target_col].mean()
+                        new_df[col] = new_df[col].map(target_mean)
+                    update_cleaned_data(new_df)
+                    st.rerun()  # Force re-run after apply
+
+        if bulk_action == "Principal Component Analysis (PCA)":
+            n_components = st.slider("Number of components", min_value=1, max_value=min(df.shape[1], 10), value=2)
+            if st.button("Apply PCA"):
+                new_df = df.copy()
+                pca = PCA(n_components=n_components)
+                pca_result = pca.fit_transform(new_df.select_dtypes(include=np.number))
+                new_df = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(n_components)])


     # --------------------------
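As a point of reference for the "Target Encoding" branch added above, a minimal sketch (separate from app.py, with an invented toy frame and column names) of the groupby-mean-then-map replacement it performs:

import pandas as pd

toy = pd.DataFrame({
    "city": ["A", "A", "B", "B", "B"],   # categorical column to encode
    "price": [10, 20, 30, 40, 50],       # target column
})

# Mean of the target within each category: A -> 15.0, B -> 40.0
target_mean = toy.groupby("city")["price"].mean()

# Each category value is replaced by the mean target of its group
toy["city"] = toy["city"].map(target_mean)
# toy["city"] is now [15.0, 15.0, 40.0, 40.0, 40.0]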
@@ -1360,110 +1356,110 @@ elif app_mode == "Model Training":
     num_trials = st.number_input("Number of Trials", 1, 100, 10, help="Number of trials for hyperparameter search.")

     # ----- [5. Training & Monitoring] -----
-    st.subheader("🎯 Training Configuration")
-
-    import shap  # Ensure SHAP is installed: pip install shap
-
-    class LiveMetrics(Callback):
[old lines 1368-1373: not legible in this view]
-        def update_chart(self):
-            df = pd.DataFrame(st.session_state.metrics)
-            fig = px.line(df, y=['loss', 'val_loss'], title="Training Progress")
-            loss_chart.plotly_chart(fig)
-
-    if st.button("🚀 Start Training"):
-        try:
-            model = tf.keras.Sequential()
-
-            # Add layers with regularization
-            for layer in st.session_state.layers:
-                layer_class = {
-                    "Dense": Dense,
-                    "Conv2D": Conv2D,
-                    "LSTM": LSTM
-                }[layer['type']]
-
-                # Add regularization
-                if l2_reg > 0:
-                    layer['kernel_regularizer'] = tf.keras.regularizers.l2(l2_reg)
-
-                # Add
[old lines 1398-1438: not legible in this view]
-    export_format = st.radio("Format", [
-        "TensorFlow SavedModel",
-        "HDF5",
-        "ONNX"
-    ])
[old lines 1446-1463: not legible in this view]
+    st.subheader("🎯 Training Configuration")
+
+    import shap  # Ensure SHAP is installed: pip install shap
+
+    class LiveMetrics(Callback):
+        def on_epoch_end(self, epoch, logs=None):
+            if 'metrics' not in st.session_state:
+                st.session_state.metrics = []
+            st.session_state.metrics.append(logs)
+            self.update_chart()
+
+        def update_chart(self):
+            df = pd.DataFrame(st.session_state.metrics)
+            fig = px.line(df, y=['loss', 'val_loss'], title="Training Progress")
+            loss_chart.plotly_chart(fig)
+
+    if st.button("🚀 Start Training"):
+        try:
+            model = tf.keras.Sequential()
+
+            # Add layers with regularization
+            for layer in st.session_state.layers:
+                layer_class = {
+                    "Dense": Dense,
+                    "Conv2D": Conv2D,
+                    "LSTM": LSTM
+                }[layer['type']]
+
+                # Add regularization
+                if l2_reg > 0:
+                    layer['kernel_regularizer'] = tf.keras.regularizers.l2(l2_reg)
+
+                model.add(layer_class(**layer))
+
+                # Add batch norm after each layer
+                if batch_norm:
+                    model.add(BatchNormalization())
+
+                # Add global dropout
+                if dropout > 0:
+                    model.add(Dropout(dropout))
+
+            model.compile(
+                optimizer=optimizer,
+                loss=loss,
+                metrics=metrics
+            )
+
+            # Show model summary
+            st.subheader("Model Architecture")
+            with tempfile.NamedTemporaryFile(suffix='.png') as tmp:
+                plot_model(model, to_file=tmp.name, show_shapes=True)
+                st.image(tmp.name)
+
+            # Start training
+            st.subheader("Live Training Metrics")
+            loss_chart = st.empty()
+            model.fit(X_train, y_train,
+                      epochs=10,
+                      validation_data=(X_val, y_val),
+                      callbacks=[LiveMetrics()])
+
+            # SHAP explanations
+            st.subheader("SHAP Explanations")
+            explainer = shap.KernelExplainer(model.predict, X_train[:100])
+            shap_values = explainer.shap_values(X_train[:100])
+            shap.summary_plot(shap_values, X_train[:100], plot_type="bar")
+            st.pyplot(bbox_inches='tight')
+
+        except Exception as e:
+            st.error(f"Training failed: {str(e)}")


+
+    # ----- [6. Export & Deployment] -----
+    st.subheader("💾 Export Model")
+
+    export_format = st.radio("Format", [
+        "TensorFlow SavedModel",
+        "HDF5",
+        "ONNX"
+    ])
+
+    if st.button("Export"):
+        with tempfile.NamedTemporaryFile(delete=False) as tmp:
+            if export_format == "HDF5":
+                model.save(tmp.name + '.h5')
+            elif export_format == "ONNX":
+                import tf2onnx
+                model_proto, _ = tf2onnx.convert.from_keras(model)
+                with open(tmp.name + '.onnx', 'wb') as f:
+                    f.write(model_proto.SerializeToString())
+            else:
+                tf.saved_model.save(model, tmp.name)
+
+            with open(tmp.name, 'rb') as f:
+                st.download_button(
+                    "Download Model",
+                    f.read(),
+                    file_name=f"model.{'h5' if export_format=='HDF5' else 'onnx'}"
+                )

 # Predictions Section (Fixed)
-
+elif app_mode == "Predictions":
     st.title("🔮 Predictive Analytics - Informed Business Decisions")
     st.warning("Note: SHAP explanations currently work best with tree-based models like Random Forest")

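For orientation, a minimal self-contained sketch of the Keras callback hook that the LiveMetrics class above overrides, with the Streamlit session state and chart placeholder swapped for a plain list; the model, data, and HistoryCollector name are invented for illustration:

import numpy as np
import tensorflow as tf

class HistoryCollector(tf.keras.callbacks.Callback):
    """Collect the metrics dict Keras passes at the end of every epoch."""
    def __init__(self):
        super().__init__()
        self.records = []

    def on_epoch_end(self, epoch, logs=None):
        # logs typically holds 'loss' and, when validation data is given, 'val_loss'
        self.records.append(dict(logs or {}))

# Tiny throwaway regression model and random data, just to exercise the callback
X = np.random.rand(64, 4).astype("float32")
y = np.random.rand(64, 1).astype("float32")
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(optimizer="adam", loss="mse")

collector = HistoryCollector()
model.fit(X, y, epochs=3, validation_split=0.25, callbacks=[collector], verbose=0)
# collector.records now holds one dict per epoch and can be plotted the same way
# the app plots st.session_state.metrics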