Update app.py
app.py
CHANGED
@@ -9,162 +9,39 @@ from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 from sklearn.feature_selection import SelectKBest, f_regression
 
-# ---------------------------------------------------------------------------
-# 1. LOADING DATA
-# ---------------------------------------------------------------------------
 def load_data(parquet_file: str) -> pd.DataFrame:
-    """
-    Load data from a Parquet file into a DuckDB in-memory database,
-    and return the result as a pandas DataFrame.
-
-    Parameters:
-    -----------
-    parquet_file : str
-        The path to the Parquet file to be loaded.
-
-    Returns:
-    --------
-    df : pd.DataFrame
-        Pandas DataFrame containing all columns from the Parquet file.
-    """
     con = duckdb.connect(database=':memory:')
     query = f"""
         CREATE TABLE data AS SELECT * FROM read_parquet('{parquet_file}')
     """
     con.execute(query)
-
     df = con.execute("SELECT * FROM data").fetchdf()
     return df
 
-
-# 2. DATA PREPROCESSING & OPTIONAL FEATURE SELECTION
-# ---------------------------------------------------------------------------
-def preprocess_data(
-    df: pd.DataFrame,
-    target_column: str,
-    k_best: int = None
-):
-    """
-    Perform data cleaning and (optionally) feature selection.
-
-    Parameters:
-    -----------
-    df : pd.DataFrame
-        The input DataFrame.
-    target_column : str
-        The name of the target variable in df.
-    k_best : int, optional
-        If provided, use SelectKBest with f_regression to select the
-        top k features. If None, no feature selection is performed.
-
-    Returns:
-    --------
-    X : pd.DataFrame
-        Preprocessed and optionally reduced DataFrame (features).
-    y : pd.Series
-        The target column as a Series.
-    """
-    # Separate out the numeric columns for PCA
+def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
     X = df.select_dtypes(include=[float, int]).copy()
     y = df[target_column].copy()
-
-    # Replace infinities with NaN
     X.replace([np.inf, -np.inf], np.nan, inplace=True)
-
-    # Impute missing values with the median
     X = X.fillna(X.median())
     y = y.fillna(y.median())
-
-    # Optional: Use SelectKBest to filter down to top k features
     if k_best is not None and k_best < X.shape[1]:
         selector = SelectKBest(score_func=f_regression, k=k_best)
         X_selected = selector.fit_transform(X, y)
-        # We still want feature names to keep track of the columns
         selected_indices = selector.get_support(indices=True)
         X = X.iloc[:, selected_indices]
-
     return X, y
 
-
-# 3. PCA TRANSFORMATION
-# ---------------------------------------------------------------------------
-def apply_pca(
-    X: pd.DataFrame,
-    n_components: int = 5
-):
-    """
-    Standardize the data and apply PCA.
-
-    Parameters:
-    -----------
-    X : pd.DataFrame
-        The preprocessed numeric data.
-    n_components : int
-        Number of principal components to retain.
-
-    Returns:
-    --------
-    X_pca : np.ndarray
-        PCA-transformed dataset.
-    pca : PCA
-        The trained PCA model (can be used for explained_variance_ etc.).
-    """
-    # Standardize the data
+def apply_pca(X: pd.DataFrame, n_components: int = 5):
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
-
-    # Apply PCA
     pca = PCA(n_components=n_components)
     X_pca = pca.fit_transform(X_scaled)
-
     return X_pca, pca
 
-
-# 4. VISUALIZATION
-# ---------------------------------------------------------------------------
-def visualize_pca(
-    X_pca: np.ndarray,
-    y: pd.Series,
-    pca,
-    target_label: str = 'Median Income Household'
-):
-    """
-    Generate visualizations:
-      1) Scatter plot of first two principal components.
-      2) Pair plot of the first few principal components (up to 5).
-      3) Scree plot showing the explained variance of each component.
-
-    Parameters:
-    -----------
-    X_pca : np.ndarray
-        PCA-transformed data (rows x components).
-    y : pd.Series
-        Target values, used for coloring points in plots.
-    pca : PCA
-        The fitted PCA model (for explained variance, etc.).
-    target_label : str
-        Label for the colorbar representing the target variable.
-
-    Returns:
-    --------
-    scatter_plot_file : str
-        Filename for the PCA scatter plot.
-    pair_plot_file : str
-        Filename for the PCA pair plot.
-    scree_plot_file : str
-        Filename for the Scree plot.
-    """
-    # 4.a: Scatter plot for the first two components
+def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Median Income Household'):
     scatter_plot_file = 'pca_scatter.png'
     plt.figure(figsize=(10, 6))
-    plt.scatter(
-        X_pca[:, 0],
-        X_pca[:, 1],
-        c=y,
-        cmap='viridis',
-        edgecolor='k',
-        s=50
-    )
+    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
     plt.xlabel('Principal Component 1')
     plt.ylabel('Principal Component 2')
     plt.title('PCA - First Two Principal Components')
@@ -173,33 +50,22 @@ def visualize_pca(
     plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
     plt.close()
 
-    #
-    num_components = min(X_pca.shape[1], 5)
+    # Pair plot
     pair_plot_file = 'pca_pairplot.png'
-
-
-
-
-    pca_df
-
-        pca_df,
-        vars=[f'PC{i+1}' for i in range(num_components)],
-        hue=target_label,
-        palette='viridis'
-    )
+    num_components = min(X_pca.shape[1], 5)
+    pca_df = pd.DataFrame(X_pca[:, :num_components],
+                          columns=[f'PC{i+1}' for i in range(num_components)])
+    pca_df[target_label] = y.values
+    sns.pairplot(pca_df, vars=[f'PC{i+1}' for i in range(num_components)],
+                 hue=target_label, palette='viridis')
     plt.suptitle('Pair Plot of Principal Components', y=1.02)
     plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
     plt.close()
 
-    #
+    # Scree plot
     scree_plot_file = 'pca_scree.png'
     plt.figure(figsize=(8, 5))
-    plt.bar(
-        range(1, pca.n_components_ + 1),
-        pca.explained_variance_ratio_,
-        alpha=0.7,
-        color='red'
-    )
+    plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_, alpha=0.7, color='red')
     plt.xlabel('Principal Components')
     plt.ylabel('Variance Explained')
     plt.title('Scree Plot')
@@ -209,81 +75,37 @@ def visualize_pca(
 
     return scatter_plot_file, pair_plot_file, scree_plot_file
 
-
-# 5. GRADIO INTERFACE FUNCTION
-# ---------------------------------------------------------------------------
-def gradio_interface(
-    target_column: str,
-    n_components: int = 5,
-    k_best: int = 0
-):
-    """
-    Main interface function that:
-      - Loads the Parquet data
-      - Preprocesses the data (optionally using SelectKBest feature selection)
-      - Applies PCA
-      - Visualizes the PCA results
-      - Returns file paths to the generated plots.
-
-    Parameters:
-    -----------
-    target_column : str
-        The name of the target column in the DataFrame.
-    n_components : int
-        Number of PCA components to use.
-    k_best : int
-        If > 0, select the top k_best features before PCA.
-        If 0 or None, no feature selection is performed.
-
-    Returns:
-    --------
-    scatter_plot : str
-        File path to the PCA scatter plot.
-    pair_plot : str
-        File path to the pair plot of principal components.
-    scree_plot : str
-        File path to the scree plot of explained variance.
-    """
-    # Load data
+def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
     df = load_data('df_usa_health_features.parquet')
-
-    # Preprocess data (optionally do feature selection)
     X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)
-
-    # Apply PCA
     X_pca, pca_model = apply_pca(X, n_components)
-
-    # Generate and return visualizations
     scatter_plot, pair_plot, scree_plot = visualize_pca(X_pca, y, pca_model, target_label=target_column)
     return scatter_plot, pair_plot, scree_plot
 
-#
-#
-#
+# ------------------------------------------------------------------------------
+# HERE is the updated Gradio interface with direct component calls (no .inputs)
+# ------------------------------------------------------------------------------
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.
-        gr.
-        gr.
+        gr.Textbox(label="Target Column", value="Median_Income_Household"),
+        gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
+        gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0)
     ],
     outputs=[
-        gr.
-        gr.
-        gr.
+        gr.Image(type="file", label="PCA Scatter Plot"),
+        gr.Image(type="file", label="PCA Pair Plot"),
+        gr.Image(type="file", label="Scree Plot")
     ],
     title="PCA Visualization with DuckDB and Gradio",
     description=(
         "Load data from a Parquet file, optionally perform feature selection, "
-        "run PCA, and visualize the results
-        "1) Enter the target column name (e.g., 'Median_Income_Household')
-        "2) Choose the number of PCA components
+        "run PCA, and visualize the results.\n"
+        "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
+        "2) Choose the number of PCA components.\n"
         "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
     )
 )
 
-# ---------------------------------------------------------------------------
-# 7. LAUNCH THE APPLICATION
-# ---------------------------------------------------------------------------
 if __name__ == "__main__":
     iface.launch()
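A note on the new outputs list: whether gr.Image accepts type="file" depends on the Gradio version pinned for this Space; recent releases accept only "numpy", "pil", or "filepath". Since visualize_pca returns file paths, a hedged alternative for current Gradio versions (labels unchanged, only the type value swapped) would be:

    gr.Image(type="filepath", label="PCA Scatter Plot"),
    gr.Image(type="filepath", label="PCA Pair Plot"),
    gr.Image(type="filepath", label="Scree Plot")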
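For a quick check of the pipeline outside the Gradio UI, the functions in app.py can be driven directly. A minimal sketch, assuming df_usa_health_features.parquet is present locally and contains a numeric Median_Income_Household column; the k_best and n_components values below are illustrative, not taken from the commit:

    # Exercise the data pipeline headlessly; importing app builds the Interface
    # but does not launch it (launch is guarded by __main__).
    from app import load_data, preprocess_data, apply_pca, visualize_pca

    df = load_data('df_usa_health_features.parquet')
    X, y = preprocess_data(df, 'Median_Income_Household', k_best=10)
    X_pca, pca_model = apply_pca(X, n_components=5)
    print('Explained variance ratios:', pca_model.explained_variance_ratio_)
    print('Plot files:', visualize_pca(X_pca, y, pca_model, target_label='Median_Income_Household'))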