Spaces:

LeonceNsh
/

baho

Sleeping

App Files Files Community

LeonceNsh committed on Dec 31, 2024

Commit

bed9d39

verified ·

1 Parent(s): 70b6418

Update app.py

Browse files

Files changed (1) hide show

app.py +240 -39

app.py CHANGED Viewed

@@ -1,33 +1,114 @@
 import gradio as gr
 import pandas as pd
 import duckdb
-from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
-from sklearn.feature_selection import SelectKBest, f_regression
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-# Function to load data from a Parquet file into a DuckDB in-memory database
-def load_data(parquet_file):
     con = duckdb.connect(database=':memory:')
-    con.execute(f"CREATE TABLE data AS SELECT * FROM read_parquet('{parquet_file}')")
     df = con.execute("SELECT * FROM data").fetchdf()
     return df
-# Function to preprocess data and perform PCA
-def preprocess_and_pca(df, target_column, n_components=5):
-    # Drop non-numeric columns
-    X = df.select_dtypes(include=[float, int])
-    y = df[target_column]
-    # Replace infinity values with NaN
     X.replace([np.inf, -np.inf], np.nan, inplace=True)
-    # Handle missing values by imputing with the median
     X = X.fillna(X.median())
     y = y.fillna(y.median())
     # Standardize the data
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
@@ -36,53 +117,173 @@ def preprocess_and_pca(df, target_column, n_components=5):
     pca = PCA(n_components=n_components)
     X_pca = pca.fit_transform(X_scaled)
-    return X_pca, y
-# Function to visualize the PCA components
-def visualize_pca(X_pca, y):
-    # Visualize the first two principal components
     plt.figure(figsize=(10, 6))
-    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
     plt.xlabel('Principal Component 1')
     plt.ylabel('Principal Component 2')
     plt.title('PCA - First Two Principal Components')
-    plt.colorbar(label='Median Income Household')
-    plt.savefig('pca_scatter.png')
     plt.close()
-    # Create a DataFrame with the first few principal components for pair plot
-    pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])
-    pca_df['Median_Income_Household'] = y
-    # Pair plot of the first few principal components
-    sns.pairplot(pca_df, vars=[f'PC{i+1}' for i in range(5)], hue='Median_Income_Household', palette='viridis')
     plt.suptitle('Pair Plot of Principal Components', y=1.02)
-    plt.savefig('pca_pairplot.png')
     plt.close()
-    return 'pca_scatter.png', 'pca_pairplot.png'
-# Gradio interface function
-def gradio_interface(target_column):
     df = load_data('df_usa_health_features.parquet')
-    X_pca, y = preprocess_and_pca(df, target_column)
-    scatter_plot, pair_plot = visualize_pca(X_pca, y)
-    return scatter_plot, pair_plot
-# Create Gradio interface
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.inputs.Textbox(label="Target Column")
     ],
     outputs=[
         gr.outputs.Image(type="file", label="PCA Scatter Plot"),
-        gr.outputs.Image(type="file", label="PCA Pair Plot")
     ],
     title="PCA Visualization with DuckDB and Gradio",
-    description="Specify the target column to visualize PCA components from the df_usa_health_features.parquet file."
 )
-# Launch the Gradio app
 if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
 import pandas as pd
 import duckdb
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+from sklearn.feature_selection import SelectKBest, f_regression
+# ---------------------------------------------------------------------------
+# 1. LOADING DATA
+# ---------------------------------------------------------------------------
def load_data(parquet_file: str) -> pd.DataFrame:
    """
    Load a Parquet file through an in-memory DuckDB connection and
    return its contents as a pandas DataFrame.
    Parameters:
    -----------
    parquet_file : str
        The path to the Parquet file to be loaded.
    Returns:
    --------
    df : pd.DataFrame
        Pandas DataFrame containing all columns from the Parquet file.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a query parameter instead of splicing it into the
        # SQL text, so a quote character in the path cannot break or alter
        # the statement. The result is fetched directly; no intermediate
        # table is needed for a plain SELECT *.
        df = con.execute(
            "SELECT * FROM read_parquet(?)",
            [parquet_file],
        ).fetchdf()
    finally:
        # Release the connection even when the read fails.
        con.close()
    return df
+# ---------------------------------------------------------------------------
+# 2. DATA PREPROCESSING & OPTIONAL FEATURE SELECTION
+# ---------------------------------------------------------------------------
def preprocess_data(
    df: pd.DataFrame,
    target_column: str,
    k_best: int = None
):
    """
    Perform data cleaning and (optionally) feature selection.
    The input DataFrame is not modified.
    Parameters:
    -----------
    df : pd.DataFrame
        The input DataFrame.
    target_column : str
        The name of the target variable in df.
    k_best : int, optional
        If provided and positive, use SelectKBest with f_regression to
        select the top k features. If None (or not positive, or not smaller
        than the number of available features), no feature selection is
        performed.
    Returns:
    --------
    X : pd.DataFrame
        Numeric feature columns (target excluded), with infinities and
        missing values replaced by the column median, optionally reduced
        to the k best features.
    y : pd.Series
        The target column with infinities and missing values replaced by
        its median.
    Raises:
    -------
    KeyError
        If target_column is not a column of df.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in the data.")

    # Clean the target: infinities become NaN, then NaN becomes the median.
    y = df[target_column].replace([np.inf, -np.inf], np.nan)
    y = y.fillna(y.median())

    # Keep numeric columns only, and exclude the target itself: leaving it in
    # would leak the answer into both the feature selection and the PCA.
    X = df.select_dtypes(include=[float, int]).drop(
        columns=[target_column], errors='ignore'
    )
    # Replace infinities with NaN
    X = X.replace([np.inf, -np.inf], np.nan)
    # A column with no finite value has no median, so it would stay NaN after
    # imputation; drop such columns up front.
    X = X.dropna(axis=1, how='all')
    # Impute missing values with the median
    X = X.fillna(X.median())

    # Optional: Use SelectKBest to filter down to top k features
    if k_best is not None:
        k_best = int(k_best)  # UI sliders may deliver floats
        if 0 < k_best < X.shape[1]:
            selector = SelectKBest(score_func=f_regression, k=k_best)
            selector.fit(X, y)
            # Index back into the DataFrame so the column names are kept.
            X = X.iloc[:, selector.get_support(indices=True)]
    return X, y
+# ---------------------------------------------------------------------------
+# 3. PCA TRANSFORMATION
+# ---------------------------------------------------------------------------
def apply_pca(
    X: pd.DataFrame,
    n_components: int = 5
):
    """
    Standardize the data and apply PCA.
    Parameters:
    -----------
    X : pd.DataFrame
        The preprocessed numeric data.
    n_components : int
        Number of principal components to retain. The value is converted
        to an integer and clamped to the range
        [1, min(number of rows, number of columns)].
    Returns:
    --------
    X_pca : np.ndarray
        PCA-transformed dataset.
    pca : PCA
        The trained PCA model (can be used for explained_variance_ etc.).
    Raises:
    -------
    ValueError
        If X has no rows or no columns.
    """
    n_samples, n_features = X.shape
    max_components = min(n_samples, n_features)
    if max_components < 1:
        raise ValueError("PCA needs at least one sample and one feature.")
    # The requested count can arrive as a float from the UI and can exceed
    # what the data supports (e.g. after feature selection); PCA rejects both,
    # so coerce to int and clamp to the valid range.
    n_components = max(1, min(int(n_components), max_components))

    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, pca
+# ---------------------------------------------------------------------------
+# 4. VISUALIZATION
+# ---------------------------------------------------------------------------
def visualize_pca(
    X_pca: np.ndarray,
    y: pd.Series,
    pca,
    target_label: str = 'Median Income Household'
):
    """
    Generate visualizations:
      1) Scatter plot of first two principal components.
      2) Pair plot of the first few principal components (up to 5).
      3) Scree plot showing the explained variance of each component.
    Parameters:
    -----------
    X_pca : np.ndarray
        PCA-transformed data (rows x components); at least two components.
    y : pd.Series
        Target values, used for coloring points in plots.
    pca : PCA
        The fitted PCA model (for explained variance, etc.).
    target_label : str
        Label for the colorbar / legend representing the target variable.
    Returns:
    --------
    scatter_plot_file : str
        Filename for the PCA scatter plot.
    pair_plot_file : str
        Filename for the PCA pair plot.
    scree_plot_file : str
        Filename for the Scree plot.
    Raises:
    -------
    ValueError
        If X_pca does not have at least two component columns.
    """
    if X_pca.ndim != 2 or X_pca.shape[1] < 2:
        raise ValueError(
            "At least two principal components are required for plotting."
        )
    target_values = np.asarray(y)

    # 4.a: Scatter plot for the first two components
    scatter_plot_file = 'pca_scatter.png'
    plt.figure(figsize=(10, 6))
    plt.scatter(
        X_pca[:, 0],
        X_pca[:, 1],
        c=target_values,
        cmap='viridis',
        edgecolor='k',
        s=50
    )
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA - First Two Principal Components')
    cbar = plt.colorbar()
    cbar.set_label(target_label)
    plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
    plt.close()

    # 4.b: Pair plot of the first few components
    num_components = min(X_pca.shape[1], 5)
    pair_plot_file = 'pca_pairplot.png'
    pc_names = [f'PC{i+1}' for i in range(num_components)]
    pca_df = pd.DataFrame(X_pca[:, :num_components], columns=pc_names)
    # The pair plot treats each distinct hue value as its own group. With a
    # continuous target that means one group (and one legend entry) per row,
    # so bin the target into quartiles when it has many distinct values.
    hue_values = pd.Series(target_values)
    hue_order = None
    if pd.api.types.is_numeric_dtype(hue_values) and hue_values.nunique() > 10:
        binned = pd.qcut(hue_values, q=4, duplicates='drop')
        hue_order = [str(interval) for interval in binned.cat.categories]
        hue_values = binned.astype(str)
    pca_df[target_label] = hue_values.values  # Add target for hue in pairplot
    sns.pairplot(
        pca_df,
        vars=pc_names,
        hue=target_label,
        hue_order=hue_order,
        palette='viridis'
    )
    plt.suptitle('Pair Plot of Principal Components', y=1.02)
    plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
    plt.close()

    # 4.c: Scree plot for explained variance
    scree_plot_file = 'pca_scree.png'
    component_numbers = range(1, pca.n_components_ + 1)
    plt.figure(figsize=(8, 5))
    plt.bar(
        component_numbers,
        pca.explained_variance_ratio_,
        alpha=0.7,
        color='red'
    )
    plt.xlabel('Principal Components')
    plt.ylabel('Variance Explained')
    plt.title('Scree Plot')
    plt.xticks(component_numbers)
    plt.savefig(scree_plot_file, dpi=100, bbox_inches='tight')
    plt.close()
    return scatter_plot_file, pair_plot_file, scree_plot_file
+# ---------------------------------------------------------------------------
+# 5. GRADIO INTERFACE FUNCTION
+# ---------------------------------------------------------------------------
def gradio_interface(
    target_column: str,
    n_components: int = 5,
    k_best: int = 0
):
    """
    Main interface function that:
     - Loads the Parquet data
     - Preprocesses the data (optionally using SelectKBest feature selection)
     - Applies PCA
     - Visualizes the PCA results
     - Returns file paths to the generated plots.
    Parameters:
    -----------
    target_column : str
        The name of the target column in the DataFrame.
    n_components : int
        Number of PCA components to use.
    k_best : int
        If > 0, select the top k_best features before PCA.
        If 0 or None, no feature selection is performed.
    Returns:
    --------
    scatter_plot : str
        File path to the PCA scatter plot.
    pair_plot : str
        File path to the pair plot of principal components.
    scree_plot : str
        File path to the scree plot of explained variance.
    """
    # UI sliders may deliver floats, and k_best is documented to accept None;
    # normalise both to plain ints before any comparison.
    n_components = int(n_components)
    k_best = int(k_best) if k_best else 0

    # Load data
    df = load_data('df_usa_health_features.parquet')
    # Preprocess data (optionally do feature selection)
    X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)
    # Apply PCA
    X_pca, pca_model = apply_pca(X, n_components)
    # Generate and return visualizations
    scatter_plot, pair_plot, scree_plot = visualize_pca(
        X_pca, y, pca_model, target_label=target_column
    )
    return scatter_plot, pair_plot, scree_plot
+# ---------------------------------------------------------------------------
+# 6. BUILDING THE GRADIO APP
+# ---------------------------------------------------------------------------
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
+        gr.inputs.Textbox(label="Target Column", default="Median_Income_Household"),
+        gr.inputs.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, default=5),
+        gr.inputs.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, default=0)
     ],
     outputs=[
         gr.outputs.Image(type="file", label="PCA Scatter Plot"),
+        gr.outputs.Image(type="file", label="PCA Pair Plot"),
+        gr.outputs.Image(type="file", label="Scree Plot")
     ],
     title="PCA Visualization with DuckDB and Gradio",
+    description=(
+        "Load data from a Parquet file, optionally perform feature selection, "
+        "run PCA, and visualize the results. "
+        "1) Enter the target column name (e.g., 'Median_Income_Household'). "
+        "2) Choose the number of PCA components. "
+        "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
+    )
 )
+# ---------------------------------------------------------------------------
+# 7. LAUNCH THE APPLICATION
+# ---------------------------------------------------------------------------
 if __name__ == "__main__":
+    iface.launch()