# baho / app.py
# LeonceNsh's picture
# Update app.py
# d18faab verified
import gradio as gr
import pandas as pd
import duckdb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
def load_data(parquet_file: str) -> pd.DataFrame:
    """Read a Parquet file into a pandas DataFrame through DuckDB.

    Args:
        parquet_file: Path of the Parquet file to read.

    Returns:
        Every row and column of the file as a DataFrame.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a query parameter instead of formatting it into
        # the SQL text, so paths containing quotes cannot break the statement.
        # The file is queried directly; no intermediate table is needed.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [parquet_file]
        ).fetchdf()
    finally:
        # Release the in-memory database even when the read fails.
        con.close()
def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
    """Split a DataFrame into a cleaned numeric feature matrix and a target.

    Infinite values are treated as missing, and missing values are filled
    with the median of their column. The target column is removed from the
    feature matrix so it cannot leak into feature selection or PCA.

    Args:
        df: Input data; it is not modified.
        target_column: Name of the column to use as the target.
        k_best: If given and smaller than the number of features, keep only
            the ``k_best`` features ranked highest by ``f_regression``.

    Returns:
        A tuple ``(X, y)`` with the feature DataFrame and the target Series.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in the data")

    # Clean the target the same way as the features (inf -> NaN -> median).
    y = df[target_column].replace([np.inf, -np.inf], np.nan)
    y = y.fillna(y.median())

    # Numeric features only; a numeric target would otherwise be selected as
    # a feature of itself, so drop it explicitly.
    X = df.select_dtypes(include=[float, int]).drop(
        columns=[target_column], errors="ignore"
    )
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())

    # Optionally select top k features
    if k_best is not None and k_best < X.shape[1]:
        selector = SelectKBest(score_func=f_regression, k=k_best)
        selector.fit(X, y)
        # Index by position so the result stays a DataFrame with column names.
        X = X.iloc[:, selector.get_support(indices=True)]
    return X, y
def apply_pca(X: pd.DataFrame, n_components: int = 5):
    """Standardize the features and project them onto principal components.

    Args:
        X: Feature matrix with samples in rows and features in columns.
        n_components: Requested number of components. A count larger than
            ``min(n_samples, n_features)`` is reduced to that limit, because
            PCA cannot produce more components than that. Values that are not
            a count of at least 1 (e.g. a variance fraction or ``None``) are
            handed to ``PCA`` unchanged.

    Returns:
        A tuple ``(X_pca, pca)`` with the transformed data and the fitted
        ``PCA`` object.
    """
    n_samples, n_features = X.shape
    max_components = min(n_samples, n_features)

    is_count = (
        isinstance(n_components, (int, float, np.integer, np.floating))
        and n_components >= 1
    )
    if is_count and max_components >= 1:
        # int() also normalizes integral floats such as 5.0, which PCA would
        # otherwise reject as an invalid component count.
        n_components = min(int(n_components), max_components)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, pca
def _save_pca_scatter(X_pca, y, target_label, out_file):
    """Save a PC1-vs-PC2 scatter plot, coloured by the target, to out_file."""
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.title('PCA - First Two Principal Components')
        cbar = plt.colorbar()
        cbar.set_label(target_label)
        plt.savefig(out_file, dpi=100, bbox_inches='tight')
    finally:
        # Close the figure even if plotting or saving fails, so a failing
        # request does not leave an open figure behind.
        plt.close(fig)


def _save_pca_pairplot(X_pca, y, target_label, out_file, max_components=5):
    """Save a pair plot of the first few principal components to out_file."""
    num_components = min(X_pca.shape[1], max_components)
    pc_names = [f'PC{i+1}' for i in range(num_components)]
    pca_df = pd.DataFrame(X_pca[:, :num_components], columns=pc_names)
    pca_df[target_label] = y.values
    sns.pairplot(pca_df, vars=pc_names, hue=target_label, palette='viridis')
    # pairplot draws on a figure of its own; pick it up as the current figure
    # so it can be titled, saved and closed explicitly.
    fig = plt.gcf()
    try:
        fig.suptitle('Pair Plot of Principal Components', y=1.02)
        fig.savefig(out_file, dpi=100, bbox_inches='tight')
    finally:
        plt.close(fig)


def _save_pca_scree(pca, out_file):
    """Save a bar chart of the explained-variance ratios to out_file."""
    fig = plt.figure(figsize=(8, 5))
    try:
        component_numbers = range(1, pca.n_components_ + 1)
        plt.bar(component_numbers, pca.explained_variance_ratio_, alpha=0.7, color='red')
        plt.xlabel('Principal Components')
        plt.ylabel('Variance Explained')
        plt.title('Scree Plot')
        plt.xticks(component_numbers)
        plt.savefig(out_file, dpi=100, bbox_inches='tight')
    finally:
        plt.close(fig)


def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str):
    """Render the PCA result as three PNG files in the working directory.

    Args:
        X_pca: Transformed data, one column per principal component.
        y: Target values, one per row of ``X_pca``; used to colour the points.
        pca: Fitted PCA object providing ``n_components_`` and
            ``explained_variance_ratio_``.
        target_label: Name of the target, used as colour-bar and hue label.

    Returns:
        A tuple with the file names of the scatter plot, the pair plot and
        the scree plot, in that order.

    Raises:
        IndexError: If ``X_pca`` has fewer than two components, since the
            scatter plot needs PC1 and PC2.
    """
    if X_pca.shape[1] < 2:
        raise IndexError(
            "visualize_pca needs at least 2 principal components, "
            f"got {X_pca.shape[1]}"
        )

    # 1) Scatter plot of PC1 vs PC2
    scatter_plot_file = 'pca_scatter.png'
    _save_pca_scatter(X_pca, y, target_label, scatter_plot_file)

    # 2) Pair plot of the first few components
    pair_plot_file = 'pca_pairplot.png'
    _save_pca_pairplot(X_pca, y, target_label, pair_plot_file)

    # 3) Scree plot
    scree_plot_file = 'pca_scree.png'
    _save_pca_scree(pca, scree_plot_file)

    return scatter_plot_file, pair_plot_file, scree_plot_file
def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
    """Run the load -> preprocess -> PCA -> plot pipeline for the Gradio UI.

    Args:
        target_column: Name of the target column in the dataset.
        n_components: Number of principal components to compute.
        k_best: Number of features to keep with SelectKBest; 0 (or less)
            skips feature selection.

    Returns:
        A tuple with the file names of the scatter plot, the pair plot and
        the scree plot, in that order.
    """
    # Slider values may arrive as floats; the feature selector and PCA need
    # integer counts.
    n_components = int(n_components)
    k_best = int(k_best)

    df = load_data('df_usa_health_features.parquet')
    X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)
    X_pca, pca_model = apply_pca(X, n_components)
    scatter_plot, pair_plot, scree_plot = visualize_pca(
        X_pca, y, pca_model, target_label=target_column
    )
    return scatter_plot, pair_plot, scree_plot
# Input widgets, listed in the positional order of gradio_interface's
# parameters: target column, number of components, k for SelectKBest.
_input_widgets = [
    gr.Textbox(label="Target Column", value="Median_Income_Household"),
    gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
    gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0),
]

# The callback returns paths of PNG files on disk, so every image output is
# declared with type="filepath".
_output_images = [
    gr.Image(type="filepath", label=plot_label)
    for plot_label in ("PCA Scatter Plot", "PCA Pair Plot", "Scree Plot")
]

# Help text shown under the title of the app.
_description = (
    "Load data from a Parquet file, optionally perform feature selection, "
    "run PCA, and visualize the results.\n"
    "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
    "2) Choose the number of PCA components.\n"
    "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
)

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_input_widgets,
    outputs=_output_images,
    title="PCA Visualization with DuckDB and Gradio",
    description=_description,
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()