baho / app.py
LeonceNsh's picture
Update app.py
54ff036 verified
raw
history blame
4.55 kB
import gradio as gr
import pandas as pd
import duckdb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
def load_data(parquet_file: str) -> pd.DataFrame:
    """Read a Parquet file into a pandas DataFrame through in-memory DuckDB.

    Args:
        parquet_file: Path of the Parquet file to read.

    Returns:
        A DataFrame holding every row and column of the file.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a query parameter instead of interpolating it into
        # the SQL text, so a quote character in the path cannot break or
        # alter the statement.
        result = con.execute("SELECT * FROM read_parquet(?)", [parquet_file])
        return result.fetchdf()
    finally:
        # Release the connection even when reading the file fails.
        con.close()
def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
    """Build a cleaned feature matrix and target vector from ``df``.

    Features are the float/int columns of ``df`` other than the target.
    Infinite values are treated as missing and missing values are replaced
    with the column median. The input frame is not modified.

    Args:
        df: Source data.
        target_column: Name of the column to use as the target.
        k_best: When given and smaller than the number of features, keep only
            the ``k_best`` features ranked highest by ``f_regression``.

    Returns:
        A ``(X, y)`` tuple: the feature DataFrame and the target Series.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    y = df[target_column].copy()

    X = df.select_dtypes(include=[float, int])
    # The target must not be one of its own predictors: leaving it in would
    # leak it into feature selection and into the principal components.
    X = X.drop(columns=[target_column], errors='ignore')

    X = X.replace([np.inf, -np.inf], np.nan)
    # A column with no finite value has no median, so the fill below would
    # leave it as NaN; drop such columns instead.
    X = X.dropna(axis=1, how='all')
    X = X.fillna(X.median())

    # Give the target the same inf -> NaN -> median treatment as the features.
    y = y.replace([np.inf, -np.inf], np.nan)
    y = y.fillna(y.median())

    if k_best is not None and k_best < X.shape[1]:
        selector = SelectKBest(score_func=f_regression, k=k_best)
        selector.fit(X, y)
        # Index the original frame so the column names are preserved.
        X = X.iloc[:, selector.get_support(indices=True)]

    return X, y
def apply_pca(X: pd.DataFrame, n_components: int = 5):
    """Standardize ``X`` and project it onto its principal components.

    Args:
        X: Numeric feature matrix.
        n_components: Number of components requested. A numeric count of 1 or
            more is truncated to an integer and capped at
            ``min(n_samples, n_features)``; any other value is handed to
            ``PCA`` unchanged.

    Returns:
        A ``(X_pca, pca)`` tuple: the transformed array and the fitted PCA
        model.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    is_count = (
        isinstance(n_components, (int, float, np.integer, np.floating))
        and n_components >= 1
    )
    if is_count:
        # PCA rejects a count above min(n_samples, n_features) and rejects
        # integral floats such as 5.0, so normalise the request up front.
        n_components = min(int(n_components), *X_scaled.shape)

    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, pca
def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca,
                  target_label: str = 'Median Income Household',
                  hue_bins: int = 5):
    """Render scatter, pair and scree plots for a fitted PCA as PNG files.

    The files are written to the current working directory under fixed names
    and overwrite any earlier output.

    Args:
        X_pca: Transformed data; the scatter plot reads columns 0 and 1.
        y: Target values, one per row of ``X_pca``, used for colouring.
        pca: Fitted PCA model providing ``n_components_`` and
            ``explained_variance_ratio_``.
        target_label: Label for the colour bar and for the pair-plot hue.
        hue_bins: Number of quantile groups a numeric target is split into
            for the pair plot when it has more distinct values than this.
            Pass ``0`` or ``None`` to use the raw target values as hue.

    Returns:
        A tuple of three file names: scatter plot, pair plot, scree plot.
    """
    # Scatter plot of the first two components, coloured by the target.
    scatter_plot_file = 'pca_scatter.png'
    plt.figure(figsize=(10, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA - First Two Principal Components')
    cbar = plt.colorbar()
    cbar.set_label(target_label)
    plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
    plt.close()

    # Pair plot of up to the first five components.
    pair_plot_file = 'pca_pairplot.png'
    num_components = min(X_pca.shape[1], 5)
    pc_names = [f'PC{i + 1}' for i in range(num_components)]
    pca_df = pd.DataFrame(X_pca[:, :num_components], columns=pc_names)

    hue_values = pd.Series(np.asarray(y))
    if (hue_bins and pd.api.types.is_numeric_dtype(hue_values)
            and hue_values.nunique() > hue_bins):
        # The pair plot treats hue as categorical, so a continuous target
        # would produce one legend entry and one diagonal density per
        # distinct value. Quantile groups keep the colouring readable.
        hue_values = pd.qcut(hue_values, q=hue_bins, duplicates='drop')
    pca_df[target_label] = hue_values

    sns.pairplot(pca_df, vars=pc_names, hue=target_label, palette='viridis')
    plt.suptitle('Pair Plot of Principal Components', y=1.02)
    plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
    plt.close()

    # Scree plot: share of variance explained by each component.
    scree_plot_file = 'pca_scree.png'
    component_numbers = range(1, pca.n_components_ + 1)
    plt.figure(figsize=(8, 5))
    plt.bar(component_numbers, pca.explained_variance_ratio_, alpha=0.7, color='red')
    plt.xlabel('Principal Components')
    plt.ylabel('Variance Explained')
    plt.title('Scree Plot')
    plt.xticks(component_numbers)
    plt.savefig(scree_plot_file, dpi=100, bbox_inches='tight')
    plt.close()

    return scatter_plot_file, pair_plot_file, scree_plot_file
def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
    """Run the load -> preprocess -> PCA -> plot pipeline for the Gradio UI.

    Args:
        target_column: Name of the target column in the dataset.
        n_components: Number of principal components to compute.
        k_best: Number of features to keep with SelectKBest; ``0`` (or less)
            skips feature selection.

    Returns:
        A tuple of three image file names: scatter, pair and scree plots.
    """
    # Slider values may arrive as floats, but PCA and SelectKBest both expect
    # genuine integers for a component / feature count.
    n_components = int(n_components)
    k_best = int(k_best)
    # Tolerate stray whitespace typed or pasted into the text box.
    target_column = target_column.strip()

    df = load_data('df_usa_health_features.parquet')
    X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)
    X_pca, pca_model = apply_pca(X, n_components)
    scatter_plot, pair_plot, scree_plot = visualize_pca(
        X_pca, y, pca_model, target_label=target_column
    )
    return scatter_plot, pair_plot, scree_plot
# ------------------------------------------------------------------------------
# Gradio UI: three inputs (target column, PCA component count, SelectKBest k)
# feed gradio_interface, whose three returned image paths fill the outputs.
# ------------------------------------------------------------------------------
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Target Column", value="Median_Income_Household"),
        gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
        gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0),
    ],
    outputs=[
        # The callback returns paths to PNG files on disk. "filepath" is the
        # accepted spelling for that mode; "file" is not a valid value.
        gr.Image(type="filepath", label="PCA Scatter Plot"),
        gr.Image(type="filepath", label="PCA Pair Plot"),
        gr.Image(type="filepath", label="Scree Plot"),
    ],
    title="PCA Visualization with DuckDB and Gradio",
    description=(
        "Load data from a Parquet file, optionally perform feature selection, "
        "run PCA, and visualize the results.\n"
        "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
        "2) Choose the number of PCA components.\n"
        "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
    ),
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()